From 6beba0d4674d1a45bba97790581e121992ee8295 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Mon, 10 Aug 2015 19:21:02 -0400
Subject: [PATCH] API: add DatetimeBlockTZ #8260

fix scalar comparisons vs None generally

fix NaT formatting in Series

TST: skip postgresql test with tz's

update for msgpack

Conflicts:
	pandas/core/base.py
	pandas/core/categorical.py
	pandas/core/format.py
	pandas/tests/test_base.py
	pandas/util/testing.py

full interop for tz-aware Series & timedeltas #10763
---
 doc/source/basics.rst | 15 +-
 doc/source/timeseries.rst | 27 +
 doc/source/whatsnew/v0.17.0.txt | 82 +++
 pandas/core/algorithms.py | 22 +-
 pandas/core/base.py | 10 +-
 pandas/core/categorical.py | 3 +-
 pandas/core/common.py | 286 ++++++----
 pandas/core/dtypes.py | 196 +++++++
 pandas/core/format.py | 38 +-
 pandas/core/frame.py | 15 +-
 pandas/core/index.py | 44 +-
 pandas/core/internals.py | 365 ++++++++----
 pandas/core/ops.py | 162 ++++--
 pandas/core/series.py | 87 +--
 pandas/io/pytables.py | 78 ++-
 .../data/legacy_hdf/datetimetz_object.h5 | Bin 0 -> 106271 bytes
 .../io/tests/generate_legacy_storage_files.py | 8 +-
 pandas/io/tests/test_packers.py | 30 +-
 pandas/io/tests/test_pickle.py | 35 +-
 pandas/io/tests/test_pytables.py | 527 +++++++++---------
 pandas/io/tests/test_sql.py | 4 +-
 pandas/lib.pyx | 4 +-
 pandas/sparse/frame.py | 8 +-
 pandas/sparse/series.py | 6 +-
 pandas/src/inference.pyx | 12 +-
 pandas/tests/test_base.py | 46 +-
 pandas/tests/test_categorical.py | 52 +-
 pandas/tests/test_dtypes.py | 142 +++++
 pandas/tests/test_frame.py | 167 +++++-
 pandas/tests/test_index.py | 24 +-
 pandas/tests/test_internals.py | 19 +-
 pandas/tests/test_multilevel.py | 15 +
 pandas/tests/test_series.py | 208 ++++++-
 pandas/tools/tests/test_pivot.py | 8 +-
 pandas/tseries/base.py | 14 +-
 pandas/tseries/common.py | 36 +-
 pandas/tseries/frequencies.py | 7 +-
 pandas/tseries/index.py | 67 ++-
 pandas/tseries/tests/test_base.py | 44 +-
 pandas/tseries/tests/test_timeseries.py | 63 +++
 pandas/tseries/tests/test_timezones.py | 9 +-
 pandas/tseries/tests/test_tslib.py | 6 +
 pandas/tseries/tools.py | 8 +
 pandas/tslib.pyx | 67 ++-
 pandas/util/testing.py | 8 +-
 vb_suite/binary_ops.py | 25 +
 vb_suite/timeseries.py | 2 +-
 47 files changed, 2217 insertions(+), 884 deletions(-)
 create mode 100644 pandas/core/dtypes.py
 create mode 100644 pandas/io/tests/data/legacy_hdf/datetimetz_object.h5
 create mode 100644 pandas/tests/test_dtypes.py

diff --git a/doc/source/basics.rst b/doc/source/basics.rst
index 624e10b431de5..a4ba83e1f59d7 100644
--- a/doc/source/basics.rst
+++ b/doc/source/basics.rst
@@ -1549,9 +1549,10 @@ dtypes
 ------
 
 The main types stored in pandas objects are ``float``, ``int``, ``bool``,
-``datetime64[ns]``, ``timedelta[ns]`` and ``object``. In addition these dtypes
-have item sizes, e.g. ``int64`` and ``int32``. A convenient :attr:`~DataFrame.dtypes``
-attribute for DataFrames returns a Series with the data type of each column.
+``datetime64[ns]`` and ``datetime64[ns, tz]`` (in >= 0.17.0), ``timedelta[ns]``, ``category`` (in >= 0.15.0), and ``object``. In addition these dtypes
+have item sizes, e.g. ``int64`` and ``int32``. See :ref:`Series with TZ <timeseries.timezone_series>` for more detail on ``datetime64[ns, tz]`` dtypes.
+
+A convenient :attr:`~DataFrame.dtypes` attribute for DataFrames returns a Series with the data type of each column.
 
 .. ipython:: python
@@ -1773,8 +1774,14 @@ dtypes:
     df['tdeltas'] = df.dates.diff()
     df['uint64'] = np.arange(3, 6).astype('u8')
     df['other_dates'] = pd.date_range('20130101', periods=3).values
+    df['tz_aware_dates'] = pd.date_range('20130101', periods=3, tz='US/Eastern')
     df
 
+And the dtypes
+
+.. ipython:: python
+
+   df.dtypes
 
 :meth:`~DataFrame.select_dtypes` has two parameters ``include`` and ``exclude`` that allow you to
 say "give me the columns WITH these dtypes" (``include``) and/or "give the
@@ -1827,7 +1834,7 @@ All numpy dtypes are subclasses of ``numpy.generic``:
 
 .. note::
 
-    Pandas also defines an additional ``category`` dtype, which is not integrated into the normal
+    Pandas also defines the additional types ``category`` and ``datetime64[ns, tz]``, which are not integrated into the normal
     numpy hierarchy and wont show up with the above function.
 
 .. note::
diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst
index 6f30ff3f51ad5..aa8057ac2540d 100644
--- a/doc/source/timeseries.rst
+++ b/doc/source/timeseries.rst
@@ -1734,3 +1734,30 @@ constructor as well as ``tz_localize``.
 
     # tz_convert(None) is identical with tz_convert('UTC').tz_localize(None)
     didx.tz_convert('UCT').tz_localize(None)
+
+.. _timeseries.timezone_series:
+
+TZ aware Dtypes
+~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.17.0
+
+A ``Series/DatetimeIndex`` with timezone-naive values is represented with a dtype of ``datetime64[ns]``.
+
+.. ipython:: python
+
+   dr = pd.date_range('20130101',periods=3)
+   dr
+   s = Series(dr)
+   s
+
+A ``Series/DatetimeIndex`` with timezone-aware values is represented with a dtype of ``datetime64[ns, tz]``.
+
+.. ipython:: python
+
+   dr = pd.date_range('20130101',periods=3,tz='US/Eastern')
+   dr
+   s = Series(dr)
+   s
+
+Both of these ``Series`` can be manipulated via the ``.dt`` accessor; see the :ref:`docs ` as well.
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
index 2a00263f973e9..b225b17f1940e 100644
--- a/doc/source/whatsnew/v0.17.0.txt
+++ b/doc/source/whatsnew/v0.17.0.txt
@@ -14,6 +14,7 @@ users upgrade to this version.
 Highlights include:
 
 - Release the Global Interpreter Lock (GIL) on some cython operations, see :ref:`here `
+- Support for a ``datetime64[ns]`` with timezones as a first-class dtype, see :ref:`here <whatsnew_0170.tz>`
 - The default for ``to_datetime`` will now be to ``raise`` when presented with unparseable formats,
   previously this would return the original input, see :ref:`here `
 - The default for ``dropna`` in ``HDFStore`` has changed to ``False``, to store by default all rows even
@@ -564,6 +565,84 @@ Removal of prior version deprecations/changes
 
 - Remove use of some deprecated numpy comparison operations, mainly in tests. (:issue:`10569`)
 
+.. _dask: https://dask.readthedocs.org/en/latest/
+
+.. _whatsnew_0170.tz:
+
+Datetime with TZ
+~~~~~~~~~~~~~~~~
+
+We are adding an implementation that natively supports datetime with timezones. A ``Series`` or a ``DataFrame`` column previously
+*could* be assigned a datetime with timezones, and would work as an ``object`` dtype. This had performance issues with a large
+number of rows. (:issue:`8260`, :issue:`10763`)
+
+The new implementation allows for having a single timezone across all rows, and operating on it in a performant manner.
+
+.. ipython:: python
+
+    df = DataFrame({'A' : date_range('20130101',periods=3),
+                    'B' : date_range('20130101',periods=3,tz='US/Eastern'),
+                    'C' : date_range('20130101',periods=3,tz='CET')})
+    df
+    df.dtypes
+
+.. ipython:: python
+
+   df.B
+   df.B.dt.tz_localize(None)
+
+This uses a new-dtype representation as well, that is very similar in look-and-feel to its numpy cousin ``datetime64[ns]``
+
+.. ipython:: python
+
+   df['B'].dtype
+   type(df['B'].dtype)
+
+.. note::
+
+   There is a slightly different string repr for the underlying ``DatetimeIndex`` as a result of the dtype changes, but
+   functionally these are the same.
+
+   Previously
+
+   .. code-block:: python
+
+      In [1]: pd.date_range('20130101',periods=3,tz='US/Eastern')
+      Out[1]: DatetimeIndex(['2013-01-01 00:00:00-05:00', '2013-01-02 00:00:00-05:00',
+                             '2013-01-03 00:00:00-05:00'],
+                            dtype='datetime64[ns]', freq='D', tz='US/Eastern')
+
+      In [2]: pd.date_range('20130101',periods=3,tz='US/Eastern').dtype
+      Out[2]: dtype('<M8[ns]')
diff --git a/pandas/core/dtypes.py b/pandas/core/dtypes.py
+    _match = re.compile("(datetime64|M8)\[(?P<unit>.+), (?P<tz>.+)\]")
+
+    def __init__(self, unit, tz=None):
+        """
+        Parameters
+        ----------
+        unit : string unit that this represents, currently must be 'ns'
+        tz : string tz that this represents
+        """
+
+        if isinstance(unit, DatetimeTZDtype):
+            self.unit, self.tz = unit.unit, unit.tz
+            return
+
+        if tz is None:
+
+            # we were passed a string that we can construct
+            try:
+                m = self._match.search(unit)
+                if m is not None:
+                    self.unit = m.groupdict()['unit']
+                    self.tz = m.groupdict()['tz']
+                    return
+            except:
+                raise ValueError("could not construct DatetimeTZDtype")
+
+            raise ValueError("DatetimeTZDtype constructor must have a tz supplied")
+
+        if unit != 'ns':
+            raise ValueError("DatetimeTZDtype only supports ns units")
+        self.unit = unit
+        self.tz = tz
+
+    @classmethod
+    def construct_from_string(cls, string):
+        """ attempt to construct this type from a string, raise a TypeError if it's not possible """
+        try:
+            return cls(unit=string)
+        except ValueError:
+            raise TypeError("could not construct DatetimeTZDtype")
+
+    def __unicode__(self):
+        # format the tz
+        return "datetime64[{unit}, {tz}]".format(unit=self.unit,tz=self.tz)
+
+    @property
+    def name(self):
+        return str(self)
+
+    def __hash__(self):
+        # make myself hashable
+        return hash(str(self))
+
+    def __eq__(self, other):
+        if isinstance(other, compat.string_types):
+            return other == self.name
+
+        return isinstance(other, DatetimeTZDtype) and self.unit == other.unit and self.tz == other.tz
diff --git a/pandas/core/format.py b/pandas/core/format.py
index 4ec4375349764..758987e6e150c 100644
--- a/pandas/core/format.py
+++ b/pandas/core/format.py
@@ -18,6 +18,7 @@
 from pandas.tslib import iNaT, Timestamp, Timedelta, format_array_from_datetime
 from pandas.tseries.index import DatetimeIndex
 from pandas.tseries.period import PeriodIndex
+import pandas as pd
 
 import numpy as np
 import itertools
@@ -1904,6 +1905,8 @@ def format_array(values, formatter, float_format=None, na_rep='NaN',
         fmt_klass = PeriodArrayFormatter
     elif com.is_integer_dtype(values.dtype):
         fmt_klass = IntArrayFormatter
+    elif com.is_datetimetz(values):
+        fmt_klass = Datetime64TZFormatter
     elif com.is_datetime64_dtype(values.dtype):
         fmt_klass = Datetime64Formatter
     elif com.is_timedelta64_dtype(values.dtype):
@@ -1960,6 +1963,8 @@ def _format(x):
         if self.na_rep is not None and lib.checknull(x):
             if x is None:
                 return 'None'
+            elif x is pd.NaT:
+                return 'NaT'
             return self.na_rep
         elif isinstance(x, PandasObject):
             return '%s' % x
@@ -2065,27 +2070,16 @@ def __init__(self, values, nat_rep='NaT', date_format=None, **kwargs):
         self.date_format = date_format
 
     def _format_strings(self):
+        """ we by definition DO NOT have a TZ """
 
-        # we may have a tz, if so, then need to process element-by-element
-        # when DatetimeBlockWithTimezones is a
reality this could be fixed values = self.values if not isinstance(values, DatetimeIndex): values = DatetimeIndex(values) - if values.tz is None: - fmt_values = format_array_from_datetime(values.asi8.ravel(), - format=_get_format_datetime64_from_values(values, self.date_format), - na_rep=self.nat_rep).reshape(values.shape) - fmt_values = fmt_values.tolist() - - else: - - values = values.asobject - is_dates_only = _is_dates_only(values) - formatter = (self.formatter or _get_format_datetime64(is_dates_only, values, date_format=self.date_format)) - fmt_values = [ formatter(x) for x in values ] - - return fmt_values + fmt_values = format_array_from_datetime(values.asi8.ravel(), + format=_get_format_datetime64_from_values(values, self.date_format), + na_rep=self.nat_rep).reshape(values.shape) + return fmt_values.tolist() class PeriodArrayFormatter(IntArrayFormatter): @@ -2164,6 +2158,18 @@ def _get_format_datetime64_from_values(values, date_format): return date_format +class Datetime64TZFormatter(Datetime64Formatter): + + def _format_strings(self): + """ we by definition have a TZ """ + + values = self.values.asobject + is_dates_only = _is_dates_only(values) + formatter = (self.formatter or _get_format_datetime64(is_dates_only, date_format=self.date_format)) + fmt_values = [ formatter(x) for x in values ] + + return fmt_values + class Timedelta64Formatter(GenericArrayFormatter): def __init__(self, values, nat_rep='NaT', box=False, **kwargs): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fe9c9bece1f79..46b04a4d1f8f4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -28,6 +28,7 @@ _infer_dtype_from_scalar, _values_from_object, is_list_like, _maybe_box_datetimelike, is_categorical_dtype, is_object_dtype, + is_internal_type, is_datetimetz, _possibly_infer_to_datetimelike, _dict_compat) from pandas.core.generic import NDFrame, _shared_docs from pandas.core.index import Index, MultiIndex, _ensure_index @@ -386,6 +387,9 @@ def _get_axes(N, K, index=index, columns=columns): index, columns = _get_axes(len(values),1) return _arrays_to_mgr([ values ], columns, index, columns, dtype=dtype) + elif is_datetimetz(values): + return self._init_dict({ 0 : values }, index, columns, + dtype=dtype) # by definition an array here # the dtypes will be coerced to a single dtype @@ -844,6 +848,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None, ------- df : DataFrame """ + # Make a copy of the input columns so we can modify it if columns is not None: columns = _ensure_index(columns) @@ -1790,6 +1795,8 @@ def _ixs(self, i, axis=0): copy=True else: new_values = self._data.fast_xs(i) + if lib.isscalar(new_values): + return new_values # if we are a copy, mark as such copy = isinstance(new_values,np.ndarray) and new_values.base is None @@ -2434,7 +2441,7 @@ def reindexer(value): # possibly infer to datetimelike if is_object_dtype(value.dtype): - value = _possibly_infer_to_datetimelike(value.ravel()).reshape(value.shape) + value = _possibly_infer_to_datetimelike(value) else: # upcast the scalar @@ -2442,8 +2449,8 @@ def reindexer(value): value = np.repeat(value, len(self.index)).astype(dtype) value = com._possibly_cast_to_datetime(value, dtype) - # return unconsolidatables directly - if isinstance(value, (Categorical, SparseArray)): + # return internal types directly + if is_internal_type(value): return value # broadcast across multiple columns if necessary @@ -2730,7 +2737,7 @@ def _maybe_casted_values(index, labels=None): values = index.asobject.values elif 
(isinstance(index, DatetimeIndex) and index.tz is not None): - values = index.asobject + values = index else: values = index.values if values.dtype == np.object_: diff --git a/pandas/core/index.py b/pandas/core/index.py index 12ad8a590c304..4087a4be93ca1 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -19,6 +19,7 @@ deprecate, deprecate_kwarg) import pandas.core.common as com from pandas.core.common import (isnull, array_equivalent, is_dtype_equal, is_object_dtype, + is_datetimetz, _values_from_object, is_float, is_integer, is_iterator, is_categorical_dtype, ABCSeries, ABCCategorical, _ensure_object, _ensure_int64, is_bool_indexer, is_list_like, is_bool_dtype, is_null_slice, is_integer_dtype) @@ -115,7 +116,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, from pandas.tseries.period import PeriodIndex if isinstance(data, (np.ndarray, Index, ABCSeries)): - if issubclass(data.dtype.type, np.datetime64): + if issubclass(data.dtype.type, np.datetime64) or is_datetimetz(data): from pandas.tseries.index import DatetimeIndex result = DatetimeIndex(data, copy=copy, name=name, **kwargs) if dtype is not None and _o_dtype == dtype: @@ -207,7 +208,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, return cls._simple_new(subarr, name) @classmethod - def _simple_new(cls, values, name=None, **kwargs): + def _simple_new(cls, values, name=None, dtype=None, **kwargs): """ we require the we have a dtype compat for the values if we are passed a non-dtype compat, then coerce using the constructor @@ -215,9 +216,12 @@ def _simple_new(cls, values, name=None, **kwargs): Must be careful not to recurse. """ if not hasattr(values, 'dtype'): - values = np.array(values,copy=False) - if is_object_dtype(values): - values = cls(values, name=name, **kwargs).values + if values is None and dtype is not None: + values = np.empty(0, dtype=dtype) + else: + values = np.array(values,copy=False) + if is_object_dtype(values): + values = cls(values, name=name, dtype=dtype, **kwargs).values result = object.__new__(cls) result._data = values @@ -606,7 +610,7 @@ def _to_embed(self, keep_tz=False): return an array repr of this object, potentially casting to object """ - return self.values + return self.values.copy() def astype(self, dtype): return Index(self.values.astype(dtype), name=self.name, @@ -1194,10 +1198,15 @@ def _ensure_compat_concat(indexes): return indexes - def take(self, indices, axis=0): + def take(self, indices, axis=0, allow_fill=True, fill_value=None): """ return a new Index of the values selected by the indexer + For internal compatibility with numpy arrays. + + # filling must always be None/nan here + # but is passed thru internally + See also -------- numpy.ndarray.take @@ -1513,11 +1522,11 @@ def intersection(self, other): pass try: - indexer = self.get_indexer(other.values) + indexer = Index(self.values).get_indexer(other.values) indexer = indexer.take((indexer != -1).nonzero()[0]) except: # duplicates - indexer = self.get_indexer_non_unique(other.values)[0].unique() + indexer = Index(self.values).get_indexer_non_unique(other.values)[0].unique() indexer = indexer[indexer != -1] taken = self.take(indexer) @@ -1636,6 +1645,13 @@ def get_value(self, series, key): Fast lookup of value from 1-dimensional ndarray. Only use this if you know what you're doing """ + + # if we have a .values that is an Index + # e.g. 
DatetimeIndex + s = getattr(series,'values',None) + if isinstance(s, Index) and lib.isscalar(key): + return s[key] + s = _values_from_object(series) k = _values_from_object(key) @@ -1936,7 +1952,7 @@ def reindex(self, target, method=None, level=None, limit=None): if not isinstance(target, Index) and len(target) == 0: attrs = self._get_attributes_dict() attrs.pop('freq', None) # don't preserve freq - target = self._simple_new(np.empty(0, dtype=self.dtype), **attrs) + target = self._simple_new(None, dtype=self.dtype, **attrs) else: target = _ensure_index(target) @@ -3235,9 +3251,13 @@ def _convert_list_indexer(self, keyarr, kind=None): return None - def take(self, indexer, axis=0): + def take(self, indexer, axis=0, allow_fill=True, fill_value=None): """ - return a new CategoricalIndex of the values selected by the indexer + For internal compatibility with numpy arrays. + + # filling must always be None/nan here + # but is passed thru internally + assert isnull(fill_value) See also -------- diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 127327dd058c9..35a0545a910a9 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -11,12 +11,17 @@ from pandas.core.common import (_possibly_downcast_to_dtype, isnull, _NS_DTYPE, _TD_DTYPE, ABCSeries, is_list_like, ABCSparseSeries, _infer_dtype_from_scalar, + is_null_slice, is_dtype_equal, is_null_datelike_scalar, _maybe_promote, is_timedelta64_dtype, is_datetime64_dtype, + is_datetime64tz_dtype, is_datetimetz, is_sparse, array_equivalent, _maybe_convert_string_to_object, - is_categorical, needs_i8_conversion, is_datetimelike_v_numeric) + is_categorical, needs_i8_conversion, is_datetimelike_v_numeric, + is_internal_type) + from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import maybe_convert_indices, length_of_indexer +from pandas.tseries.index import DatetimeIndex from pandas.core.categorical import Categorical, maybe_to_categorical import pandas.core.common as com from pandas.sparse.array import _maybe_to_sparse, SparseArray @@ -48,11 +53,13 @@ class Block(PandasObject): is_integer = False is_complex = False is_datetime = False + is_datetimetz = False is_timedelta = False is_bool = False is_object = False is_categorical = False is_sparse = False + _box_to_block_values = True _can_hold_na = False _downcast_dtype = None _can_consolidate = True @@ -125,6 +132,18 @@ def array_dtype(self): """ the dtype to return if I want to construct this block as an array """ return self.dtype + def make_block(self, values, placement=None, ndim=None, **kwargs): + """ + Create a new block, with type inference + propogate any values that are not specified + """ + if placement is None: + placement = self.mgr_locs + if ndim is None: + ndim = self.ndim + + return make_block(values, placement=placement, ndim=ndim, **kwargs) + def make_block_same_class(self, values, placement, copy=False, fastpath=True, **kwargs): """ @@ -248,9 +267,8 @@ def reindex_axis(self, indexer, method=None, axis=1, fill_value=None, new_values = com.take_nd(self.values, indexer, axis, fill_value=fill_value, mask_info=mask_info) - return make_block(new_values, - ndim=self.ndim, fastpath=True, - placement=self.mgr_locs) + return self.make_block(new_values, + fastpath=True) def get(self, item): loc = self.items.get_loc(item) @@ -280,7 +298,7 @@ def apply(self, func, **kwargs): """ apply the function to my values; return a block if we are not one """ result = func(self.values, **kwargs) if not isinstance(result, Block): - result = 
make_block(values=_block_shape(result), placement=self.mgr_locs,) + result = self.make_block(values=_block_shape(result)) return result @@ -334,8 +352,8 @@ def downcast(self, dtypes=None): dtypes = 'infer' nv = _possibly_downcast_to_dtype(values, dtypes) - return [make_block(nv, ndim=self.ndim, - fastpath=True, placement=self.mgr_locs)] + return [self.make_block(nv, + fastpath=True)] # ndim > 1 if dtypes is None: @@ -362,9 +380,9 @@ def downcast(self, dtypes=None): nv = _possibly_downcast_to_dtype(values[i], dtype) nv = _block_shape(nv, ndim=self.ndim) - blocks.append(make_block(nv, - ndim=self.ndim, fastpath=True, - placement=[rl])) + blocks.append(self.make_block(nv, + fastpath=True, + placement=[rl])) return blocks @@ -382,9 +400,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, # may need to convert to categorical # this is only called for non-categoricals if self.is_categorical_astype(dtype): - return make_block(Categorical(self.values, **kwargs), - ndim=self.ndim, - placement=self.mgr_locs) + return self.make_block(Categorical(self.values, **kwargs)) # astype processing dtype = np.dtype(dtype) @@ -399,12 +415,20 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, try: # force the copy here if values is None: + + if issubclass(dtype.type, (compat.text_type, compat.string_types)): + values = self.to_native_types() + else: + values = self.get_values(dtype=dtype) + # _astype_nansafe works fine with 1-d only - values = com._astype_nansafe(self.values.ravel(), dtype, copy=True) - values = values.reshape(self.values.shape) + values = com._astype_nansafe(values.ravel(), dtype, copy=True) + values = values.reshape(self.shape) + newb = make_block(values, - ndim=self.ndim, placement=self.mgr_locs, - fastpath=True, dtype=dtype, klass=klass) + placement=self.mgr_locs, + dtype=dtype, + klass=klass) except: if raise_on_error is True: raise @@ -484,7 +508,7 @@ def _try_coerce_and_cast_result(self, result, dtype=None): def _try_fill(self, value): return value - def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): + def to_native_types(self, slicer=None, na_rep='nan', quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values @@ -505,9 +529,9 @@ def copy(self, deep=True): values = self.values if deep: values = values.copy() - return make_block(values, ndim=self.ndim, - klass=self.__class__, fastpath=True, - placement=self.mgr_locs) + return self.make_block(values, + klass=self.__class__, + fastpath=True) def replace(self, to_replace, value, inplace=False, filter=None, regex=False): @@ -616,9 +640,8 @@ def _is_empty_indexer(indexer): else: dtype = 'infer' values = self._try_coerce_and_cast_result(values, dtype) - block = make_block(transf(values), - ndim=self.ndim, placement=self.mgr_locs, - fastpath=True) + block = self.make_block(transf(values), + fastpath=True) # may have to soft convert_objects here if block.is_object and not self.is_object: @@ -701,25 +724,24 @@ def putmask(self, mask, new, align=True, inplace=False): # Put back the dimension that was taken from it and make # a block out of the result. 
- block = make_block(values=nv[np.newaxis], - placement=[ref_loc], - fastpath=True) + block = self.make_block(values=nv[np.newaxis], + placement=[ref_loc], + fastpath=True) new_blocks.append(block) else: nv = _putmask_smart(new_values, mask, new) - new_blocks.append(make_block(values=nv, - placement=self.mgr_locs, - fastpath=True)) + new_blocks.append(self.make_block(values=nv, + fastpath=True)) return new_blocks if inplace: return [self] - return [make_block(new_values, - placement=self.mgr_locs, fastpath=True)] + return [self.make_block(new_values, + fastpath=True)] def interpolate(self, method='pad', axis=0, index=None, values=None, inplace=False, limit=None, @@ -798,9 +820,9 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, dtype=self.dtype) values = self._try_coerce_result(values) - blocks = [make_block(values, - ndim=self.ndim, klass=self.__class__, - fastpath=True, placement=self.mgr_locs)] + blocks = [self.make_block(values, + klass=self.__class__, + fastpath=True)] return self._maybe_downcast(blocks, downcast) def _interpolate(self, method=None, index=None, values=None, @@ -837,9 +859,9 @@ def func(x): # interp each column independently interp_values = np.apply_along_axis(func, axis, data) - blocks = [make_block(interp_values, - ndim=self.ndim, klass=self.__class__, - fastpath=True, placement=self.mgr_locs)] + blocks = [self.make_block(interp_values, + klass=self.__class__, + fastpath=True)] return self._maybe_downcast(blocks, downcast) def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): @@ -847,13 +869,22 @@ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): Take values according to indexer and return them as a block.bb """ + + # com.take_nd dispatches for DatetimeTZBlock, CategoricalBlock + # so need to preserve types + # sparse is treated like an ndarray, but needs .get_values() shaping + + values = self.values + if self.is_sparse: + values = self.get_values() + if fill_tuple is None: fill_value = self.fill_value - new_values = com.take_nd(self.get_values(), indexer, axis=axis, + new_values = com.take_nd(values, indexer, axis=axis, allow_fill=False) else: fill_value = fill_tuple[0] - new_values = com.take_nd(self.get_values(), indexer, axis=axis, + new_values = com.take_nd(values, indexer, axis=axis, allow_fill=True, fill_value=fill_value) if new_mgr_locs is None: @@ -866,8 +897,8 @@ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): else: new_mgr_locs = self.mgr_locs - if new_values.dtype != self.dtype: - return make_block(new_values, new_mgr_locs) + if not is_dtype_equal(new_values.dtype, self.dtype): + return self.make_block(new_values, new_mgr_locs) else: return self.make_block_same_class(new_values, new_mgr_locs) @@ -877,15 +908,16 @@ def get_values(self, dtype=None): def diff(self, n, axis=1): """ return block for the diff of the values """ new_values = com.diff(self.values, n, axis=axis) - return [make_block(values=new_values, - ndim=self.ndim, fastpath=True, - placement=self.mgr_locs)] + return [self.make_block(values=new_values, + fastpath=True)] def shift(self, periods, axis=0): """ shift the block by periods, possibly upcast """ + # convert integer to float if necessary. 
need to do a lot more than # that, handle boolean etc also new_values, fill_value = com._maybe_upcast(self.values) + # make sure array sent to np.roll is c_contiguous f_ordered = new_values.flags.f_contiguous if f_ordered: @@ -906,9 +938,8 @@ def shift(self, periods, axis=0): if f_ordered: new_values = new_values.T - return [make_block(new_values, - ndim=self.ndim, fastpath=True, - placement=self.mgr_locs)] + return [self.make_block(new_values, + fastpath=True)] def eval(self, func, other, raise_on_error=True, try_cast=False): """ @@ -999,8 +1030,8 @@ def handle_error(): if try_cast: result = self._try_cast_result(result) - return [make_block(result, ndim=self.ndim, - fastpath=True, placement=self.mgr_locs)] + return [self.make_block(result, + fastpath=True,)] def where(self, other, cond, align=True, raise_on_error=True, try_cast=False): @@ -1097,8 +1128,7 @@ def func(c, v, o): if try_cast: result = self._try_cast_result(result) - return make_block(result, - ndim=self.ndim, placement=self.mgr_locs) + return self.make_block(result) # might need to separate out blocks axis = cond.ndim - 1 @@ -1111,8 +1141,8 @@ def func(c, v, o): if m.any(): r = self._try_cast_result( result.take(m.nonzero()[0], axis=axis)) - result_blocks.append(make_block(r.T, - placement=self.mgr_locs[m])) + result_blocks.append(self.make_block(r.T, + placement=self.mgr_locs[m])) return result_blocks @@ -1129,7 +1159,7 @@ class NonConsolidatableMixIn(object): _holder = None def __init__(self, values, placement, - ndim=None, fastpath=False,): + ndim=None, fastpath=False, **kwargs): # Placement must be converted to BlockPlacement via property setter # before ndim logic, because placement may be a slice which doesn't @@ -1149,6 +1179,12 @@ def __init__(self, values, placement, self.values = values + @property + def shape(self): + if self.ndim == 1: + return (len(self.values)), + return (len(self.mgr_locs), len(self.values)) + def get_values(self, dtype=None): """ need to to_dense myself (and always return a ndim sized object) """ values = self.values.to_dense() @@ -1160,7 +1196,7 @@ def iget(self, col): if self.ndim == 2 and isinstance(col, tuple): col, loc = col - if col != 0: + if not is_null_slice(col) and col != 0: raise IndexError("{0} only contains one item".format(self)) return self.values[loc] else: @@ -1440,13 +1476,13 @@ class ObjectBlock(Block): _can_hold_na = True def __init__(self, values, ndim=2, fastpath=False, - placement=None): + placement=None, **kwargs): if issubclass(values.dtype.type, compat.string_types): values = np.array(values, dtype=object) super(ObjectBlock, self).__init__(values, ndim=ndim, fastpath=fastpath, - placement=placement) + placement=placement, **kwargs) @property def is_bool(self): @@ -1480,8 +1516,8 @@ def convert(self, datetime=True, numeric=True, timedelta=True, coerce=False, copy=copy ).reshape(values.shape) values = _block_shape(values, ndim=self.ndim) - newb = make_block(values, - ndim=self.ndim, placement=[rl]) + newb = self.make_block(values, + placement=[rl]) blocks.append(newb) else: @@ -1494,8 +1530,7 @@ def convert(self, datetime=True, numeric=True, timedelta=True, coerce=False, coerce=coerce, copy=copy ).reshape(self.values.shape) - blocks.append(make_block(values, - ndim=self.ndim, placement=self.mgr_locs)) + blocks.append(self.make_block(values)) return blocks @@ -1549,7 +1584,7 @@ def _try_cast(self, element): def should_store(self, value): return not (issubclass(value.dtype.type, (np.integer, np.floating, np.complexfloating, - np.datetime64, np.bool_)) or 
com.is_categorical_dtype(value)) + np.datetime64, np.bool_)) or is_internal_type(value)) def replace(self, to_replace, value, inplace=False, filter=None, regex=False): @@ -1652,12 +1687,13 @@ def re_replacer(s): new_values[filt] = f(new_values[filt]) return [self if inplace else - make_block(new_values, - fastpath=True, placement=self.mgr_locs)] + self.make_block(new_values, + fastpath=True)] class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock): __slots__ = () is_categorical = True + _verify_integrity = True _can_hold_na = True _holder = Categorical @@ -1680,10 +1716,6 @@ def to_dense(self): def convert(self, copy=True, **kwargs): return [self.copy() if copy else self] - @property - def shape(self): - return (len(self.mgr_locs), len(self.values)) - @property def array_dtype(self): """ the dtype to return if I want to construct this block as an array """ @@ -1780,9 +1812,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, if copy: values = values.copy() - return make_block(values, - ndim=self.ndim, - placement=self.mgr_locs) + return self.make_block(values) def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ @@ -1882,8 +1912,8 @@ def fillna(self, value, limit=None, np.putmask(values, mask, value) return [self if inplace else - make_block(values, - fastpath=True, placement=self.mgr_locs)] + self.make_block(values, + fastpath=True)] def to_native_types(self, slicer=None, na_rep=None, date_format=None, quoting=None, **kwargs): @@ -1891,19 +1921,19 @@ def to_native_types(self, slicer=None, na_rep=None, date_format=None, values = self.values if slicer is not None: - values = values[:, slicer] + values = values[..., slicer] from pandas.core.format import _get_format_datetime64_from_values format = _get_format_datetime64_from_values(values, date_format) result = tslib.format_array_from_datetime(values.view('i8').ravel(), - tz=None, + tz=getattr(self.values,'tz',None), format=format, na_rep=na_rep).reshape(values.shape) - return result + return np.atleast_2d(result) def should_store(self, value): - return issubclass(value.dtype.type, np.datetime64) + return issubclass(value.dtype.type, np.datetime64) and not is_datetimetz(value) def set(self, locs, values, check=False): """ @@ -1926,12 +1956,99 @@ def get_values(self, dtype=None): .reshape(self.values.shape) return self.values +class DatetimeTZBlock(NonConsolidatableMixIn, DatetimeBlock): + """ implement a datetime64 block with a tz attribute """ + __slots__ = tuple(list(DatetimeBlock.__slots__) + [ 'tz' ]) + _holder = DatetimeIndex + is_datetimetz = True + + def __init__(self, values, placement, ndim=2, + **kwargs): + + if isinstance(values, ABCSeries): + values = values.values + if not isinstance(values, self._holder): + raise ValueError("cannot create a DatetimeTZBlock w/o a proper DatetimeIndex values") + if values.tz is None: + raise ValueError("cannot create a DatetimeTZBlock without a tz") + + super(DatetimeTZBlock, self).__init__(values, + placement=placement, + ndim=ndim, + **kwargs) + def _slice(self, slicer): + """ return a slice of my values """ + if isinstance(slicer, tuple): + col, loc = slicer + if not is_null_slice(col) and col != 0: + raise IndexError("{0} only contains one item".format(self)) + return self.values[loc] + return self.values[slicer] + + def get_values(self, dtype=None): + # return object dtype as Timestamps with the zones + if dtype == object: + return lib.map_infer(self.values.ravel(), lambda x: 
lib.Timestamp(x,tz=self.values.tz))\ + .reshape(self.values.shape) + return super(DatetimeTZBlock, self).get_values(dtype) + + def _try_coerce_args(self, values, other): + """ localize and return i8 for the values """ + values = values.tz_localize(None).asi8 + + if is_null_datelike_scalar(other): + other = tslib.iNaT + elif isinstance(other, self._holder): + if other.tz != self.tz: + raise ValueError("incompatible or non tz-aware value") + other = other.tz_localize(None).asi8 + else: + other = lib.Timestamp(other) + if not getattr(other, 'tz', None): + raise ValueError("incompatible or non tz-aware value") + other = other.value + + return values, other + + def _try_coerce_result(self, result): + """ reverse of try_coerce_args """ + result = super(DatetimeTZBlock, self)._try_coerce_result(result) + + if isinstance(result, np.ndarray): + result = self._holder(result, tz=self.values.tz) + elif isinstance(result, (np.integer, np.datetime64)): + result = lib.Timestamp(result, tz=self.values.tz) + return result + + def shift(self, periods, axis=0): + """ shift the block by periods """ + + ### think about moving this to the DatetimeIndex. This is a non-freq (number of periods) shift ### + + N = len(self) + indexer = np.zeros(N, dtype=int) + if periods > 0: + indexer[periods:] = np.arange(N - periods) + else: + indexer[:periods] = np.arange(-periods, N) + + # move to UTC & take + new_values = self.values.tz_localize(None).asi8.take(indexer) + + if periods > 0: + new_values[:periods] = tslib.iNaT + else: + new_values[periods:] = tslib.iNaT + + new_values = DatetimeIndex(new_values,tz=self.values.tz) + return [self.make_block_same_class(new_values, placement=self.mgr_locs)] class SparseBlock(NonConsolidatableMixIn, Block): """ implement as a list of sparse arrays of the same dtype """ __slots__ = () is_sparse = True is_numeric = True + _box_to_block_values = False _can_hold_na = True _ftype = 'sparse' _holder = SparseArray @@ -1956,6 +2073,9 @@ def fill_value(self, v): v = float(v) self.values.fill_value = v + def to_dense(self): + return self.values.to_dense().view() + @property def sp_values(self): return self.values.sp_values @@ -1990,7 +2110,7 @@ def copy(self, deep=True): def make_block_same_class(self, values, placement, sparse_index=None, kind=None, dtype=None, - fill_value=None, copy=False, fastpath=True): + fill_value=None, copy=False, fastpath=True, **kwargs): """ return a new block """ if dtype is None: dtype = self.dtype @@ -2008,8 +2128,9 @@ def make_block_same_class(self, values, placement, # output is 0-item, so let's convert it to a dense block: it # won't take space since there's 0 items, plus it will preserve # the dtype. 
- return make_block(np.empty(values.shape, dtype=dtype), - placement, fastpath=True,) + return self.make_block(np.empty(values.shape, dtype=dtype), + placement, + fastpath=True) elif nitems > 1: raise ValueError("Only 1-item 2d sparse blocks are supported") else: @@ -2018,8 +2139,9 @@ def make_block_same_class(self, values, placement, new_values = SparseArray(values, sparse_index=sparse_index, kind=kind or self.kind, dtype=dtype, fill_value=fill_value, copy=copy) - return make_block(new_values, ndim=self.ndim, - fastpath=fastpath, placement=placement) + return self.make_block(new_values, + fastpath=fastpath, + placement=placement) def interpolate(self, method='pad', axis=0, inplace=False, limit=None, fill_value=None, **kwargs): @@ -2103,7 +2225,12 @@ def make_block(values, placement, klass=None, ndim=None, elif dtype == np.bool_: klass = BoolBlock elif issubclass(vtype, np.datetime64): - klass = DatetimeBlock + if hasattr(values,'tz'): + klass = DatetimeTZBlock + else: + klass = DatetimeBlock + elif is_datetimetz(values): + klass = DatetimeTZBlock elif issubclass(vtype, np.complexfloating): klass = ComplexBlock elif is_categorical(values): @@ -2399,7 +2526,7 @@ def _verify_integrity(self): mgr_shape = self.shape tot_items = sum(len(x.mgr_locs) for x in self.blocks) for block in self.blocks: - if not block.is_sparse and block.shape[1:] != mgr_shape[1:]: + if block._verify_integrity and block.shape[1:] != mgr_shape[1:]: construction_error(tot_items, block.shape[1:], self.axes) if len(self.items) != tot_items: raise AssertionError('Number of manager items must equal union of ' @@ -2809,7 +2936,7 @@ def fast_xs(self, loc): single block """ if len(self.blocks) == 1: - return self.blocks[0].values[:, loc] + return self.blocks[0].iget((slice(None), loc)) items = self.items @@ -2890,10 +3017,9 @@ def iget(self, i, fastpath=True): Otherwise return as a ndarray """ - block = self.blocks[self._blknos[i]] values = block.iget(self._blklocs[i]) - if not fastpath or block.is_sparse or values.ndim != 1: + if not fastpath or not block._box_to_block_values or values.ndim != 1: return values # fastpath shortcut for select a single-dim from a 2-dim BM @@ -2963,18 +3089,10 @@ def set(self, item, value, check=False): # FIXME: refactor, clearly separate broadcasting & zip-like assignment # can prob also fix the various if tests for sparse/categorical - value_is_sparse = isinstance(value, SparseArray) - value_is_cat = is_categorical(value) - value_is_nonconsolidatable = value_is_sparse or value_is_cat - - if value_is_sparse: - # sparse - assert self.ndim == 2 + value_is_internal_type = is_internal_type(value) - def value_getitem(placement): - return value - elif value_is_cat: - # categorical + # categorical/spares/datetimetz + if value_is_internal_type: def value_getitem(placement): return value else: @@ -3043,7 +3161,7 @@ def value_getitem(placement): unfit_count = len(unfit_mgr_locs) new_blocks = [] - if value_is_nonconsolidatable: + if value_is_internal_type: # This code (ab-)uses the fact that sparse blocks contain only # one item. 
new_blocks.extend( @@ -3466,7 +3584,7 @@ def convert(self, **kwargs): @property def dtype(self): - return self._values.dtype + return self._block.dtype @property def array_dtype(self): @@ -3490,7 +3608,7 @@ def get_ftypes(self): @property def values(self): - return self._values.view() + return self._block.values.view() def get_values(self): """ return a dense type view """ @@ -3498,7 +3616,7 @@ def get_values(self): @property def itemsize(self): - return self._values.itemsize + return self._block.values.itemsize @property def _can_hold_na(self): @@ -3565,6 +3683,7 @@ def create_block_manager_from_blocks(blocks, axes): def create_block_manager_from_arrays(arrays, names, axes): + try: blocks = form_blocks(arrays, names, axes) mgr = BlockManager(blocks, axes) @@ -3584,6 +3703,7 @@ def form_blocks(arrays, names, axes): object_items = [] sparse_items = [] datetime_items = [] + datetime_tz_items = [] cat_items = [] extra_locs = [] @@ -3602,7 +3722,7 @@ def form_blocks(arrays, names, axes): k = names[name_idx] v = arrays[name_idx] - if isinstance(v, (SparseArray, ABCSparseSeries)): + if is_sparse(v): sparse_items.append((i, k, v)) elif issubclass(v.dtype.type, np.floating): float_items.append((i, k, v)) @@ -3612,10 +3732,12 @@ def form_blocks(arrays, names, axes): if v.dtype != _NS_DTYPE: v = tslib.cast_to_nanoseconds(v) - if hasattr(v, 'tz') and v.tz is not None: - object_items.append((i, k, v)) + if is_datetimetz(v): + datetime_tz_items.append((i, k, v)) else: datetime_items.append((i, k, v)) + elif is_datetimetz(v): + datetime_tz_items.append((i, k, v)) elif issubclass(v.dtype.type, np.integer): if v.dtype == np.uint64: # HACK #2355 definite overflow @@ -3649,6 +3771,14 @@ def form_blocks(arrays, names, axes): datetime_items, _NS_DTYPE) blocks.extend(datetime_blocks) + if len(datetime_tz_items): + dttz_blocks = [ make_block(array, + klass=DatetimeTZBlock, + fastpath=True, + placement=[i], + ) for i, names, array in datetime_tz_items ] + blocks.extend(dttz_blocks) + if len(bool_items): bool_blocks = _simple_blockify( bool_items, np.bool_) @@ -3781,18 +3911,20 @@ def _lcd_dtype(l): have_float = len(counts[FloatBlock]) > 0 have_complex = len(counts[ComplexBlock]) > 0 have_dt64 = len(counts[DatetimeBlock]) > 0 + have_dt64_tz = len(counts[DatetimeTZBlock]) > 0 have_td64 = len(counts[TimeDeltaBlock]) > 0 have_cat = len(counts[CategoricalBlock]) > 0 have_sparse = len(counts[SparseBlock]) > 0 have_numeric = have_float or have_complex or have_int - has_non_numeric = have_dt64 or have_td64 or have_cat + has_non_numeric = have_dt64 or have_dt64_tz or have_td64 or have_cat if (have_object or - (have_bool and (have_numeric or have_dt64 or have_td64)) or + (have_bool and (have_numeric or have_dt64 or have_dt64_tz or have_td64)) or (have_numeric and has_non_numeric) or have_cat or have_dt64 or + have_dt64_tz or have_td64): return np.dtype(object) elif have_bool: @@ -4120,6 +4252,8 @@ def get_empty_dtype_and_na(join_units): if com.is_categorical_dtype(dtype): upcast_cls = 'category' + elif com.is_datetimetz(dtype): + upcast_cls = 'datetimetz' elif issubclass(dtype.type, np.bool_): upcast_cls = 'bool' elif issubclass(dtype.type, np.object_): @@ -4154,6 +4288,8 @@ def get_empty_dtype_and_na(join_units): return np.dtype(np.object_), np.nan elif 'float' in upcast_classes: return np.dtype(np.float64), np.nan + elif 'datetimetz' in upcast_classes: + return np.dtype('M8[ns]'), tslib.iNaT elif 'datetime' in upcast_classes: return np.dtype('M8[ns]'), tslib.iNaT elif 'timedelta' in upcast_classes: @@ -4412,12 +4548,6 
@@ def is_null(self): return True - @cache_readonly - def needs_block_conversion(self): - """ we might need to convert the joined values to a suitable block repr """ - block = self.block - return block is not None and (block.is_sparse or block.is_categorical) - def get_reindexed_values(self, empty_dtype, upcasted_na): if upcasted_na is None: # No upcasting is necessary @@ -4442,11 +4572,8 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): return missing_arr if not self.indexers: - if self.block.is_categorical: - # preserve the categoricals for validation in _concat_compat - return self.block.values - elif self.block.is_sparse: - # preserve the sparse array for validation in _concat_compat + if not self.block._can_consolidate: + # preserve these for validation in _concat_compat return self.block.values if self.block.is_bool: diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 8e3dd3836855c..9c65dd2064f76 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -9,6 +9,7 @@ import warnings import numpy as np import pandas as pd +import datetime from pandas import compat, lib, tslib import pandas.index as _index from pandas.util.decorators import Appender @@ -21,8 +22,10 @@ _values_from_object, _maybe_match_name, needs_i8_conversion, is_datetimelike_v_numeric, is_integer_dtype, is_categorical_dtype, is_object_dtype, - is_timedelta64_dtype, is_datetime64_dtype, is_bool_dtype) + is_timedelta64_dtype, is_datetime64_dtype, is_datetime64tz_dtype, + is_bool_dtype) from pandas.io.common import PerformanceWarning + # ----------------------------------------------------------------------------- # Functions that add arithmetic methods to objects, given arithmetic factory # methods @@ -268,50 +271,61 @@ class _TimeOp(object): wrap_results = staticmethod(lambda x: x) dtype = None - def __init__(self, left, right, name): - self.name = name + def __init__(self, left, right, name, na_op): # need to make sure that we are aligning the data if isinstance(left, pd.Series) and isinstance(right, pd.Series): left, right = left.align(right,copy=False) - self.left = left - self.right = right + lvalues = self._convert_to_array(left, name=name) + rvalues = self._convert_to_array(right, name=name, other=lvalues) - self.is_offset_lhs = self._is_offset(left) - self.is_offset_rhs = self._is_offset(right) + self.name = name + self.na_op = na_op - lvalues = self._convert_to_array(left, name=name) - self.is_timedelta_lhs = is_timedelta64_dtype(left) - self.is_datetime_lhs = is_datetime64_dtype(left) + # left + self.left = left + self.is_offset_lhs = self._is_offset(left) + self.is_timedelta_lhs = is_timedelta64_dtype(lvalues) + self.is_datetime64_lhs = is_datetime64_dtype(lvalues) + self.is_datetime64tz_lhs = is_datetime64tz_dtype(lvalues) + self.is_datetime_lhs = self.is_datetime64_lhs or self.is_datetime64tz_lhs self.is_integer_lhs = left.dtype.kind in ['i', 'u'] - rvalues = self._convert_to_array(right, name=name, other=lvalues) - self.is_datetime_rhs = is_datetime64_dtype(rvalues) + # right + self.right = right + self.is_offset_rhs = self._is_offset(right) + self.is_datetime64_rhs = is_datetime64_dtype(rvalues) + self.is_datetime64tz_rhs = is_datetime64tz_dtype(rvalues) + self.is_datetime_rhs = self.is_datetime64_rhs or self.is_datetime64tz_rhs self.is_timedelta_rhs = is_timedelta64_dtype(rvalues) self.is_integer_rhs = rvalues.dtype.kind in ('i', 'u') - self._validate() + self._validate(lvalues, rvalues, name) + self.lvalues, self.rvalues = self._convert_for_datetime(lvalues, rvalues) - 
self._convert_for_datetime(lvalues, rvalues) - - def _validate(self): + def _validate(self, lvalues, rvalues, name): # timedelta and integer mul/div - if (self.is_timedelta_lhs and self.is_integer_rhs) or\ - (self.is_integer_lhs and self.is_timedelta_rhs): + if (self.is_timedelta_lhs and self.is_integer_rhs) or ( + self.is_integer_lhs and self.is_timedelta_rhs): - if self.name not in ('__truediv__', '__div__', '__mul__'): + if name not in ('__div__', '__truediv__', '__mul__'): raise TypeError("can only operate on a timedelta and an " "integer for division, but the operator [%s]" - "was passed" % self.name) + "was passed" % name) # 2 datetimes elif self.is_datetime_lhs and self.is_datetime_rhs: - if self.name != '__sub__': + + if name not in ('__sub__','__rsub__'): raise TypeError("can only operate on a datetimes for" " subtraction, but the operator [%s] was" - " passed" % self.name) + " passed" % name) + + # if tz's must be equal (same or None) + if getattr(lvalues,'tz',None) != getattr(rvalues,'tz',None): + raise ValueError("Incompatbile tz's on datetime subtraction ops") # 2 timedeltas elif ((self.is_timedelta_lhs and @@ -319,29 +333,29 @@ def _validate(self): (self.is_timedelta_rhs and (self.is_timedelta_lhs or self.is_offset_lhs))): - if self.name not in ('__div__', '__truediv__', '__add__', - '__sub__'): + if name not in ('__div__', '__rdiv__', '__truediv__', '__rtruediv__', + '__add__', '__radd__', '__sub__', '__rsub__'): raise TypeError("can only operate on a timedeltas for " "addition, subtraction, and division, but the" - " operator [%s] was passed" % self.name) + " operator [%s] was passed" % name) # datetime and timedelta/DateOffset elif (self.is_datetime_lhs and (self.is_timedelta_rhs or self.is_offset_rhs)): - if self.name not in ('__add__', '__sub__'): + if name not in ('__add__', '__radd__', '__sub__'): raise TypeError("can only operate on a datetime with a rhs of" " a timedelta/DateOffset for addition and subtraction," " but the operator [%s] was passed" % - self.name) + name) elif ((self.is_timedelta_lhs or self.is_offset_lhs) and self.is_datetime_rhs): - if self.name != '__add__': + if name not in ('__add__', '__radd__'): raise TypeError("can only operate on a timedelta/DateOffset and" " a datetime for addition, but the operator" - " [%s] was passed" % self.name) + " [%s] was passed" % name) else: raise TypeError('cannot operate on a series with out a rhs ' 'of a series/ndarray of type datetime64[ns] ' @@ -351,8 +365,10 @@ def _convert_to_array(self, values, name=None, other=None): """converts values to ndarray""" from pandas.tseries.timedeltas import to_timedelta + ovalues = values if not is_list_like(values): values = np.array([values]) + inferred_type = lib.infer_dtype(values) if inferred_type in ('datetime64', 'datetime', 'date', 'time'): @@ -366,6 +382,13 @@ def _convert_to_array(self, values, name=None, other=None): # a datelike elif isinstance(values, pd.DatetimeIndex): values = values.to_series() + # datetime with tz + elif isinstance(ovalues, datetime.datetime) and hasattr(ovalues,'tz'): + values = pd.DatetimeIndex(values) + # datetime array with tz + elif com.is_datetimetz(values): + if isinstance(values, pd.Series): + values = values.values elif not (isinstance(values, (np.ndarray, pd.Series)) and is_datetime64_dtype(values)): values = tslib.array_to_datetime(values) @@ -400,19 +423,25 @@ def _convert_to_array(self, values, name=None, other=None): def _convert_for_datetime(self, lvalues, rvalues): from pandas.tseries.timedeltas import to_timedelta - mask = None 
+ + mask = isnull(lvalues) | isnull(rvalues) + # datetimes require views if self.is_datetime_lhs or self.is_datetime_rhs: + # datetime subtraction means timedelta if self.is_datetime_lhs and self.is_datetime_rhs: self.dtype = 'timedelta64[ns]' + elif self.is_datetime64tz_lhs: + self.dtype = lvalues.dtype + elif self.is_datetime64tz_rhs: + self.dtype = rvalues.dtype else: self.dtype = 'datetime64[ns]' - mask = isnull(lvalues) | isnull(rvalues) # if adding single offset try vectorized path # in DatetimeIndex; otherwise elementwise apply - if self.is_offset_lhs: + def _offset(lvalues, rvalues): if len(lvalues) == 1: rvalues = pd.DatetimeIndex(rvalues) lvalues = lvalues[0] @@ -420,22 +449,31 @@ def _convert_for_datetime(self, lvalues, rvalues): warnings.warn("Adding/subtracting array of DateOffsets to Series not vectorized", PerformanceWarning) rvalues = rvalues.astype('O') + + # pass thru on the na_op + self.na_op = lambda x, y: getattr(x,self.name)(y) + return lvalues, rvalues + + + if self.is_offset_lhs: + lvalues, rvalues = _offset(lvalues, rvalues) elif self.is_offset_rhs: - if len(rvalues) == 1: - lvalues = pd.DatetimeIndex(lvalues) - rvalues = rvalues[0] - else: - warnings.warn("Adding/subtracting array of DateOffsets to Series not vectorized", - PerformanceWarning) - lvalues = lvalues.astype('O') + rvalues, lvalues = _offset(rvalues, lvalues) else: + + # with tz, convert to UTC + if self.is_datetime64tz_lhs: + lvalues = lvalues.tz_localize(None) + if self.is_datetime64tz_rhs: + rvalues = rvalues.tz_localize(None) + lvalues = lvalues.view(np.int64) rvalues = rvalues.view(np.int64) # otherwise it's a timedelta else: + self.dtype = 'timedelta64[ns]' - mask = isnull(lvalues) | isnull(rvalues) # convert Tick DateOffset to underlying delta if self.is_offset_lhs: @@ -458,15 +496,20 @@ def _convert_for_datetime(self, lvalues, rvalues): rvalues = rvalues.astype(np.float64) # if we need to mask the results - if mask is not None: - if mask.any(): - def f(x): + if mask.any(): + def f(x): + + # datetime64[ns]/timedelta64[ns] masking + try: x = np.array(x, dtype=self.dtype) - np.putmask(x, mask, self.fill_value) - return x - self.wrap_results = f - self.lvalues = lvalues - self.rvalues = rvalues + except TypeError: + x = np.array(x, dtype='datetime64[ns]') + + np.putmask(x, mask, self.fill_value) + return x + self.wrap_results = f + + return lvalues, rvalues def _is_offset(self, arr_or_obj): @@ -479,7 +522,7 @@ def _is_offset(self, arr_or_obj): return False @classmethod - def maybe_convert_for_time_op(cls, left, right, name): + def maybe_convert_for_time_op(cls, left, right, name, na_op): """ if ``left`` and ``right`` are appropriate for datetime arithmetic with operation ``name``, processes them and returns a ``_TimeOp`` object @@ -490,15 +533,12 @@ def maybe_convert_for_time_op(cls, left, right, name): """ # decide if we can do it is_timedelta_lhs = is_timedelta64_dtype(left) - is_datetime_lhs = is_datetime64_dtype(left) + is_datetime_lhs = is_datetime64_dtype(left) or is_datetime64tz_dtype(left) + if not (is_datetime_lhs or is_timedelta_lhs): return None - # rops are allowed. No need for special checks, just strip off - # r part. 
- if name.startswith('__r'): - name = "__" + name[3:] - return cls(left, right, name) + return cls(left, right, name, na_op) def _arith_method_SERIES(op, name, str_rep, fill_zeros=None, @@ -529,12 +569,12 @@ def na_op(x, y): result = com._fill_zeros(result, x, y, name, fill_zeros) return result - def wrapper(left, right, name=name): + def wrapper(left, right, name=name, na_op=na_op): if isinstance(right, pd.DataFrame): return NotImplemented - time_converted = _TimeOp.maybe_convert_for_time_op(left, right, name) + time_converted = _TimeOp.maybe_convert_for_time_op(left, right, name, na_op) if time_converted is None: lvalues, rvalues = left, right @@ -547,6 +587,7 @@ def wrapper(left, right, name=name): lvalues, rvalues = time_converted.lvalues, time_converted.rvalues dtype = time_converted.dtype wrap_results = time_converted.wrap_results + na_op = time_converted.na_op if isinstance(rvalues, pd.Series): rindex = getattr(rvalues,'index',rvalues) @@ -616,7 +657,10 @@ def na_op(x, y): # numpy does not like comparisons vs None if isscalar(y) and isnull(y): - y = np.nan + if name == '__ne__': + return np.ones(len(x), dtype=bool) + else: + return np.zeros(len(x), dtype=bool) # we have a datetime/timedelta and may need to convert mask = None @@ -642,7 +686,7 @@ def na_op(x, y): result = op(x, y) if mask is not None and mask.any(): - result[mask] = False + result[mask] = masker return result diff --git a/pandas/core/series.py b/pandas/core/series.py index 87fde996aaa67..0da2e4ac9eb58 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -17,10 +17,12 @@ _default_index, _maybe_upcast, _asarray_tuplesafe, _infer_dtype_from_scalar, is_list_like, _values_from_object, + is_categorical_dtype, is_datetime64tz_dtype, + needs_i8_conversion, i8_boxer, _possibly_cast_to_datetime, _possibly_castable, _possibly_convert_platform, _try_sort, - is_int64_dtype, - ABCSparseArray, _maybe_match_name, + is_int64_dtype, is_internal_type, is_datetimetz, + _maybe_match_name, ABCSparseArray, _coerce_to_dtype, SettingWithCopyError, _maybe_box_datetimelike, ABCDataFrame, _dict_compat) @@ -307,11 +309,29 @@ def ftypes(self): @property def values(self): """ - Return Series as ndarray + Return Series as ndarray or ndarray-like + depending on the dtype Returns ------- - arr : numpy.ndarray + arr : numpy.ndarray or ndarray-like + + Examples + -------- + >>> pd.Series([1, 2, 3]).values + array([1, 2, 3]) + + >>> pd.Series(list('aabc')).values + array(['a', 'a', 'b', 'c'], dtype=object) + + >>> pd.Series(list('aabc')).astype('category').values + [a, a, b, c] + Categories (3, object): [a, b, c] + + >>> pd.Series(pd.date_range('20130101',periods=3,tz='US/Eastern')).values + DatetimeIndex(['2013-01-01 00:00:00-05:00', '2013-01-02 00:00:00-05:00', '2013-01-03 00:00:00-05:00'], + dtype='datetime64[ns]', freq='D', tz='US/Eastern') + """ return self._data.values @@ -932,7 +952,7 @@ def _repr_footer(self): return u('%s%sLength: %d') % (freqstr, namestr, len(self)) # Categorical - if com.is_categorical_dtype(self.dtype): + if is_categorical_dtype(self.dtype): level_info = self.values._repr_categories_info() return u('%sLength: %d, dtype: %s\n%s') % (namestr, len(self), @@ -1017,12 +1037,11 @@ def _get_repr( return result def __iter__(self): - if com.is_categorical_dtype(self.dtype): - return iter(self.values) - elif np.issubdtype(self.dtype, np.datetime64): - return (lib.Timestamp(x) for x in self.values) - elif np.issubdtype(self.dtype, np.timedelta64): - return (lib.Timedelta(x) for x in self.values) + """ provide iteration 
over the values of the Series + box values if necessary """ + if needs_i8_conversion(self.dtype): + boxer = i8_boxer(self) + return (boxer(x) for x in self.values) else: return iter(self.values) @@ -2119,8 +2138,8 @@ def _maybe_box(self, func, dropna=False): else: values = self.values - if com.needs_i8_conversion(self): - boxer = com.i8_boxer(self) + if needs_i8_conversion(self): + boxer = i8_boxer(self) if len(values) == 0: return boxer(tslib.iNaT) @@ -2558,7 +2577,7 @@ def _make_dt_accessor(self): # Categorical methods def _make_cat_accessor(self): - if not com.is_categorical_dtype(self.dtype): + if not is_categorical_dtype(self.dtype): raise AttributeError("Can only use .cat accessor with a " "'category' dtype") return CategoricalAccessor(self.values, self.index) @@ -2599,6 +2618,9 @@ def remove_na(series): def _sanitize_index(data, index, copy=False): """ sanitize an index type to return an ndarray of the underlying, pass thru a non-Index """ + if index is None: + return data + if len(data) != len(index): raise ValueError('Length of values does not match length of ' 'index') @@ -2640,10 +2662,11 @@ def _try_cast(arr, take_fast_path): return arr try: - arr = _possibly_cast_to_datetime(arr, dtype) - subarr = np.array(arr, dtype=dtype, copy=copy) + subarr = _possibly_cast_to_datetime(arr, dtype) + if not is_internal_type(subarr): + subarr = np.array(subarr, dtype=dtype, copy=copy) except (ValueError, TypeError): - if com.is_categorical_dtype(dtype): + if is_categorical_dtype(dtype): subarr = Categorical(arr) elif dtype is not None and raise_cast_failure: raise @@ -2664,15 +2687,7 @@ def _try_cast(arr, take_fast_path): elif copy: subarr = data.copy() else: - if (com.is_datetime64_dtype(data.dtype) and - not com.is_datetime64_dtype(dtype)): - if dtype == object: - ints = np.asarray(data).view('i8') - subarr = tslib.ints_to_pydatetime(ints) - elif raise_cast_failure: - raise TypeError('Cannot cast datetime64 to %s' % dtype) - else: - subarr = _try_cast(data, True) + subarr = _try_cast(data, True) elif isinstance(data, Index): # don't coerce Index types # e.g. 
indexes can have different conversions (so don't fast path them)

@@ -2709,6 +2724,19 @@ def _try_cast(arr, take_fast_path):
     else:
         subarr = _try_cast(data, False)

+    def create_from_value(value, index, dtype):
+        # return a new empty value suitable for the dtype
+
+        if is_datetimetz(dtype):
+            subarr = DatetimeIndex([value]*len(index))
+        else:
+            if not isinstance(dtype, (np.dtype, type(np.dtype))):
+                dtype = dtype.dtype
+            subarr = np.empty(len(index), dtype=dtype)
+            subarr.fill(value)
+
+        return subarr
+
     # scalar like
     if subarr.ndim == 0:
         if isinstance(data, list):  # pragma: no cover
@@ -2723,8 +2751,7 @@ def _try_cast(arr, take_fast_path):
             # need to possibly convert the value here
             value = _possibly_cast_to_datetime(value, dtype)
-            subarr = np.empty(len(index), dtype=dtype)
-            subarr.fill(value)
+            subarr = create_from_value(value, index, dtype)
         else:
             return subarr.item()
@@ -2735,9 +2762,7 @@ def _try_cast(arr, take_fast_path):

     # a 1-element ndarray
     if len(subarr) != len(index) and len(subarr) == 1:
-        value = subarr[0]
-        subarr = np.empty(len(index), dtype=subarr.dtype)
-        subarr.fill(value)
+        subarr = create_from_value(subarr[0], index, subarr)
     elif subarr.ndim > 1:
         if isinstance(data, np.ndarray):
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 2c9ffe6b74536..551e917fc3745 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -1800,6 +1800,8 @@ def set_atom(self, block, block_items, existing_col, min_itemsize,
         # short-cut certain block types
         if block.is_categorical:
             return self.set_atom_categorical(block, items=block_items, info=info)
+        elif block.is_datetimetz:
+            return self.set_atom_datetime64tz(block, info=info)
         elif block.is_datetime:
             return self.set_atom_datetime64(block)
         elif block.is_timedelta:
@@ -1814,50 +1816,14 @@ def set_atom(self, block, block_items, existing_col, min_itemsize,
             raise TypeError(
                 "[date] is not implemented as a table column")
         elif inferred_type == 'datetime':
-            rvalues = block.values.ravel()
-            if getattr(rvalues[0], 'tzinfo', None) is not None:

+            # after 8260
+            # this would only be hit for a multi-timezone dtype
+            # which is an error

-                # if this block has more than one timezone, raise
-                try:
-                    # pytz timezones: compare on zone name (to avoid issues with DST being a different zone to STD).
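# --- illustrative aside, not part of the patch ---------------------------
# What create_from_value above buys: broadcasting a scalar tz-aware
# Timestamp over an index now produces a datetime64[ns, tz] column
# instead of falling back to object dtype.  A sketch, assuming a pandas
# build with this patch:
import pandas as pd

s = pd.Series(pd.Timestamp('20130102', tz='US/Eastern'), index=range(3))
assert str(s.dtype) == 'datetime64[ns, US/Eastern]'
# --------------------------------------------------------------------------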
- zones = [r.tzinfo.zone for r in rvalues] - except: - # dateutil timezones: compare on == - zones = [r.tzinfo for r in rvalues] - if any(zones[0] != zone_i for zone_i in zones[1:]): - raise TypeError( - "too many timezones in this block, create separate " - "data columns" - ) - else: - if len(set(zones)) != 1: - raise TypeError( - "too many timezones in this block, create separate " - "data columns" - ) - - # convert this column to datetime64[ns] utc, and save the tz - index = DatetimeIndex(rvalues) - tz = getattr(index, 'tz', None) - if tz is None: - raise TypeError( - "invalid timezone specification") - - values = index.tz_convert('UTC').values.view('i8') - - # store a converted timezone - zone = tslib.get_timezone(index.tz) - if zone is None: - zone = tslib.tot_seconds(index.tz.utcoffset()) - self.tz = zone - - self.update_info(info) - self.set_atom_datetime64( - block, values.reshape(block.values.shape)) - - else: - raise TypeError( - "[datetime] is not implemented as a table column") + raise TypeError( + "too many timezones in this block, create separate " + "data columns" + ) elif inferred_type == 'unicode': raise TypeError( "[unicode] is not implemented as a table column") @@ -1986,6 +1952,25 @@ def set_atom_datetime64(self, block, values=None): values = block.values.view('i8') self.set_data(values, 'datetime64') + def set_atom_datetime64tz(self, block, info, values=None): + + if values is None: + values = block.values + + # convert this column to datetime64[ns] utc, and save the tz + values = values.tz_convert('UTC').values.view('i8').reshape(block.shape) + + # store a converted timezone + zone = tslib.get_timezone(block.values.tz) + if zone is None: + zone = tslib.tot_seconds(block.values.tz.utcoffset()) + self.tz = zone + self.update_info(info) + + self.kind = 'datetime64' + self.typ = self.get_atom_datetime64(block) + self.set_data(values, 'datetime64') + def get_atom_timedelta64(self, block): return _tables().Int64Col(shape=block.shape[0]) @@ -2047,9 +2032,8 @@ def convert(self, values, nan_rep, encoding): # we stored as utc, so just set the tz index = DatetimeIndex( - self.data.ravel(), tz='UTC').tz_convert(self.tz) - self.data = np.asarray( - index.tolist(), dtype=object).reshape(self.data.shape) + self.data.ravel(), tz='UTC').tz_convert(tslib.maybe_get_tz(self.tz)) + self.data = index else: self.data = np.asarray(self.data, dtype='M8[ns]') @@ -4057,7 +4041,7 @@ def read(self, where=None, columns=None, **kwargs): cols_ = cols # if we have a DataIndexableCol, its shape will only be 1 dim - if values.ndim == 1: + if values.ndim == 1 and isinstance(values, np.ndarray): values = values.reshape((1, values.shape[0])) block = make_block(values, placement=np.arange(len(cols_))) diff --git a/pandas/io/tests/data/legacy_hdf/datetimetz_object.h5 b/pandas/io/tests/data/legacy_hdf/datetimetz_object.h5 new file mode 100644 index 0000000000000000000000000000000000000000..8cb4eda470398cf22278829cc91b531dbdca661a GIT binary patch literal 106271 zcmeI5U2I%O6~|}SakAMsjgyGth8AuMt=$%!wbQ1CN@&;drZzaX@j7W*R5qLJb>eot zUU}DPT2z`)DpFd&2vJ29rFq~XKpzZJC5sm$=!#~0oR=qL%x z{X_PHxvSJ^8$K5_9|CH<;(IMNzS-vQG?<9@Z2ZV@Jg!Jm@|UZ)L8Pa%oG^`;`w z8;=cfgAdwPi>vxF$jr5K5lLD*lR531F z%9mpO@_YJYi3H2jPoqcvk>SDp+|ZId=N?iA;=_r)!2vDLLr3!OXpc~c<-7DXKqHOi zhaMY=b@#^;$-RTa`(h)>YUw-MJ8ZhGI#2&NZMFl>S*OL+V&imZvyrpOOaSMbhJb&Z zI=1CGYrh3VA$2(8QXn~&l#lwB=(W*ikzV|o7#QpckOqt1g{;0xei0iU^3`MUoNq1( zv3Gu6C42N~vv*C}-sMu+bSj@LJdPRbQq4%&le!8`s+aEaeUpAHn|4nm zk4$9pI$EdyoW5_uZJAt0xUDY|PPA&#)^M&fT*&PTr`okZlZxtVfMx);`^Qt6DK}rr 
zkmr|e^ds%i`6XIot>ILnwVTc+8%izpGCZ;q*`%pk>V!*m<9r-{K5czew+vOto1v*( zsa~6Pz0#>difc7Bp4BZ~dcE4HRIb072eYYca?;I}&4U!LV*5oOHfWL=_qfY_!t)>frz5J*l1{Gv^2XglircD%>BQxE^+@xlGMm(l z&970{b1XYL!4WJ!O+S)RIYnx!$2r$86?+E>E^sXRrJ5^y1r@KMfnPke*~Iyxx7lE!s( zYLQ*xd^@i%xNgn1g0h}Ja|=b%F>lxP4eFehyOqMcrqH^iC2F6%ykl0>@UNcEo2J`} z@ffUic+;>P_X0O5*kI6yY&#o*!9zTL==d!?B+2rK#+0cQr@7k{obl0<35H8ZexAjm(EU7y9QT?|Rxh-t;P%(v{?m9r3P* zd1)a1)7kzI7vqul&39H-F@2n%eaQ4ohPZ3T7 zhBQsxn(XR%~juSvKZ z-ei$-XpV3TW8?1b2Pl#bfBm{^*RMEp`|H=8tgn9Eb=Pmb)|-Lh!ABER?`rxY>SmXnrgtoiVx!AP(eRoe&L98+AOHd&00K860cU3XukPQsskxVvw_R@7 zF6$=ci|4hmK78@^F;w~G?KWUpykvtb2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?x zfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=9 z00@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p z2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?x zfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=9 z00@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p z2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?x zfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfWRk1AfOsmfQoLh#W$;F z{V93}$0{|i&q|F9$K#45ulf#qTg^L0J|q#i-c$s7Z=|0TSZ$}DyXi;D>*M;k zdFRMPwvZajx|7c6#H3S59nQGU&h+R`{fXq8R2!W+oyl)ce=L!pFz2^k$J2?9k-mYR zI6)}MbLk;Sv8ywK`4J0S!ceF=zx`4Vx$F=B?qE?x4*nF3f%mvL2GY*4Q$Kmwd zMm}ouaUL+?db!%KO=_8kgF&MW@S8<=n}`nTaD$<;R|N*$!4tuM}&g zmrGcu_axQ#tEV>xwN$=IpU;dPaZS<1^rj?7hlR6JmD3+NHkGwiubtjxN_Ccg-Kgt7 zkt<}}<8DSLrc*V3Jm>IHN^j~v>87WS6vif~%k@z$y|g2sHW}Tq5doR!f+Arvvh9`S zpR`}}_53|Xe%6XNQm=DNu)$=%iQ38GfSt#J`hGy=k4=q^W?ZWytRI{%pwjMWCRK2C z8qP9gSEvoa2C0HBkT7cX+Gca$$U9zqb$>}8%k_N|*Ml>qLr>SS;o;b0CT#3jqt#}m zdfP?P-1nc`Y{Z!_*Op^F+i)6RiZX3EV|d@0=qJ+9&hM|%49nfg>7&jwLIg>{}YHsgB7L$TeI zowuBN$AdKwRF}HPgHF0!=^vHyJYR=xda=DfP>8P9cGjq9>!dt!TN30Sgc_4i*ycR{88+e~+) z|Atf9qi$)qU3Y(-F7{W8Dc`OSbbwA`^jE)sf>N|{WGO=RDIZ^ECJ=7k=fe#oFWjVW zma|{Hc5t7Y9!q7NN$#HIu!D<@qvxq@d{z(4=h|qdDjD8J44&ZBg2D_dRA=3EOk5NdDPJ%I^8c8O(GUv z&FV?1p?hn~Nhv44R9mjsX5YN4wp^E$ySTHq+``(fUP$z$*>;{hXP3rL`96K+`6H?; zE$ck*^Qc32r1kx&e8Cx>c-qOE2h-#F!PL25jF;^9O9H>1%l?X7rT#g`lpy_5ub%6N zLtW46&y`#P>G`FoZ49N6WGklURHQx(F2(POcq2~9LU!DK5CfC4AD*?>aeFoC`-dez zn@=%9`bYSQT)kK6k7H%}WJW?I{n$!{NclzmkcQd{R9iRu_xZAJ#vOOF1-&?-wEle- zDTnuV`6%W~pG{{v98K#Qsjt~TtT$hNAI1B1vi_HAROeoPnt5GnX75?e5_?ph0x=)} z0w4eaAOHeuoj|qq!o5`*cB#sGp|~Kek`B=z00JNY0w4eaYnOod_xk<)O8d*}X%8c$ zdisV(*oBApLA?7LzTF1esi^E{8|gQaOZBsz88q@yi#GZl6PA6@wM!f|UkZV0ezt8X zqX)7-c;1Gub-&qP7s)qj{bYx%KxrSZ{JrKjk}e7b*Z!fgqHic?QQ8MDe~$$J zQ052!P}Y-r{ozDkuM8OtoaIdDsQ0~v>d7}7q$7F`F3qNE4J#F;`<2d9n-g_#w9zn)c0`<`_cP8 z&ij5E{-Ia3BGB4MFOQaMJk;M`rk^XlAA^5rO@DXzhmw^le@|{qm%MkZW>;2jzJY%z z_dvCEf%t<;|44ZC`w9M`JORKz6#k+0{l3+w-@bX*>+laXt7#tV=J(*t6Ns$0yncu` zlYi*xq$Ukso~SL?C3Shcww#Eq_Eq*NBDVA7IeR&uZyairw7%&7x6J;b=k4#;OTW~s z=VtGm>$#|ZX!BFHG4f2weYR41&Uy2(*8D?Hm+6xk31W}E`xQ5nf9T_;JIl_u{YplN zy=4_r>`_?)M1TMYfB*=900^vk0`>Zb=Bun1WF5QafuI2(00JNY0w4eaWdy`O)c^gG z%k3XJblmJqEY&~s?PrX9)YkZreeqV@2Q6y}B5ouC)%-(yUp9In`^{Hu_*(ay{mdla zL-zNkhO9vS{-NFQ4<&Z&muL1Kqnc!CchY=gT!}0ELm5ytKM?sI3-JSyu;^>ccMaKD zO?hSB&vDkP{r#b*zhBZA+q4CjHXAyBvbG%SAs=sBP9{`-7CX}QquTjJZPM8nYs;0F zusSi-{DbO$9~aRTuJpbI{-G7egWfu+=RKa+|HAJtU4MT?tM(WCL*XClw^s6fz3^+h zcE2{S9mMyCYCm~=f2i+!_hi#}mr4{FPm*LZbe zWS?T_6;nTX&R)*v8;6=AtzRks&@1-uDoMZ8tLO4wz1DoqZ6H05{K7Vd(nzuj^Yy~# zbMJiZolh70o?Ov_wdNmsu1ufINRagl{6nQ@&Gna6{5`}TEtEzG1V8`;KmY_l;6@-& zuYc(Os;n1^3(^|_3fceyAOHd&00JPefPnai`v0!@wfcvi2>kc$jkGZpT6gxtf4}kW z?b+s2BYzuxA<(eCsU>n~?qszOm!7_;F1)L@aQXmmZ2kG{<-zeS8@9Ij(w(||D$q>( zfGkLV+xJd&GP>)`+~iM=?|SJUmp?ky5^c!7@(m~OyScmm^Suij&o#FHX7g)@QV%B6 0, 'Msgpack files are not tested' diff --git 
a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index e691fac215002..16a2fad65c614 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -8,6 +8,8 @@ import nose import os +from distutils.version import LooseVersion + import numpy as np import pandas.util.testing as tm import pandas as pd @@ -39,7 +41,7 @@ def setUp(self): self.data = create_pickle_data() self.path = u('__%s__.pickle' % tm.rands(10)) - def compare_element(self, typ, result, expected): + def compare_element(self, result, expected, typ, version=None): if isinstance(expected,Index): tm.assert_index_equal(expected, result) return @@ -51,7 +53,7 @@ def compare_element(self, typ, result, expected): comparator = getattr(tm,"assert_%s_equal" % typ) comparator(result,expected) - def compare(self, vf): + def compare(self, vf, version): # py3 compat when reading py2 pickle try: @@ -70,9 +72,28 @@ def compare(self, vf): except (KeyError): continue - self.compare_element(typ, result, expected) + # use a specific comparator + # if available + comparator = getattr(self,"compare_{typ}_{dt}".format(typ=typ,dt=dt), self.compare_element) + comparator(result, expected, typ, version) return data + def compare_series_dt_tz(self, result, expected, typ, version): + # 8260 + # dtype is object < 0.17.0 + if LooseVersion(version) < '0.17.0': + tm.assert_series_equal(result, expected, check_dtype=False) + else: + tm.assert_series_equal(result, expected) + + def compare_frame_dt_mixed_tzs(self, result, expected, typ, version): + # 8260 + # dtype is object < 0.17.0 + if LooseVersion(version) < '0.17.0': + tm.assert_frame_equal(result, expected, check_dtype=False) + else: + tm.assert_frame_equal(result, expected) + def read_pickles(self, version): if not is_little_endian(): raise nose.SkipTest("known failure on non-little endian") @@ -81,7 +102,7 @@ def read_pickles(self, version): n = 0 for f in os.listdir(pth): vf = os.path.join(pth, f) - data = self.compare(vf) + data = self.compare(vf, version) if data is None: continue @@ -144,14 +165,14 @@ def python_unpickler(path): # test reading with each unpickler result = pd.read_pickle(path) - self.compare_element(typ, result, expected) + self.compare_element(result, expected, typ) if c_unpickler is not None: result = c_unpickler(path) - self.compare_element(typ, result, expected) + self.compare_element(result, expected, typ) result = python_unpickler(path) - self.compare_element(typ, result, expected) + self.compare_element(result, expected, typ) def _validate_timeseries(self, pickled, current): # GH 7748 diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 210852d83094f..44679aa0491a3 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -156,7 +156,7 @@ def tearDown(self): pass -class TestHDFStore(Base): +class TestHDFStore(Base, tm.TestCase): def test_factory_fun(self): path = create_tempfile(self.path) @@ -1040,7 +1040,7 @@ def test_append_all_nans(self): store.append('df2', df[10:], dropna=False) tm.assert_frame_equal(store['df2'], df) - # Test to make sure defaults are to not drop. + # Test to make sure defaults are to not drop. 
# Corresponding to Issue 9382 df_with_missing = DataFrame({'col1':[0, np.nan, 2], 'col2':[1, np.nan, np.nan]}) @@ -1059,7 +1059,7 @@ def test_append_all_nans(self): with ensure_clean_path(self.path) as path: panel_with_missing.to_hdf(path, 'panel_with_missing', format='table') - reloaded_panel = read_hdf(path, 'panel_with_missing') + reloaded_panel = read_hdf(path, 'panel_with_missing') tm.assert_panel_equal(panel_with_missing, reloaded_panel) def test_append_frame_column_oriented(self): @@ -1927,73 +1927,6 @@ def test_unimplemented_dtypes_table_columns(self): # this fails because we have a date in the object block...... self.assertRaises(TypeError, store.append, 'df_unimplemented', df) - def test_append_with_timezones_pytz(self): - - from datetime import timedelta - - def compare(a,b): - tm.assert_frame_equal(a,b) - - # compare the zones on each element - for c in a.columns: - for i in a.index: - a_e = a[c][i] - b_e = b[c][i] - if not (a_e == b_e and a_e.tz == b_e.tz): - raise AssertionError("invalid tz comparsion [%s] [%s]" % (a_e,b_e)) - - # as columns - with ensure_clean_store(self.path) as store: - - _maybe_remove(store, 'df_tz') - df = DataFrame(dict(A = [ Timestamp('20130102 2:00:00',tz='US/Eastern') + timedelta(hours=1)*i for i in range(5) ])) - store.append('df_tz',df,data_columns=['A']) - result = store['df_tz'] - compare(result,df) - assert_frame_equal(result,df) - - # select with tz aware - compare(store.select('df_tz',where=Term('A>=df.A[3]')),df[df.A>=df.A[3]]) - - _maybe_remove(store, 'df_tz') - # ensure we include dates in DST and STD time here. - df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130603',tz='US/Eastern')),index=range(5)) - store.append('df_tz',df) - result = store['df_tz'] - compare(result,df) - assert_frame_equal(result,df) - - _maybe_remove(store, 'df_tz') - df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='EET')),index=range(5)) - self.assertRaises(TypeError, store.append, 'df_tz', df) - - # this is ok - _maybe_remove(store, 'df_tz') - store.append('df_tz',df,data_columns=['A','B']) - result = store['df_tz'] - compare(result,df) - assert_frame_equal(result,df) - - # can't append with diff timezone - df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='CET')),index=range(5)) - self.assertRaises(ValueError, store.append, 'df_tz', df) - - # as index - with ensure_clean_store(self.path) as store: - - # GH 4098 example - df = DataFrame(dict(A = Series(lrange(3), index=date_range('2000-1-1',periods=3,freq='H', tz='US/Eastern')))) - - _maybe_remove(store, 'df') - store.put('df',df) - result = store.select('df') - assert_frame_equal(result,df) - - _maybe_remove(store, 'df') - store.append('df',df) - result = store.select('df') - assert_frame_equal(result,df) - def test_calendar_roundtrip_issue(self): # 8591 @@ -2016,128 +1949,6 @@ def test_calendar_roundtrip_issue(self): result = store.select('table') assert_series_equal(result, s) - def test_append_with_timezones_dateutil(self): - - from datetime import timedelta - tm._skip_if_no_dateutil() - - # use maybe_get_tz instead of dateutil.tz.gettz to handle the windows filename issues. 
- from pandas.tslib import maybe_get_tz - gettz = lambda x: maybe_get_tz('dateutil/' + x) - - def compare(a, b): - tm.assert_frame_equal(a, b) - - # compare the zones on each element - for c in a.columns: - for i in a.index: - a_e = a[c][i] - b_e = b[c][i] - if not (a_e == b_e and a_e.tz == b_e.tz): - raise AssertionError("invalid tz comparsion [%s] [%s]" % (a_e, b_e)) - - # as columns - with ensure_clean_store(self.path) as store: - - _maybe_remove(store, 'df_tz') - df = DataFrame(dict(A=[ Timestamp('20130102 2:00:00', tz=gettz('US/Eastern')) + timedelta(hours=1) * i for i in range(5) ])) - store.append('df_tz', df, data_columns=['A']) - result = store['df_tz'] - compare(result, df) - assert_frame_equal(result, df) - - # select with tz aware - compare(store.select('df_tz', where=Term('A>=df.A[3]')), df[df.A >= df.A[3]]) - - _maybe_remove(store, 'df_tz') - # ensure we include dates in DST and STD time here. - df = DataFrame(dict(A=Timestamp('20130102', tz=gettz('US/Eastern')), B=Timestamp('20130603', tz=gettz('US/Eastern'))), index=range(5)) - store.append('df_tz', df) - result = store['df_tz'] - compare(result, df) - assert_frame_equal(result, df) - - _maybe_remove(store, 'df_tz') - df = DataFrame(dict(A=Timestamp('20130102', tz=gettz('US/Eastern')), B=Timestamp('20130102', tz=gettz('EET'))), index=range(5)) - self.assertRaises(TypeError, store.append, 'df_tz', df) - - # this is ok - _maybe_remove(store, 'df_tz') - store.append('df_tz', df, data_columns=['A', 'B']) - result = store['df_tz'] - compare(result, df) - assert_frame_equal(result, df) - - # can't append with diff timezone - df = DataFrame(dict(A=Timestamp('20130102', tz=gettz('US/Eastern')), B=Timestamp('20130102', tz=gettz('CET'))), index=range(5)) - self.assertRaises(ValueError, store.append, 'df_tz', df) - - # as index - with ensure_clean_store(self.path) as store: - - # GH 4098 example - df = DataFrame(dict(A=Series(lrange(3), index=date_range('2000-1-1', periods=3, freq='H', tz=gettz('US/Eastern'))))) - - _maybe_remove(store, 'df') - store.put('df', df) - result = store.select('df') - assert_frame_equal(result, df) - - _maybe_remove(store, 'df') - store.append('df', df) - result = store.select('df') - assert_frame_equal(result, df) - - def test_store_timezone(self): - # GH2852 - # issue storing datetime.date with a timezone as it resets when read back in a new timezone - - # timezone setting not supported on windows - tm._skip_if_windows() - - import datetime - import time - import os - - # original method - with ensure_clean_store(self.path) as store: - - today = datetime.date(2013,9,10) - df = DataFrame([1,2,3], index = [today, today, today]) - store['obj1'] = df - result = store['obj1'] - assert_frame_equal(result, df) - - # with tz setting - orig_tz = os.environ.get('TZ') - - def setTZ(tz): - if tz is None: - try: - del os.environ['TZ'] - except: - pass - else: - os.environ['TZ']=tz - time.tzset() - - try: - - with ensure_clean_store(self.path) as store: - - setTZ('EST5EDT') - today = datetime.date(2013,9,10) - df = DataFrame([1,2,3], index = [today, today, today]) - store['obj1'] = df - - setTZ('CST6CDT') - result = store['obj1'] - - assert_frame_equal(result, df) - - finally: - setTZ(orig_tz) - def test_append_with_timedelta(self): # GH 3577 # append timedelta @@ -2813,26 +2624,6 @@ def test_can_serialize_dates(self): self._check_roundtrip(frame, tm.assert_frame_equal) - def test_timezones(self): - rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern') - frame = DataFrame(np.random.randn(len(rng), 4), index=rng) - 
- with ensure_clean_store(self.path) as store: - store['frame'] = frame - recons = store['frame'] - self.assertTrue(recons.index.equals(rng)) - self.assertEqual(rng.tz, recons.index.tz) - - def test_fixed_offset_tz(self): - rng = date_range('1/1/2000 00:00:00-07:00', '1/30/2000 00:00:00-07:00') - frame = DataFrame(np.random.randn(len(rng), 4), index=rng) - - with ensure_clean_store(self.path) as store: - store['frame'] = frame - recons = store['frame'] - self.assertTrue(recons.index.equals(rng)) - self.assertEqual(rng.tz, recons.index.tz) - def test_store_hierarchical(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], @@ -4232,35 +4023,25 @@ def f(): def test_pytables_native_read(self): - try: - store = HDFStore(tm.get_data_path('legacy_hdf/pytables_native.h5'), 'r') + with ensure_clean_store(tm.get_data_path('legacy_hdf/pytables_native.h5'), mode='r') as store: d2 = store['detector/readout'] - assert isinstance(d2, DataFrame) - finally: - safe_close(store) + self.assertIsInstance(d2, DataFrame) - try: - store = HDFStore(tm.get_data_path('legacy_hdf/pytables_native2.h5'), 'r') + with ensure_clean_store(tm.get_data_path('legacy_hdf/pytables_native2.h5'), mode='r') as store: str(store) d1 = store['detector'] - assert isinstance(d1, DataFrame) - finally: - safe_close(store) + self.assertIsInstance(d1, DataFrame) def test_legacy_read(self): - try: - store = HDFStore(tm.get_data_path('legacy_hdf/legacy.h5'), 'r') + with ensure_clean_store(tm.get_data_path('legacy_hdf/legacy.h5'), mode='r') as store: store['a'] store['b'] store['c'] store['d'] - finally: - safe_close(store) def test_legacy_table_read(self): # legacy table types - try: - store = HDFStore(tm.get_data_path('legacy_hdf/legacy_table.h5'), 'r') + with ensure_clean_store(tm.get_data_path('legacy_hdf/legacy_table.h5'), mode='r') as store: store.select('df1') store.select('df2') store.select('wp1') @@ -4278,24 +4059,17 @@ def test_legacy_table_read(self): expected = df2[df2.index > df2.index[2]] assert_frame_equal(expected, result) - finally: - safe_close(store) - def test_legacy_0_10_read(self): # legacy from 0.10 - try: - store = HDFStore(tm.get_data_path('legacy_hdf/legacy_0.10.h5'), 'r') + with ensure_clean_store(tm.get_data_path('legacy_hdf/legacy_0.10.h5'), mode='r') as store: str(store) for k in store.keys(): store.select(k) - finally: - safe_close(store) def test_legacy_0_11_read(self): # legacy from 0.11 - try: - path = os.path.join('legacy_hdf', 'legacy_table_0.11.h5') - store = HDFStore(tm.get_data_path(path), 'r') + path = os.path.join('legacy_hdf', 'legacy_table_0.11.h5') + with ensure_clean_store(tm.get_data_path(path), mode='r') as store: str(store) assert 'df' in store assert 'df1' in store @@ -4306,8 +4080,6 @@ def test_legacy_0_11_read(self): assert isinstance(df, DataFrame) assert isinstance(df1, DataFrame) assert isinstance(mi, DataFrame) - finally: - safe_close(store) def test_copy(self): @@ -4444,38 +4216,6 @@ def test_tseries_indices_frame(self): self.assertEqual(type(result.index), type(df.index)) self.assertEqual(result.index.freq, df.index.freq) - def test_tseries_select_index_column(self): - # GH7777 - # selecting a UTC datetimeindex column did - # not preserve UTC tzinfo set before storing - - # check that no tz still works - rng = date_range('1/1/2000', '1/30/2000') - frame = DataFrame(np.random.randn(len(rng), 4), index=rng) - - with ensure_clean_store(self.path) as store: - store.append('frame', frame) - result = store.select_column('frame', 'index') - 
self.assertEqual(rng.tz, DatetimeIndex(result.values).tz) - - # check utc - rng = date_range('1/1/2000', '1/30/2000', tz='UTC') - frame = DataFrame(np.random.randn(len(rng), 4), index=rng) - - with ensure_clean_store(self.path) as store: - store.append('frame', frame) - result = store.select_column('frame', 'index') - self.assertEqual(rng.tz, DatetimeIndex(result.values).tz) - - # double check non-utc - rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern') - frame = DataFrame(np.random.randn(len(rng), 4), index=rng) - - with ensure_clean_store(self.path) as store: - store.append('frame', frame) - result = store.select_column('frame', 'index') - self.assertEqual(rng.tz, DatetimeIndex(result.values).tz) - def test_unicode_index(self): unicode_values = [u('\u03c3'), u('\u03c3\u03c3')] @@ -4907,6 +4647,249 @@ def test_complex_append(self): result = store.select('df') assert_frame_equal(pd.concat([df, df], 0), result) +class TestTimezones(Base, tm.TestCase): + + + def _compare_with_tz(self, a, b): + tm.assert_frame_equal(a, b) + + # compare the zones on each element + for c in a.columns: + for i in a.index: + a_e = a.loc[i,c] + b_e = b.loc[i,c] + if not (a_e == b_e and a_e.tz == b_e.tz): + raise AssertionError("invalid tz comparsion [%s] [%s]" % (a_e, b_e)) + + def test_append_with_timezones_dateutil(self): + + from datetime import timedelta + tm._skip_if_no_dateutil() + + # use maybe_get_tz instead of dateutil.tz.gettz to handle the windows filename issues. + from pandas.tslib import maybe_get_tz + gettz = lambda x: maybe_get_tz('dateutil/' + x) + + # as columns + with ensure_clean_store(self.path) as store: + + _maybe_remove(store, 'df_tz') + df = DataFrame(dict(A=[ Timestamp('20130102 2:00:00', tz=gettz('US/Eastern')) + timedelta(hours=1) * i for i in range(5) ])) + + store.append('df_tz', df, data_columns=['A']) + result = store['df_tz'] + self._compare_with_tz(result, df) + assert_frame_equal(result, df) + + # select with tz aware + expected = df[df.A >= df.A[3]] + result = store.select('df_tz', where=Term('A>=df.A[3]')) + self._compare_with_tz(result, expected) + + # ensure we include dates in DST and STD time here. 
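# --- illustrative aside, not part of the patch ---------------------------
# Why the comment above insists on mixing DST and STD dates: the same
# zone carries a different UTC offset in January (EST, -05:00) than in
# June (EDT, -04:00), so a store that recorded only a fixed offset would
# corrupt one of the two rows.  Quick check:
import pandas as pd

jan = pd.Timestamp('20130102', tz='US/Eastern')
jun = pd.Timestamp('20130603', tz='US/Eastern')
assert jan.utcoffset() != jun.utcoffset()
# --------------------------------------------------------------------------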
+ _maybe_remove(store, 'df_tz') + df = DataFrame(dict(A=Timestamp('20130102', tz=gettz('US/Eastern')), B=Timestamp('20130603', tz=gettz('US/Eastern'))), index=range(5)) + store.append('df_tz', df) + result = store['df_tz'] + self._compare_with_tz(result, df) + assert_frame_equal(result, df) + + df = DataFrame(dict(A=Timestamp('20130102', tz=gettz('US/Eastern')), B=Timestamp('20130102', tz=gettz('EET'))), index=range(5)) + self.assertRaises(ValueError, store.append, 'df_tz', df) + + # this is ok + _maybe_remove(store, 'df_tz') + store.append('df_tz', df, data_columns=['A', 'B']) + result = store['df_tz'] + self._compare_with_tz(result, df) + assert_frame_equal(result, df) + + # can't append with diff timezone + df = DataFrame(dict(A=Timestamp('20130102', tz=gettz('US/Eastern')), B=Timestamp('20130102', tz=gettz('CET'))), index=range(5)) + self.assertRaises(ValueError, store.append, 'df_tz', df) + + # as index + with ensure_clean_store(self.path) as store: + + # GH 4098 example + df = DataFrame(dict(A=Series(lrange(3), index=date_range('2000-1-1', periods=3, freq='H', tz=gettz('US/Eastern'))))) + + _maybe_remove(store, 'df') + store.put('df', df) + result = store.select('df') + assert_frame_equal(result, df) + + _maybe_remove(store, 'df') + store.append('df', df) + result = store.select('df') + assert_frame_equal(result, df) + + def test_append_with_timezones_pytz(self): + + from datetime import timedelta + + # as columns + with ensure_clean_store(self.path) as store: + + _maybe_remove(store, 'df_tz') + df = DataFrame(dict(A = [ Timestamp('20130102 2:00:00',tz='US/Eastern') + timedelta(hours=1)*i for i in range(5) ])) + store.append('df_tz',df,data_columns=['A']) + result = store['df_tz'] + self._compare_with_tz(result,df) + assert_frame_equal(result,df) + + # select with tz aware + self._compare_with_tz(store.select('df_tz',where=Term('A>=df.A[3]')),df[df.A>=df.A[3]]) + + _maybe_remove(store, 'df_tz') + # ensure we include dates in DST and STD time here. 
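# --- illustrative aside, not part of the patch ---------------------------
# The "this is ok" case a few lines below: two columns with *different*
# zones append cleanly when both are data columns, since each data
# column stores its own tz.  A sketch (the store filename is
# hypothetical; requires PyTables):
import pandas as pd

df = pd.DataFrame(dict(A=pd.Timestamp('20130102', tz='US/Eastern'),
                       B=pd.Timestamp('20130102', tz='EET')), index=range(5))
with pd.HDFStore('tz_demo.h5', mode='w') as store:
    store.append('df_tz', df, data_columns=['A', 'B'])
    roundtripped = store['df_tz']
assert str(roundtripped['A'].dtype) == 'datetime64[ns, US/Eastern]'
# --------------------------------------------------------------------------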
+ df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130603',tz='US/Eastern')),index=range(5)) + store.append('df_tz',df) + result = store['df_tz'] + self._compare_with_tz(result,df) + assert_frame_equal(result,df) + + df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='EET')),index=range(5)) + self.assertRaises(ValueError, store.append, 'df_tz', df) + + # this is ok + _maybe_remove(store, 'df_tz') + store.append('df_tz',df,data_columns=['A','B']) + result = store['df_tz'] + self._compare_with_tz(result,df) + assert_frame_equal(result,df) + + # can't append with diff timezone + df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='CET')),index=range(5)) + self.assertRaises(ValueError, store.append, 'df_tz', df) + + # as index + with ensure_clean_store(self.path) as store: + + # GH 4098 example + df = DataFrame(dict(A = Series(lrange(3), index=date_range('2000-1-1',periods=3,freq='H', tz='US/Eastern')))) + + _maybe_remove(store, 'df') + store.put('df',df) + result = store.select('df') + assert_frame_equal(result,df) + + _maybe_remove(store, 'df') + store.append('df',df) + result = store.select('df') + assert_frame_equal(result,df) + + def test_tseries_select_index_column(self): + # GH7777 + # selecting a UTC datetimeindex column did + # not preserve UTC tzinfo set before storing + + # check that no tz still works + rng = date_range('1/1/2000', '1/30/2000') + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + with ensure_clean_store(self.path) as store: + store.append('frame', frame) + result = store.select_column('frame', 'index') + self.assertEqual(rng.tz, DatetimeIndex(result.values).tz) + + # check utc + rng = date_range('1/1/2000', '1/30/2000', tz='UTC') + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + with ensure_clean_store(self.path) as store: + store.append('frame', frame) + result = store.select_column('frame', 'index') + self.assertEqual(rng.tz, DatetimeIndex(result.values).tz) + + # double check non-utc + rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern') + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + with ensure_clean_store(self.path) as store: + store.append('frame', frame) + result = store.select_column('frame', 'index') + self.assertEqual(rng.tz, DatetimeIndex(result.values).tz) + + def test_timezones(self): + rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern') + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + with ensure_clean_store(self.path) as store: + store['frame'] = frame + recons = store['frame'] + self.assertTrue(recons.index.equals(rng)) + self.assertEqual(rng.tz, recons.index.tz) + + def test_fixed_offset_tz(self): + rng = date_range('1/1/2000 00:00:00-07:00', '1/30/2000 00:00:00-07:00') + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + + with ensure_clean_store(self.path) as store: + store['frame'] = frame + recons = store['frame'] + self.assertTrue(recons.index.equals(rng)) + self.assertEqual(rng.tz, recons.index.tz) + + def test_store_timezone(self): + # GH2852 + # issue storing datetime.date with a timezone as it resets when read back in a new timezone + + import platform + if platform.system() == "Windows": + raise nose.SkipTest("timezone setting not supported on windows") + + import datetime + import time + import os + + # original method + with ensure_clean_store(self.path) as store: + + today = datetime.date(2013,9,10) + df = DataFrame([1,2,3], index = [today, 
today, today]) + store['obj1'] = df + result = store['obj1'] + assert_frame_equal(result, df) + + # with tz setting + orig_tz = os.environ.get('TZ') + + def setTZ(tz): + if tz is None: + try: + del os.environ['TZ'] + except: + pass + else: + os.environ['TZ']=tz + time.tzset() + + try: + + with ensure_clean_store(self.path) as store: + + setTZ('EST5EDT') + today = datetime.date(2013,9,10) + df = DataFrame([1,2,3], index = [today, today, today]) + store['obj1'] = df + + setTZ('CST6CDT') + result = store['obj1'] + + assert_frame_equal(result, df) + + finally: + setTZ(orig_tz) + + def test_legacy_datetimetz_object(self): + # legacy from < 0.17.0 + # 8260 + expected = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), B=Timestamp('20130603', tz='CET')), index=range(5)) + with ensure_clean_store(tm.get_data_path('legacy_hdf/datetimetz_object.h5'), mode='r') as store: + result = store['df'] + assert_frame_equal(result, expected) + def _test_sort(obj): if isinstance(obj, DataFrame): return obj.reindex(sorted(obj.index)) diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index 619de8d6bad3b..d61c5f0740a91 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -1718,12 +1718,14 @@ def test_schema_support(self): tm.assert_frame_equal(res1, res2) def test_datetime_with_time_zone(self): + # Test to see if we read the date column with timezones that # the timezone information is converted to utc and into a # np.datetime64 (GH #7139) + df = sql.read_sql_table("types_test_data", self.conn) self.assertTrue(issubclass(df.DateColWithTz.dtype.type, np.datetime64), - "DateColWithTz loaded with incorrect type") + "DateColWithTz loaded with incorrect type -> {0}".format(df.DateColWithTz.dtype)) # "2000-01-01 00:00:00-08:00" should convert to "2000-01-01 08:00:00" self.assertEqual(df.DateColWithTz[0], Timestamp('2000-01-01 08:00:00')) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 07f0c89535a77..75b25c7a81458 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -1732,7 +1732,7 @@ cdef class BlockPlacement: self._as_array = arr self._has_array = True - def __unicode__(self): + def __str__(self): cdef slice s = self._ensure_has_slice() if s is not None: v = self._as_slice @@ -1741,6 +1741,8 @@ cdef class BlockPlacement: return '%s(%r)' % (self.__class__.__name__, v) + __repr__ = __str__ + def __len__(self): cdef slice s = self._ensure_has_slice() if s is not None: diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 83278fe12d641..f1799eb99f720 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -115,7 +115,7 @@ def __init__(self, data=None, index=None, columns=None, index=index, kind=self._default_kind, fill_value=self._default_fill_value) - mgr = df_to_manager(data, columns, index) + mgr = to_manager(data, columns, index) if dtype is not None: mgr = mgr.astype(dtype) @@ -181,7 +181,7 @@ def _init_dict(self, data, index, columns, dtype=None): if c not in sdict: sdict[c] = sp_maker(nan_vec) - return df_to_manager(sdict, columns, index) + return to_manager(sdict, columns, index) def _init_matrix(self, data, index, columns, dtype=None): data = _prep_ndarray(data, copy=False) @@ -233,7 +233,7 @@ def _unpickle_sparse_frame_compat(self, state): series_dict[col] = SparseSeries(sp_values, sparse_index=sp_index, fill_value=fv) - self._data = df_to_manager(series_dict, columns, index) + self._data = to_manager(series_dict, columns, index) self._default_fill_value = fv self._default_kind = kind @@ -737,7 +737,7 @@ def applymap(self, func): """ 
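# --- illustrative aside, not part of the patch ---------------------------
# The TZ-environment pattern used by test_store_timezone above, as a
# standalone helper (POSIX only: time.tzset does not exist on Windows,
# which is exactly why that test skips there):
import os
import time

def set_tz(tz):
    if tz is None:
        os.environ.pop('TZ', None)
    else:
        os.environ['TZ'] = tz
    time.tzset()
# --------------------------------------------------------------------------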
return self.apply(lambda x: lmap(func, x)) -def df_to_manager(sdf, columns, index): +def to_manager(sdf, columns, index): """ create and return the block manager from a dataframe of series, columns, index """ # from BlockManager perspective diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index a8addfab17c26..975b65b8bb4b7 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -219,15 +219,15 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block', @property def values(self): """ return the array """ - return self._data._values + return self.block.values def __array__(self, result=None): """ the array interface, return my values """ - return self._data._values + return self.block.values def get_values(self): """ same as values """ - return self._data._values.to_dense().view() + return self.block.to_dense().view() @property def block(self): diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 9ee5a753af567..43591639d8fe8 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -73,13 +73,11 @@ except AttributeError: cdef _try_infer_map(v): """ if its in our map, just return the dtype """ cdef: - object val_name, val_kind - val_name = v.dtype.name - if val_name in _TYPE_MAP: - return _TYPE_MAP[val_name] - val_kind = v.dtype.kind - if val_kind in _TYPE_MAP: - return _TYPE_MAP[val_kind] + object attr, val + for attr in ['name','kind','base']: + val = getattr(v.dtype,attr) + if val in _TYPE_MAP: + return _TYPE_MAP[val] return None def infer_dtype(object _values): diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index c9e4285d8b684..7701e39b91778 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -7,6 +7,7 @@ import pandas as pd from pandas.compat import u, StringIO from pandas.core.base import FrozenList, FrozenNDArray, PandasDelegate +import pandas.core.common as com from pandas.tseries.base import DatetimeIndexOpsMixin from pandas.util.testing import assertRaisesRegexp, assertIsInstance from pandas.tseries.common import is_datetimelike @@ -404,22 +405,35 @@ def test_value_counts_unique_nunique(self): # freq must be specified because repeat makes freq ambiguous # resets name from Index - expected_index = pd.Index(o[::-1], name=None) + expected_index = pd.Index(o[::-1]) # attach name to klass - o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq, name='a') + o = o.repeat(range(1, len(o) + 1)) + o.name = 'a' + + elif isinstance(o, DatetimeIndex): + + # resets name from Index + expected_index = pd.Index(o[::-1]) + + # attach name to klass + o = o.repeat(range(1, len(o) + 1)) + o.name = 'a' + # don't test boolean elif isinstance(o,Index) and o.is_boolean(): continue elif isinstance(o, Index): - expected_index = pd.Index(values[::-1], name=None) - o = klass(np.repeat(values, range(1, len(o) + 1)), name='a') + expected_index = pd.Index(values[::-1]) + o = o.repeat(range(1, len(o) + 1)) + o.name = 'a' else: - expected_index = pd.Index(values[::-1], name=None) - idx = np.repeat(o.index.values, range(1, len(o) + 1)) + expected_index = pd.Index(values[::-1]) + idx = o.index.repeat(range(1, len(o) + 1)) o = klass(np.repeat(values, range(1, len(o) + 1)), index=idx, name='a') expected_s = Series(range(10, 0, -1), index=expected_index, dtype='int64', name='a') + result = o.value_counts() tm.assert_series_equal(result, expected_s) self.assertTrue(result.index.name is None) @@ -450,7 +464,16 @@ def test_value_counts_unique_nunique(self): continue # special assign to the numpy 
array - if o.values.dtype == 'datetime64[ns]' or isinstance(o, PeriodIndex): + if com.is_datetimetz(o): + if isinstance(o, DatetimeIndex): + v = o.asi8 + v[0:2] = pd.tslib.iNaT + values = o._shallow_copy(v) + else: + o = o.copy() + o[0:2] = pd.tslib.iNaT + values = o.values + elif o.values.dtype == 'datetime64[ns]' or isinstance(o, PeriodIndex): values[0:2] = pd.tslib.iNaT else: values[0:2] = null_obj @@ -563,17 +586,19 @@ def test_value_counts_inferred(self): self.assertEqual(s.nunique(), 0) # GH 3002, datetime64[ns] + # don't test names though txt = "\n".join(['xxyyzz20100101PIE', 'xxyyzz20100101GUM', 'xxyyzz20100101EGG', 'xxyyww20090101EGG', 'foofoo20080909PIE', 'foofoo20080909GUM']) f = StringIO(txt) df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"]) - s = klass(df['dt'].copy(), name='dt') + s = klass(df['dt'].copy()) + s.name = None idx = pd.to_datetime(['2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z', '2009-01-01 00:00:00X']) - expected_s = Series([3, 2, 1], index=idx, name='dt') + expected_s = Series([3, 2, 1], index=idx) tm.assert_series_equal(s.value_counts(), expected_s) expected = np.array(['2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z', @@ -588,7 +613,7 @@ def test_value_counts_inferred(self): # with NaT s = df['dt'].copy() - s = klass([v for v in s.values] + [pd.NaT], name='dt') + s = klass([v for v in s.values] + [pd.NaT]) result = s.value_counts() self.assertEqual(result.index.dtype, 'datetime64[ns]') @@ -600,6 +625,7 @@ def test_value_counts_inferred(self): unique = s.unique() self.assertEqual(unique.dtype, 'datetime64[ns]') + # numpy_array_equal cannot compare pd.NaT self.assert_numpy_array_equal(unique[:3], expected) self.assertTrue(unique[3] is pd.NaT or unique[3].astype('int64') == pd.tslib.iNaT) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 680b370cbca41..6cbe6e96ed892 100755 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1255,24 +1255,6 @@ def setUp(self): def test_dtypes(self): - dtype = com.CategoricalDtype() - hash(dtype) - self.assertTrue(com.is_categorical_dtype(dtype)) - - s = Series(self.factor,name='A') - - # dtypes - self.assertTrue(com.is_categorical_dtype(s.dtype)) - self.assertTrue(com.is_categorical_dtype(s)) - self.assertFalse(com.is_categorical_dtype(np.dtype('float64'))) - - # np.dtype doesn't know about our new dtype - def f(): - np.dtype(dtype) - self.assertRaises(TypeError, f) - - self.assertFalse(dtype == np.str_) - self.assertFalse(np.str_ == dtype) # GH8143 index = ['cat','obj','num'] @@ -1799,16 +1781,14 @@ def test_categorical_repr_datetime(self): idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5, tz='US/Eastern') c = pd.Categorical(idx) exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00] -Categories (5, datetime64[ns]): [2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, - 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, - 2011-01-01 13:00:00-05:00]""" +Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,\n 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n 2011-01-01 13:00:00-05:00]""" self.assertEqual(repr(c), exp) c = pd.Categorical(idx.append(idx), categories=idx) exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 
2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00] -Categories (5, datetime64[ns]): [2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, - 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, - 2011-01-01 13:00:00-05:00]""" +Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, + 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, + 2011-01-01 13:00:00-05:00]""" self.assertEqual(repr(c), exp) def test_categorical_repr_datetime_ordered(self): @@ -1828,16 +1808,16 @@ def test_categorical_repr_datetime_ordered(self): idx = pd.date_range('2011-01-01 09:00', freq='H', periods=5, tz='US/Eastern') c = pd.Categorical(idx, ordered=True) exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00] -Categories (5, datetime64[ns]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < - 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 < - 2011-01-01 13:00:00-05:00]""" +Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < + 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 < + 2011-01-01 13:00:00-05:00]""" self.assertEqual(repr(c), exp) c = pd.Categorical(idx.append(idx), categories=idx, ordered=True) exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00] -Categories (5, datetime64[ns]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < - 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 < - 2011-01-01 13:00:00-05:00]""" +Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < + 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 < + 2011-01-01 13:00:00-05:00]""" self.assertEqual(repr(c), exp) def test_categorical_repr_period(self): @@ -2017,9 +1997,9 @@ def test_categorical_series_repr_datetime(self): 3 2011-01-01 12:00:00-05:00 4 2011-01-01 13:00:00-05:00 dtype: category -Categories (5, datetime64[ns]): [2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, - 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, - 2011-01-01 13:00:00-05:00]""" +Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, + 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, + 2011-01-01 13:00:00-05:00]""" self.assertEqual(repr(s), exp) def test_categorical_series_repr_datetime_ordered(self): @@ -2043,9 +2023,9 @@ def test_categorical_series_repr_datetime_ordered(self): 3 2011-01-01 12:00:00-05:00 4 2011-01-01 13:00:00-05:00 dtype: category -Categories (5, datetime64[ns]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < - 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 < - 2011-01-01 13:00:00-05:00]""" +Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < + 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 < + 2011-01-01 13:00:00-05:00]""" self.assertEqual(repr(s), exp) def test_categorical_series_repr_period(self): diff --git a/pandas/tests/test_dtypes.py b/pandas/tests/test_dtypes.py new file mode 100644 index 0000000000000..54a49de582e56 --- /dev/null +++ b/pandas/tests/test_dtypes.py @@ -0,0 +1,142 @@ +# -*- coding: utf-8 -*- + +import nose +import numpy as np +from pandas import Series, Categorical, date_range 
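# --- illustrative aside, not part of the patch ---------------------------
# The new dtype objects exercised in this test module round-trip through
# their string form; a minimal sketch, assuming this patch (import paths
# as used in the tests below):
from pandas.core.common import DatetimeTZDtype, is_dtype_equal

dt = DatetimeTZDtype('ns', 'US/Eastern')
dt2 = DatetimeTZDtype.construct_from_string('datetime64[ns, US/Eastern]')
assert is_dtype_equal(dt, dt2)
# --------------------------------------------------------------------------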
+import pandas.core.common as com +from pandas.core.common import (CategoricalDtype, is_categorical_dtype, is_categorical, + DatetimeTZDtype, is_datetime64tz_dtype, is_datetimetz, + is_dtype_equal, is_datetime64_ns_dtype, is_datetime64_dtype) +import pandas.util.testing as tm + +_multiprocess_can_split_ = True + +class Base(object): + + def test_hash(self): + hash(self.dtype) + + def test_equality_invalid(self): + self.assertRaises(self.dtype == 'foo') + + def test_numpy_informed(self): + + # np.dtype doesn't know about our new dtype + def f(): + np.dtype(self.dtype) + self.assertRaises(TypeError, f) + + self.assertNotEqual(self.dtype, np.str_) + self.assertNotEqual(np.str_, self.dtype) + + def test_pickle(self): + result = self.round_trip_pickle(self.dtype) + self.assertEqual(result, self.dtype) + +class TestCategoricalDtype(Base, tm.TestCase): + + def setUp(self): + self.dtype = CategoricalDtype() + + def test_equality(self): + self.assertTrue(is_dtype_equal(self.dtype, 'category')) + self.assertTrue(is_dtype_equal(self.dtype, CategoricalDtype())) + self.assertFalse(is_dtype_equal(self.dtype, 'foo')) + + def test_construction_from_string(self): + result = CategoricalDtype.construct_from_string('category') + self.assertTrue(is_dtype_equal(self.dtype, result)) + self.assertRaises(TypeError, lambda : CategoricalDtype.construct_from_string('foo')) + + def test_is_dtype(self): + self.assertTrue(CategoricalDtype.is_dtype(self.dtype)) + self.assertTrue(CategoricalDtype.is_dtype('category')) + self.assertTrue(CategoricalDtype.is_dtype(CategoricalDtype())) + self.assertFalse(CategoricalDtype.is_dtype('foo')) + self.assertFalse(CategoricalDtype.is_dtype(np.float64)) + + def test_basic(self): + + self.assertTrue(is_categorical_dtype(self.dtype)) + + factor = Categorical.from_array(['a', 'b', 'b', 'a', + 'a', 'c', 'c', 'c']) + + s = Series(factor,name='A') + + # dtypes + self.assertTrue(is_categorical_dtype(s.dtype)) + self.assertTrue(is_categorical_dtype(s)) + self.assertFalse(is_categorical_dtype(np.dtype('float64'))) + + self.assertTrue(is_categorical(s.dtype)) + self.assertTrue(is_categorical(s)) + self.assertFalse(is_categorical(np.dtype('float64'))) + self.assertFalse(is_categorical(1.0)) + +class TestDatetimeTZDtype(Base, tm.TestCase): + + def setUp(self): + self.dtype = DatetimeTZDtype('ns','US/Eastern') + + def test_construction(self): + self.assertRaises(ValueError, lambda : DatetimeTZDtype('ms','US/Eastern')) + + def test_subclass(self): + a = DatetimeTZDtype('datetime64[ns, US/Eastern]') + b = DatetimeTZDtype('datetime64[ns, CET]') + + self.assertTrue(issubclass(type(a), type(a))) + self.assertTrue(issubclass(type(a), type(b))) + + def test_compat(self): + self.assertFalse(is_datetime64_ns_dtype(self.dtype)) + self.assertFalse(is_datetime64_ns_dtype('datetime64[ns, US/Eastern]')) + self.assertFalse(is_datetime64_dtype(self.dtype)) + self.assertFalse(is_datetime64_dtype('datetime64[ns, US/Eastern]')) + + def test_construction_from_string(self): + result = DatetimeTZDtype('datetime64[ns, US/Eastern]') + self.assertTrue(is_dtype_equal(self.dtype, result)) + result = DatetimeTZDtype.construct_from_string('datetime64[ns, US/Eastern]') + self.assertTrue(is_dtype_equal(self.dtype, result)) + self.assertRaises(TypeError, lambda : DatetimeTZDtype.construct_from_string('foo')) + + def test_is_dtype(self): + self.assertTrue(DatetimeTZDtype.is_dtype(self.dtype)) + self.assertTrue(DatetimeTZDtype.is_dtype('datetime64[ns, US/Eastern]')) + self.assertFalse(DatetimeTZDtype.is_dtype('foo')) + 
self.assertTrue(DatetimeTZDtype.is_dtype(DatetimeTZDtype('ns','US/Pacific'))) + self.assertFalse(DatetimeTZDtype.is_dtype(np.float64)) + + def test_equality(self): + self.assertTrue(is_dtype_equal(self.dtype, 'datetime64[ns, US/Eastern]')) + self.assertTrue(is_dtype_equal(self.dtype, DatetimeTZDtype('ns','US/Eastern'))) + self.assertFalse(is_dtype_equal(self.dtype, 'foo')) + self.assertFalse(is_dtype_equal(self.dtype, DatetimeTZDtype('ns','CET'))) + self.assertFalse(is_dtype_equal(DatetimeTZDtype('ns','US/Eastern'), DatetimeTZDtype('ns','US/Pacific'))) + + # numpy compat + self.assertTrue(is_dtype_equal(np.dtype("M8[ns]"),"datetime64[ns]")) + + def test_basic(self): + + self.assertTrue(is_datetime64tz_dtype(self.dtype)) + + dr = date_range('20130101',periods=3,tz='US/Eastern') + s = Series(dr,name='A') + + # dtypes + self.assertTrue(is_datetime64tz_dtype(s.dtype)) + self.assertTrue(is_datetime64tz_dtype(s)) + self.assertFalse(is_datetime64tz_dtype(np.dtype('float64'))) + self.assertFalse(is_datetime64tz_dtype(1.0)) + + self.assertTrue(is_datetimetz(s)) + self.assertTrue(is_datetimetz(s.dtype)) + self.assertFalse(is_datetimetz(np.dtype('float64'))) + self.assertFalse(is_datetimetz(1.0)) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 3d0259deef6f2..9db237606ce3f 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -2243,6 +2243,11 @@ def setUp(self): self.all_mixed = DataFrame({'a': 1., 'b': 2, 'c': 'foo', 'float32' : np.array([1.]*10,dtype='float32'), 'int32' : np.array([1]*10,dtype='int32'), }, index=np.arange(10)) + self.tzframe = DataFrame({'A' : date_range('20130101',periods=3), + 'B' : date_range('20130101',periods=3,tz='US/Eastern'), + 'C' : date_range('20130101',periods=3,tz='CET')}) + self.tzframe.iloc[1,1] = pd.NaT + self.tzframe.iloc[1,2] = pd.NaT self.ts1 = tm.makeTimeSeries() self.ts2 = tm.makeTimeSeries()[5:] @@ -4049,13 +4054,14 @@ def test_constructor_with_datetimes(self): import pytz tz = pytz.timezone('US/Eastern') dt = tz.localize(datetime(2012, 1, 1)) + df = DataFrame({'End Date': dt}, index=[0]) self.assertEqual(df.iat[0,0],dt) - assert_series_equal(df.dtypes,Series({'End Date' : np.dtype('object') })) + assert_series_equal(df.dtypes,Series({'End Date' : 'datetime64[ns, US/Eastern]' })) df = DataFrame([{'End Date': dt}]) self.assertEqual(df.iat[0,0],dt) - assert_series_equal(df.dtypes,Series({'End Date' : np.dtype('object') })) + assert_series_equal(df.dtypes,Series({'End Date' : 'datetime64[ns, US/Eastern]' })) # tz-aware (UTC and other tz's) # GH 8411 @@ -4087,6 +4093,140 @@ def test_constructor_with_datetimes(self): expected = DataFrame( {'a' : i.to_series(keep_tz=True).reset_index(drop=True), 'b': i_no_tz }) assert_frame_equal(df, expected) + def test_constructor_with_datetime_tz(self): + + # 8260 + # support datetime64 with tz + + idx = Index(date_range('20130101',periods=3,tz='US/Eastern'), + name='foo') + dr = date_range('20130110',periods=3) + + # construction + df = DataFrame({'A' : idx, 'B' : dr}) + self.assertTrue(df['A'].dtype,'M8[ns, US/Eastern') + self.assertTrue(df['A'].name == 'A') + assert_series_equal(df['A'],Series(idx,name='A')) + assert_series_equal(df['B'],Series(dr,name='B')) + + # construction from dict + df2 = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), B=Timestamp('20130603', tz='CET')), index=range(5)) + assert_series_equal(df2.dtypes, Series(['datetime64[ns, 
US/Eastern]', 'datetime64[ns, CET]'], index=['A','B'])) + + # concat + df3 = pd.concat([df2.A.to_frame(),df2.B.to_frame()],axis=1) + assert_frame_equal(df2, df3) + + # select_dtypes + result = df3.select_dtypes(include=['datetime64[ns]']) + expected = df3.reindex(columns=[]) + assert_frame_equal(result, expected) + + # this will select based on issubclass, and these are the same class + result = df3.select_dtypes(include=['datetime64[ns, CET]']) + expected = df3 + assert_frame_equal(result, expected) + + # from index + idx2 = date_range('20130101',periods=3,tz='US/Eastern',name='foo') + df2 = DataFrame(idx2) + assert_series_equal(df2['foo'],Series(idx2,name='foo')) + df2 = DataFrame(Series(idx2)) + assert_series_equal(df2['foo'],Series(idx2,name='foo')) + + idx2 = date_range('20130101',periods=3,tz='US/Eastern') + df2 = DataFrame(idx2) + assert_series_equal(df2[0],Series(idx2,name=0)) + df2 = DataFrame(Series(idx2)) + assert_series_equal(df2[0],Series(idx2,name=0)) + + # interleave + result = self.tzframe.values + expected = np.array([[Timestamp('2013-01-01 00:00:00'), + Timestamp('2013-01-02 00:00:00'), + Timestamp('2013-01-03 00:00:00')], + [Timestamp('2013-01-01 00:00:00-0500', tz='US/Eastern'), + pd.NaT, + Timestamp('2013-01-03 00:00:00-0500', tz='US/Eastern')], + [Timestamp('2013-01-01 00:00:00+0100', tz='CET'), + pd.NaT, + Timestamp('2013-01-03 00:00:00+0100', tz='CET')]], dtype=object).T + self.assert_numpy_array_equal(result, expected) + + # astype + result = self.tzframe.astype(object) + assert_frame_equal(result, DataFrame(expected, index=self.tzframe.index, columns=self.tzframe.columns)) + + # str formatting + result = self.tzframe.astype(str) + expected = np.array([['2013-01-01', '2013-01-01 00:00:00-05:00', + '2013-01-01 00:00:00+01:00'], + ['2013-01-02', 'NaT', 'NaT'], + ['2013-01-03', '2013-01-03 00:00:00-05:00', + '2013-01-03 00:00:00+01:00']], dtype=object) + self.assert_numpy_array_equal(result, expected) + + result = str(self.tzframe) + self.assertTrue('0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00' in result) + self.assertTrue('1 2013-01-02 NaT NaT' in result) + self.assertTrue('2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00' in result) + + # setitem + df['C'] = idx + assert_series_equal(df['C'],Series(idx,name='C')) + + df['D'] = 'foo' + df['D'] = idx + assert_series_equal(df['D'],Series(idx,name='D')) + del df['D'] + + # assert that A & C are not sharing the same base (e.g. 
they + # are copies) + b1 = df._data.blocks[1] + b2 = df._data.blocks[2] + self.assertTrue(b1.values.equals(b2.values)) + self.assertFalse(id(b1.values.values.base) == id(b2.values.values.base)) + + # with nan + df2 = df.copy() + df2.iloc[1,1] = pd.NaT + df2.iloc[1,2] = pd.NaT + result = df2['B'] + assert_series_equal(notnull(result), Series([True,False,True],name='B')) + assert_series_equal(df2.dtypes, df.dtypes) + + # set/reset + df = DataFrame({'A' : [0,1,2] }, index=idx) + result = df.reset_index() + self.assertEqual(result['foo'].dtype, 'M8[ns, US/Eastern]') + + result = result.set_index('foo') + tm.assert_index_equal(result.index, idx) + + # indexing + result = df2.iloc[1] + expected = Series([Timestamp('2013-01-02 00:00:00-0500', tz='US/Eastern'), np.nan, np.nan], + index=list('ABC'), dtype='object', name=1) + assert_series_equal(result, expected) + result = df2.loc[1] + expected = Series([Timestamp('2013-01-02 00:00:00-0500', tz='US/Eastern'), np.nan, np.nan], + index=list('ABC'), dtype='object', name=1) + assert_series_equal(result, expected) + + # indexing - fast_xs + df = DataFrame({'a': date_range('2014-01-01', periods=10, tz='UTC')}) + result = df.iloc[5] + expected = Timestamp('2014-01-06 00:00:00+0000', tz='UTC', offset='D') + self.assertEqual(result, expected) + + result = df.loc[5] + self.assertEqual(result, expected) + + # indexing - boolean + result = df[df.a > df.a[3]] + expected = df.iloc[4:] + assert_frame_equal(result, expected) + def test_constructor_for_list_with_dtypes(self): intname = np.dtype(np.int_).name floatname = np.dtype(np.float_).name @@ -4397,9 +4537,9 @@ def test_astype_str(self): result = df.astype(tt) expected = DataFrame({ - 'a' : list(map(tt, a.values)), - 'b' : list(map(tt, b.values)), - 'c' : list(map(tt, c.values)), + 'a' : list(map(tt, map(lambda x: Timestamp(x)._date_repr, a.values))), + 'b' : list(map(tt, map(Timestamp, b.values))), + 'c' : list(map(tt, map(lambda x: Timedelta(x)._repr_base(format='all'), c.values))), 'd' : list(map(tt, d.values)), 'e' : list(map(tt, e.values)), }) @@ -4425,6 +4565,10 @@ def test_pickle(self): unpickled = self.round_trip_pickle(self.empty) repr(unpickled) + # tz frame + unpickled = self.round_trip_pickle(self.tzframe) + assert_frame_equal(self.tzframe, unpickled) + def test_to_dict(self): test_data = { 'A': {'1': 1, '2': 2}, @@ -6293,6 +6437,17 @@ def test_to_csv_from_csv(self): xp.columns = lmap(int,xp.columns) assert_frame_equal(xp,rs) + # tz, 8260 + with ensure_clean(pname) as path: + + self.tzframe.to_csv(path) + result = pd.read_csv(path, index_col=0, parse_dates=['A']) + + converter = lambda c: pd.to_datetime(result[c]).dt.tz_localize('UTC').dt.tz_convert(self.tzframe[c].dt.tz) + result['B'] = converter('B') + result['C'] = converter('C') + assert_frame_equal(result, self.tzframe) + def test_to_csv_cols_reordering(self): # GH3454 import pandas as pd @@ -14806,8 +14961,10 @@ def test_dataframe_metadata(self): self.assertEqual(df[['X']].testattr, 'XXX') self.assertEqual(df.loc[['a', 'b'], :].testattr, 'XXX') self.assertEqual(df.iloc[[0, 1], :].testattr, 'XXX') + # GH9776 self.assertEqual(df.iloc[0:1, :].testattr, 'XXX') + # GH10553 unpickled = self.round_trip_pickle(df) assert_frame_equal(df, unpickled) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 9a3576a8fd846..9cfa95bc7ebec 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -567,7 +567,7 @@ def test_constructor_corner(self): # corner case self.assertRaises(TypeError, Index, 0) - def test_consruction_list_mixed_tuples(self): + def test_construction_list_mixed_tuples(self): # 10697 # if we are constructing from a mixed list of tuples, make sure that we # are independent of the sorting order @@ -2798,9 +2798,7 @@ def test_str(self): if hasattr(idx,'tz'): if idx.tz is not None: - self.assertTrue("tz='%s'" % idx.tz in str(idx)) - else: - self.assertTrue("tz=None" in str(idx)) + self.assertTrue(str(idx.tz) in str(idx)) if hasattr(idx,'freq'): self.assertTrue("freq='%s'" % idx.freqstr in str(idx)) @@ -2828,6 +2826,24 @@ def setUp(self): def create_index(self): return date_range('20130101', periods=5) + def test_construction_with_alt(self): + + i = pd.date_range('20130101',periods=5,freq='H',tz='US/Eastern') + i2 = DatetimeIndex(i, dtype=i.dtype) + self.assert_index_equal(i, i2) + + i2 = DatetimeIndex(i.tz_localize(None).asi8, tz=i.dtype.tz) + self.assert_index_equal(i, i2) + + i2 = DatetimeIndex(i.tz_localize(None).asi8, dtype=i.dtype) + self.assert_index_equal(i, i2) + + i2 = DatetimeIndex(i.tz_localize(None).asi8, dtype=i.dtype, tz=i.dtype.tz) + self.assert_index_equal(i, i2) + + # incompat tz/dtype + self.assertRaises(ValueError, lambda : DatetimeIndex(i.tz_localize(None).asi8, dtype=i.dtype, tz='US/Pacific')) + def test_pickle_compat_construction(self): pass diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index 7c51641b8e5da..0b3a1d8a221d6 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -6,7 +6,8 @@ import nose import numpy as np -from pandas import Index, MultiIndex, DataFrame, Series, Categorical +import re +from pandas import Index, MultiIndex, DataFrame, DatetimeIndex, Series, Categorical from pandas.compat import OrderedDict, lrange from pandas.sparse.array import SparseArray from pandas.core.internals import * @@ -44,7 +45,7 @@ def create_block(typestr, placement, item_shape=None, num_offset=0): * complex, c16, c8 * bool * object, string, O - * datetime, dt, M8[ns] + * datetime, dt, M8[ns], M8[ns, tz] * timedelta, td, m8[ns] * sparse (SparseArray with fill_value=0.0) * sparse_na (SparseArray with fill_value=np.nan) @@ -74,6 +75,13 @@ def create_block(typestr, placement, item_shape=None, num_offset=0): values = np.ones(shape, dtype=np.bool_) elif typestr in ('datetime', 'dt', 'M8[ns]'): values = (mat * 1e9).astype('M8[ns]') + elif typestr.startswith('M8[ns'): + # datetime with tz + m = re.search(r'M8\[ns,\s*(\w+/?\w*)\]', typestr) + assert m is not None, "incompatible typestr -> {0}".format(typestr) + tz = m.groups()[0] + assert num_items == 1, "must have only 1 item for a tz-aware block" + values = DatetimeIndex(np.arange(N) * 1e9, tz=tz) elif typestr in ('timedelta', 'td', 'm8[ns]'): values = (mat * 1).astype('m8[ns]') elif typestr in ('category',): @@ -478,7 +486,6 @@ def test_copy(self): def test_sparse(self): mgr = create_mgr('a: sparse-1; b: sparse-2') - # what to test here?
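# A standalone sketch of the 'M8[ns, tz]' typestr parsing that create_block
# above relies on; parse_tz_typestr is a hypothetical helper for illustration
# only, not part of the patch:
import re

def parse_tz_typestr(typestr):
    # pull the zone name out of a typestr such as 'M8[ns, US/Eastern]'
    m = re.search(r'M8\[ns,\s*(\w+/?\w*)\]', typestr)
    if m is None:
        raise ValueError("incompatible typestr -> {0}".format(typestr))
    return m.groups()[0]

parse_tz_typestr('M8[ns, US/Eastern]')  # -> 'US/Eastern'
parse_tz_typestr('M8[ns, CET]')         # -> 'CET'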
self.assertEqual(mgr.as_matrix().dtype, np.float64) @@ -510,6 +517,12 @@ def test_as_matrix_datetime(self): mgr = create_mgr('h: datetime-1; g: datetime-2') self.assertEqual(mgr.as_matrix().dtype, 'M8[ns]') + def test_as_matrix_datetime_tz(self): + mgr = create_mgr('h: M8[ns, US/Eastern]; g: M8[ns, CET]') + self.assertEqual(mgr.get('h').dtype, 'datetime64[ns, US/Eastern]') + self.assertEqual(mgr.get('g').dtype, 'datetime64[ns, CET]') + self.assertEqual(mgr.as_matrix().dtype, 'object') + def test_astype(self): # coerce all mgr = create_mgr('c: f4; d: f2; e: f8') diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index be7ed6c1b268f..45ad17089fdeb 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2191,6 +2191,21 @@ def test_datetimeindex(self): self.assertIsInstance(index.levels[0],pd.DatetimeIndex) self.assertIsInstance(index.levels[1],pd.DatetimeIndex) + def test_constructor_with_tz(self): + + index = pd.DatetimeIndex(['2013/01/01 09:00', '2013/01/02 09:00'], + name='dt1', tz='US/Pacific') + columns = pd.DatetimeIndex(['2014/01/01 09:00', '2014/01/02 09:00'], + name='dt2', tz='Asia/Tokyo') + + result = MultiIndex.from_arrays([index, columns]) + tm.assert_index_equal(result.levels[0], index) + tm.assert_index_equal(result.levels[1], columns) + + result = MultiIndex.from_arrays([Series(index), Series(columns)]) + tm.assert_index_equal(result.levels[0], index) + tm.assert_index_equal(result.levels[1], columns) + def test_set_index_datetime(self): # GH 3950 df = pd.DataFrame({'label':['a', 'a', 'a', 'b', 'b', 'b'], diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 36a8600e51725..e6c999e697022 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -1,6 +1,7 @@ # coding=utf-8 # pylint: disable-msg=E1101,W0612 +import re import sys from datetime import datetime, timedelta import operator @@ -18,7 +19,7 @@ import pandas as pd from pandas import (Index, Series, DataFrame, isnull, notnull, bdate_range, - date_range, period_range, timedelta_range) + date_range, period_range, timedelta_range, _np_version_under1p8) from pandas.core.index import MultiIndex from pandas.core.indexing import IndexingError from pandas.tseries.period import PeriodIndex @@ -137,6 +138,30 @@ def compare(s, name): expected = Series(DatetimeIndex(s.values).tz_localize('UTC').tz_convert('US/Eastern'),index=s.index) tm.assert_series_equal(result, expected) + # datetimeindex with tz + s = Series(date_range('20130101',periods=5,tz='US/Eastern')) + for prop in ok_for_dt: + + # we test freq below + if prop != 'freq': + compare(s, prop) + + for prop in ok_for_dt_methods: + getattr(s.dt,prop) + + result = s.dt.to_pydatetime() + self.assertIsInstance(result,np.ndarray) + self.assertTrue(result.dtype == object) + + result = s.dt.tz_convert('CET') + expected = Series(s.values.tz_convert('CET'),index=s.index) + tm.assert_series_equal(result, expected) + + tz_result = result.dt.tz + self.assertEqual(str(tz_result), 'CET') + freq_result = s.dt.freq + self.assertEqual(freq_result, DatetimeIndex(s.values, freq='infer').freq) + # timedeltaindex for s in [Series(timedelta_range('1 day',periods=5),index=list('abcde')), Series(timedelta_range('1 day 01:23:45',periods=5,freq='s')), @@ -980,6 +1005,80 @@ def test_constructor_dtype_datetime64(self): dr = date_range('20130101',periods=3,tz='US/Eastern') self.assertTrue(str(Series(dr).iloc[0].tz) == 'US/Eastern') + # non-convertible + s = Series([1479596223000, -1479590, pd.NaT]) + 
self.assertTrue(s.dtype == 'object') + self.assertTrue(s[2] is pd.NaT) + self.assertTrue('NaT' in str(s)) + + # if we passed a NaT it remains + s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), pd.NaT]) + self.assertTrue(s.dtype == 'object') + self.assertTrue(s[2] is pd.NaT) + self.assertTrue('NaT' in str(s)) + + # if we passed a nan it remains + s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan]) + self.assertTrue(s.dtype == 'object') + self.assertTrue(s[2] is np.nan) + self.assertTrue('NaN' in str(s)) + + def test_constructor_with_datetime_tz(self): + + # 8260 + # support datetime64 with tz + + dr = date_range('20130101',periods=3,tz='US/Eastern') + s = Series(dr) + self.assertTrue(s.dtype.name == 'datetime64[ns, US/Eastern]') + self.assertTrue(s.dtype == 'datetime64[ns, US/Eastern]') + self.assertTrue(com.is_datetime64tz_dtype(s.dtype)) + + # indexing + result = s.iloc[0] + self.assertEqual(result,Timestamp('2013-01-01 00:00:00-0500', tz='US/Eastern', offset='D')) + result = s[0] + self.assertEqual(result,Timestamp('2013-01-01 00:00:00-0500', tz='US/Eastern', offset='D')) + + result = s[Series([True,True,False],index=s.index)] + assert_series_equal(result,s[0:2]) + + result = s.iloc[0:1] + assert_series_equal(result,Series(dr[0:1])) + + # concat + result = pd.concat([s.iloc[0:1],s.iloc[1:]]) + assert_series_equal(result,s) + + # astype + result = s.astype(object) + expected = Series(s.values.asobject) + assert_series_equal(result, expected) + + # short str + self.assertTrue('datetime64[ns, US/Eastern]' in str(s)) + + # formatting with NaT + result = s.shift() + self.assertTrue('datetime64[ns, US/Eastern]' in str(result)) + self.assertTrue('NaT' in str(result)) + + # long str + t = Series(date_range('20130101',periods=1000,tz='US/Eastern')) + self.assertTrue('datetime64[ns, US/Eastern]' in str(t)) + + result = pd.DatetimeIndex(s,freq='infer') + tm.assert_index_equal(result, dr) + + # inference + s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')]) + self.assertTrue(s.dtype == 'datetime64[ns, US/Pacific]') + self.assertTrue(lib.infer_dtype(s) == 'datetime64') + + s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Eastern')]) + self.assertTrue(s.dtype == 'object') + self.assertTrue(lib.infer_dtype(s) == 'datetime') + def test_constructor_periodindex(self): # GH7932 # converting a PeriodIndex when put in a Series @@ -3497,16 +3596,17 @@ def test_timedelta_assignment(self): def test_operators_datetimelike(self): def run_ops(ops, get_ser, test_ser): - for op in ops: - try: - op = getattr(get_ser, op, None) - if op is not None: - self.assertRaises(TypeError, op, test_ser) - except: - com.pprint_thing("Failed on op %r" % op) - raise + + # check that we are getting a TypeError + # with 'operate' (from core/ops.py) for the ops that are not defined + for op_str in ops: + op = getattr(get_ser, op_str, None) + with tm.assertRaisesRegexp(TypeError, 'operate'): + op(test_ser) + ### timedelta64 ### td1 = Series([timedelta(minutes=5,seconds=3)]*3) + td1.iloc[2] = np.nan td2 = timedelta(minutes=5,seconds=4) ops = ['__mul__','__floordiv__','__pow__', '__rmul__','__rfloordiv__','__rpow__'] @@ -3521,6 +3621,7 @@ def run_ops(ops, get_ser, test_ser): ### datetime64 ### dt1 = Series([Timestamp('20111230'), Timestamp('20120101'), Timestamp('20120103')]) + dt1.iloc[2] = np.nan dt2 = Series([Timestamp('20111231'), Timestamp('20120102'), 
Timestamp('20120104')]) ops = ['__add__', '__mul__', '__floordiv__', '__truediv__', '__div__', @@ -3549,6 +3650,66 @@ def run_ops(ops, get_ser, test_ser): td1 + dt1 dt1 + td1 + # 8260, 10763 + # datetime64 with tz + ops = ['__mul__', '__floordiv__', '__truediv__', '__div__', '__pow__', + '__rmul__', '__rfloordiv__', '__rtruediv__', '__rdiv__', + '__rpow__'] + dt1 = Series(date_range('2000-01-01 09:00:00',periods=5,tz='US/Eastern'),name='foo') + dt2 = dt1.copy() + dt2.iloc[2] = np.nan + td1 = Series(timedelta_range('1 days 1 min',periods=5, freq='H')) + td2 = td1.copy() + td2.iloc[1] = np.nan + run_ops(ops, dt1, td1) + + result = dt1 + td1[0] + expected = (dt1.dt.tz_localize(None) + td1[0]).dt.tz_localize('US/Eastern') + assert_series_equal(result, expected) + + result = dt2 + td2[0] + expected = (dt2.dt.tz_localize(None) + td2[0]).dt.tz_localize('US/Eastern') + assert_series_equal(result, expected) + + # odd numpy behavior with scalar timedeltas + if not _np_version_under1p8: + result = td1[0] + dt1 + expected = (dt1.dt.tz_localize(None) + td1[0]).dt.tz_localize('US/Eastern') + assert_series_equal(result, expected) + + result = td2[0] + dt2 + expected = (dt2.dt.tz_localize(None) + td2[0]).dt.tz_localize('US/Eastern') + assert_series_equal(result, expected) + + result = dt1 - td1[0] + expected = (dt1.dt.tz_localize(None) - td1[0]).dt.tz_localize('US/Eastern') + assert_series_equal(result, expected) + self.assertRaises(TypeError, lambda: td1[0] - dt1) + + result = dt2 - td2[0] + expected = (dt2.dt.tz_localize(None) - td2[0]).dt.tz_localize('US/Eastern') + assert_series_equal(result, expected) + self.assertRaises(TypeError, lambda: td2[0] - dt2) + + result = dt1 + td1 + expected = (dt1.dt.tz_localize(None) + td1).dt.tz_localize('US/Eastern') + assert_series_equal(result, expected) + + result = dt2 + td2 + expected = (dt2.dt.tz_localize(None) + td2).dt.tz_localize('US/Eastern') + assert_series_equal(result, expected) + + result = dt1 - td1 + expected = (dt1.dt.tz_localize(None) - td1).dt.tz_localize('US/Eastern') + assert_series_equal(result, expected) + + result = dt2 - td2 + expected = (dt2.dt.tz_localize(None) - td2).dt.tz_localize('US/Eastern') + assert_series_equal(result, expected) + + self.assertRaises(TypeError, lambda: td1 - dt1) + self.assertRaises(TypeError, lambda: td2 - dt2) + def test_ops_datetimelike_align(self): # GH 7500 # datetimelike ops need to align @@ -4820,6 +4981,7 @@ def test_drop_duplicates(self): sc = s.copy() sc.drop_duplicates(keep='last', inplace=True) assert_series_equal(sc, s[~expected]) + # deprecate take_last with tm.assert_produces_warning(FutureWarning): assert_series_equal(s.duplicated(take_last=True), expected) @@ -4852,6 +5014,7 @@ def test_drop_duplicates(self): sc = s.copy() sc.drop_duplicates(keep='last', inplace=True) assert_series_equal(sc, s[~expected]) + # deprecate take_last with tm.assert_produces_warning(FutureWarning): assert_series_equal(s.duplicated(take_last=True), expected) @@ -5345,6 +5508,16 @@ def test_shift(self): expected = Series([np.nan,0,1,2,3],index=index) assert_series_equal(result,expected) + # xref 8260 + # with tz + s = Series(date_range('2000-01-01 09:00:00',periods=5,tz='US/Eastern'),name='foo') + result = s-s.shift() + assert_series_equal(result,Series(TimedeltaIndex(['NaT'] + ['1 days']*4),name='foo')) + + # incompat tz + s2 = Series(date_range('2000-01-01 09:00:00',periods=5,tz='CET'),name='foo') + self.assertRaises(ValueError, lambda : s-s2) + def test_tshift(self): # PeriodIndex ps = tm.makePeriodSeries() @@ -5843,17 
+6016,17 @@ def test_astype_str(self): for tt in set([str, compat.text_type]): ts = Series([Timestamp('2010-01-04 00:00:00')]) s = ts.astype(tt) - expected = Series([tt(ts.values[0])]) + expected = Series([tt('2010-01-04')]) assert_series_equal(s, expected) ts = Series([Timestamp('2010-01-04 00:00:00', tz='US/Eastern')]) s = ts.astype(tt) - expected = Series([tt(ts.values[0])]) + expected = Series([tt('2010-01-04 00:00:00-05:00')]) assert_series_equal(s, expected) td = Series([Timedelta(1, unit='d')]) s = td.astype(tt) - expected = Series([tt(td.values[0])]) + expected = Series([tt('1 days 00:00:00.000000000')]) assert_series_equal(s, expected) def test_astype_unicode(self): @@ -6922,9 +7095,9 @@ def check_replace(to_rep, val, expected): # test an object with dates + floats + integers + strings dr = date_range('1/1/2001', '1/10/2001', freq='D').to_series().reset_index(drop=True) - r = dr.astype(object).replace([dr[0],dr[1],dr[2]], [1.0,2,'a']) - assert_series_equal(r, Series([1.0,2,'a'] + - dr[3:].tolist(),dtype=object)) + result = dr.astype(object).replace([dr[0],dr[1],dr[2]], [1.0,2,'a']) + expected = Series([1.0,2,'a'] + dr[3:].tolist(),dtype=object) + assert_series_equal(result, expected) def test_replace_bool_with_string_no_op(self): s = Series([True, False, True]) @@ -7003,6 +7176,11 @@ def test_diff(self): nxp = xp.diff() assert_series_equal(nrs, nxp) + # with tz + s = Series(date_range('2000-01-01 09:00:00',periods=5,tz='US/Eastern'), name='foo') + result = s.diff() + assert_series_equal(result,Series(TimedeltaIndex(['NaT'] + ['1 days']*4),name='foo')) + def test_pct_change(self): rs = self.ts.pct_change(fill_method=None) assert_series_equal(rs, self.ts / self.ts.shift(1) - 1) diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index 34789a3c52cb7..50ae574c03067 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -579,21 +579,21 @@ def test_pivot_dtaccessor(self): exp_idx = Index(['a', 'b'], name='label') expected = DataFrame({7: [0, 3], 8: [1, 4], 9:[2, 5]}, - index=exp_idx, columns=[7, 8, 9]) + index=exp_idx, columns=Index([7, 8, 9],name='dt1')) tm.assert_frame_equal(result, expected) result = pivot_table(df, index=df['dt2'].dt.month, columns=df['dt1'].dt.hour, values='value1') expected = DataFrame({7: [0, 3], 8: [1, 4], 9:[2, 5]}, - index=[1, 2], columns=[7, 8, 9]) + index=Index([1, 2],name='dt2'), columns=Index([7, 8, 9],name='dt1')) tm.assert_frame_equal(result, expected) - result = pivot_table(df, index=df['dt2'].dt.year, + result = pivot_table(df, index=df['dt2'].dt.year.values, columns=[df['dt1'].dt.hour, df['dt2'].dt.month], values='value1') - exp_col = MultiIndex.from_arrays([[7, 7, 8, 8, 9, 9], [1, 2] * 3]) + exp_col = MultiIndex.from_arrays([[7, 7, 8, 8, 9, 9], [1, 2] * 3],names=['dt1','dt2']) expected = DataFrame(np.array([[0, 3, 1, 4, 2, 5]],dtype='int64'), index=[2013], columns=exp_col) tm.assert_frame_equal(result, expected) diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 727852ced25b0..d0e4ba8012cff 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -178,7 +178,7 @@ def order(self, return_indexer=False, ascending=True): return self._simple_new(sorted_values, **attribs) - def take(self, indices, axis=0): + def take(self, indices, axis=0, **kwargs): """ Analogous to ndarray.take """ @@ -335,11 +335,6 @@ def _format_attrs(self): if freq is not None: freq = "'%s'" % freq attrs.append(('freq',freq)) - elif attrib == 'tz': - tz = self.tz - if tz is not None: - tz = "'%s'" 
% tz - attrs.append(('tz',tz)) return attrs @cache_readonly @@ -444,9 +439,9 @@ def _add_delta_td(self, other): inc = tslib._delta_to_nanoseconds(other) mask = self.asi8 == tslib.iNaT - new_values = (self.asi8 + inc).view(self.dtype) + new_values = (self.asi8 + inc).view('i8') new_values[mask] = tslib.iNaT - return new_values.view(self.dtype) + return new_values.view('i8') def _add_delta_tdi(self, other): # add a delta of a TimedeltaIndex @@ -540,8 +535,7 @@ def repeat(self, repeats, axis=None): """ Analogous to ndarray.repeat """ - return self._simple_new(self.values.repeat(repeats), - name=self.name) + return self._shallow_copy(self.values.repeat(repeats), freq=None) def summary(self, name=None): """ diff --git a/pandas/tseries/common.py b/pandas/tseries/common.py index a4d5939d386ae..eb56ca5f33ab3 100644 --- a/pandas/tseries/common.py +++ b/pandas/tseries/common.py @@ -9,6 +9,8 @@ from pandas import tslib from pandas.core.common import (_NS_DTYPE, _TD_DTYPE, is_period_arraylike, is_datetime_arraylike, is_integer_dtype, is_list_like, + is_datetime64_dtype, is_datetime64tz_dtype, + is_timedelta64_dtype, get_dtype_kinds) def is_datetimelike(data): @@ -43,23 +45,24 @@ def maybe_to_datetimelike(data, copy=False): raise TypeError("cannot convert an object of type {0} to a datetimelike index".format(type(data))) index = data.index - if issubclass(data.dtype.type, np.datetime64): - return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'), index) - elif issubclass(data.dtype.type, np.timedelta64): - return TimedeltaProperties(TimedeltaIndex(data, copy=copy, freq='infer'), index) + if is_datetime64_dtype(data.dtype) or is_datetime64tz_dtype(data.dtype): + return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'), index, name=data.name) + elif is_timedelta64_dtype(data.dtype): + return TimedeltaProperties(TimedeltaIndex(data, copy=copy, freq='infer'), index, name=data.name) else: if is_period_arraylike(data): - return PeriodProperties(PeriodIndex(data, copy=copy), index) + return PeriodProperties(PeriodIndex(data, copy=copy), index, name=data.name) if is_datetime_arraylike(data): - return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'), index) + return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'), index, name=data.name) raise TypeError("cannot convert an object of type {0} to a datetimelike index".format(type(data))) class Properties(PandasDelegate): - def __init__(self, values, index): + def __init__(self, values, index, name): self.values = values self.index = index + self.name = name def _delegate_property_get(self, name): from pandas import Series @@ -74,7 +77,7 @@ def _delegate_property_get(self, name): return result # return the result as a Series, which is by definition a copy - result = Series(result, index=self.index) + result = Series(result, index=self.index, name=self.name) # setting this object will show a SettingWithCopyWarning/Error result.is_copy = ("modifications to a property of a datetimelike object are not " @@ -95,7 +98,7 @@ def _delegate_method(self, name, *args, **kwargs): if not com.is_list_like(result): return result - result = Series(result, index=self.index) + result = Series(result, index=self.index, name=self.name) # setting this object will show a SettingWithCopyWarning/Error result.is_copy = ("modifications to a method of a datetimelike object are not " @@ -196,7 +199,7 @@ class CombinedDatetimelikeProperties(DatetimeProperties, TimedeltaProperties): def _concat_compat(to_concat, axis=0): """ provide 
concatenation of a datetimelike array of arrays each of which is a single - M8[ns], or m8[ns] dtype + M8[ns], datetime64[ns, tz] or m8[ns] dtype Parameters ---------- @@ -211,6 +214,10 @@ def _concat_compat(to_concat, axis=0): def convert_to_pydatetime(x, axis): # coerce to an object dtype if x.dtype == _NS_DTYPE: + + if hasattr(x, 'tz'): + x = x.asobject + shape = x.shape x = tslib.ints_to_pydatetime(x.view(np.int64).ravel()) x = x.reshape(shape) @@ -218,10 +225,19 @@ def convert_to_pydatetime(x, axis): shape = x.shape x = tslib.ints_to_pytimedelta(x.view(np.int64).ravel()) x = x.reshape(shape) + return x typs = get_dtype_kinds(to_concat) + # datetimetz + if 'datetimetz' in typs: + + # we require ALL of the same tz for datetimetz + tzs = set([ getattr(x,'tz',None) for x in to_concat ])-set([None]) + if len(tzs) == 1: + return DatetimeIndex(np.concatenate([ x.tz_localize(None).asi8 for x in to_concat ]), tz=list(tzs)[0]) + # single dtype if len(typs) == 1: diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 85de5e083d6d9..628f396480190 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -14,6 +14,7 @@ import pandas.tslib as tslib import pandas._period as period from pandas.tslib import Timedelta +from pytz import AmbiguousTimeError class FreqGroup(object): FR_ANN = 1000 @@ -810,7 +811,11 @@ def infer_freq(index, warn=True): raise TypeError("cannot infer freq from a non-convertible index type {0}".format(type(index))) index = index.values - index = pd.DatetimeIndex(index) + try: + index = pd.DatetimeIndex(index) + except AmbiguousTimeError: + index = pd.DatetimeIndex(index.asi8) + inferer = _FrequencyInferer(index, warn=warn) return inferer.get_freq() diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 19ff9a4b19a3e..aad15c7a30c6b 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -6,8 +6,11 @@ import numpy as np from pandas.core.common import (_NS_DTYPE, _INT64_DTYPE, _values_from_object, _maybe_box, + is_object_dtype, is_datetime64_dtype, + is_datetimetz, is_dtype_equal, ABCSeries, is_integer, is_float, - is_object_dtype, is_datetime64_dtype) + DatetimeTZDtype) + from pandas.io.common import PerformanceWarning from pandas.core.index import Index, Int64Index, Float64Index import pandas.compat as compat @@ -113,11 +116,12 @@ def _new_DatetimeIndex(cls, d): """ This is called upon unpickling, rather than the default which doesn't have arguments and breaks __new__ """ - # simply set the tz # data are already in UTC + # so need to localize tz = d.pop('tz',None) result = cls.__new__(cls, **d) - result.tz = tz + if tz is not None: + result = result.tz_localize('UTC').tz_convert(tz) return result class DatetimeIndex(DatelikeOps, DatetimeIndexOpsMixin, Int64Index): @@ -198,7 +202,7 @@ def __new__(cls, data=None, freq=None, start=None, end=None, periods=None, copy=False, name=None, tz=None, verify_integrity=True, normalize=False, - closed=None, ambiguous='raise', **kwargs): + closed=None, ambiguous='raise', dtype=None, **kwargs): dayfirst = kwargs.pop('dayfirst', None) yearfirst = kwargs.pop('yearfirst', None) @@ -263,6 +267,13 @@ def __new__(cls, data=None, dayfirst=dayfirst, yearfirst=yearfirst) + if is_datetimetz(data): + # extract the data whether a Series or Index + if isinstance(data, ABCSeries): + data = data.values + tz = data.tz + data = data.tz_localize(None, ambiguous='infer').values + if issubclass(data.dtype.type, np.datetime64): if isinstance(data, ABCSeries): data = data.values @@ -311,7 +322,8 @@ def __new__(cls, data=None, # tz aware subarr = tools._to_datetime(data, box=False, utc=True) - if not np.issubdtype(subarr.dtype, np.datetime64): + # we may not have been able to convert + if not (is_datetimetz(subarr) or np.issubdtype(subarr.dtype, np.datetime64)): raise ValueError('Unable to convert %s to datetime dtype' % str(data)) @@ -333,6 +345,16 @@ def __new__(cls, data=None, subarr = cls._simple_new(subarr, name=name, freq=freq, tz=tz) + # if dtype is provided, coerce here + if dtype is not None: + + if not is_dtype_equal(subarr.dtype, dtype): + + if subarr.tz is not None: + raise ValueError("cannot localize from non-UTC data") + dtype = DatetimeTZDtype.construct_from_string(dtype) + subarr = subarr.tz_localize(dtype.tz) + if verify_integrity and len(subarr) > 0: if freq is not None and not freq_infer: inferred = subarr.inferred_freq @@ -497,16 +519,21 @@ def _local_timestamps(self): return result.take(reverse) @classmethod - def _simple_new(cls, values, name=None, freq=None, tz=None, **kwargs): + def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None, **kwargs): """ we require that we have a dtype compat for the values if we are passed a non-dtype compat, then coerce using the constructor """ if not getattr(values,'dtype',None): + # empty, but with dtype compat + if values is None: + values = np.empty(0, dtype=_NS_DTYPE) + return cls(values, name=name, freq=freq, tz=tz, dtype=dtype, **kwargs) values = np.array(values,copy=False) + if is_object_dtype(values): - return cls(values, name=name, freq=freq, tz=tz, **kwargs).values + return cls(values, name=name, freq=freq, tz=tz, dtype=dtype, **kwargs).values elif not is_datetime64_dtype(values): values = com._ensure_int64(values).view(_NS_DTYPE) @@ -689,7 +716,15 @@ def _add_delta(self, delta): def _add_offset(self, offset): try: - return offset.apply_index(self) + if self.tz is not None: + values = self.tz_localize(None) + else: + values = self + result = offset.apply_index(values) + if self.tz is not None: + result = result.tz_localize(self.tz) + return result + except NotImplementedError: warnings.warn("Non-vectorized DateOffset being applied to Series or DatetimeIndex", PerformanceWarning) @@ -739,7 +774,8 @@ def to_series(self, keep_tz=False): If the timezone is not set, the resulting Series will have a datetime64[ns] dtype. - Otherwise the Series will have an object dtype; the + + Otherwise the Series will have a datetime64[ns, tz] dtype; the tz will be preserved.
If keep_tz is False: @@ -761,8 +797,11 @@ def _to_embed(self, keep_tz=False): This is for internal compat """ if keep_tz and self.tz is not None: - return self.asobject.values - return self.values + + # preserve the tz & copy + return self.copy(deep=True) + + return self.values.copy() def to_pydatetime(self): """ @@ -1471,9 +1510,11 @@ def inferred_type(self): # sure we can't have ambiguous indexing return 'datetime64' - @property + @cache_readonly def dtype(self): - return _NS_DTYPE + if self.tz is None: + return _NS_DTYPE + return com.DatetimeTZDtype('ns',self.tz) @property def is_all_dates(self): diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 3d9e80f351c44..e4ef9accc7d36 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -111,32 +111,32 @@ def test_minmax(self): self.assertTrue(pd.isnull(getattr(obj, op)())) def test_representation(self): - idx1 = DatetimeIndex([], freq='D') - idx2 = DatetimeIndex(['2011-01-01'], freq='D') - idx3 = DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D') - idx4 = DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], freq='D') - idx5 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], - freq='H', tz='Asia/Tokyo') - idx6 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], - tz='US/Eastern') - - exp1 = """DatetimeIndex([], dtype='datetime64[ns]', freq='D', tz=None)""" - - exp2 = """DatetimeIndex(['2011-01-01'], dtype='datetime64[ns]', freq='D', tz=None)""" - - exp3 = """DatetimeIndex(['2011-01-01', '2011-01-02'], dtype='datetime64[ns]', freq='D', tz=None)""" - exp4 = """DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], dtype='datetime64[ns]', freq='D', tz=None)""" - - exp5 = """DatetimeIndex(['2011-01-01 09:00:00+09:00', '2011-01-01 10:00:00+09:00', '2011-01-01 11:00:00+09:00'], dtype='datetime64[ns]', freq='H', tz='Asia/Tokyo')""" - - exp6 = """DatetimeIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', 'NaT'], dtype='datetime64[ns]', freq=None, tz='US/Eastern')""" + idx = [] + idx.append(DatetimeIndex([], freq='D')) + idx.append(DatetimeIndex(['2011-01-01'], freq='D')) + idx.append(DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D')) + idx.append(DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], freq='D')) + idx.append(DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], + freq='H', tz='Asia/Tokyo')) + idx.append(DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], + tz='US/Eastern')) + idx.append(DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], + tz='UTC')) + + exp = [] + exp.append("""DatetimeIndex([], dtype='datetime64[ns]', freq='D')""") + exp.append("""DatetimeIndex(['2011-01-01'], dtype='datetime64[ns]', freq='D')""") + exp.append("""DatetimeIndex(['2011-01-01', '2011-01-02'], dtype='datetime64[ns]', freq='D')""") + exp.append("""DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], dtype='datetime64[ns]', freq='D')""") + exp.append("""DatetimeIndex(['2011-01-01 09:00:00+09:00', '2011-01-01 10:00:00+09:00', '2011-01-01 11:00:00+09:00'], dtype='datetime64[ns, Asia/Tokyo]', freq='H')""") + exp.append("""DatetimeIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', 'NaT'], dtype='datetime64[ns, US/Eastern]', freq=None)""") + exp.append("""DatetimeIndex(['2011-01-01 09:00:00+00:00', '2011-01-01 10:00:00+00:00', 'NaT'], dtype='datetime64[ns, UTC]', freq=None)""") with pd.option_context('display.width', 300): - for idx, expected in 
zip([idx1, idx2, idx3, idx4, idx5, idx6], - [exp1, exp2, exp3, exp4, exp5, exp6]): + for indx, expected in zip(idx, exp): for func in ['__repr__', '__unicode__', '__str__']: - result = getattr(idx, func)() + result = getattr(indx, func)() self.assertEqual(result, expected) def test_summary(self): diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 0bb385b756eb8..707672a8ecec3 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -1101,6 +1101,60 @@ def test_to_datetime_array_of_dt64s(self): ) ) + def test_to_datetime_tz(self): + + # xref 8260 + # uniform returns a DatetimeIndex + arr = [pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')] + result = pd.to_datetime(arr) + expected = DatetimeIndex(['2013-01-01 13:00:00','2013-01-02 14:00:00'],tz='US/Pacific') + tm.assert_index_equal(result, expected) + + # mixed tzs will raise + arr = [pd.Timestamp('2013-01-01 13:00:00', tz='US/Pacific'),pd.Timestamp('2013-01-02 14:00:00', tz='US/Eastern')] + self.assertRaises(ValueError, lambda : pd.to_datetime(arr)) + + def test_to_datetime_tz_pytz(self): + + # xref 8260 + tm._skip_if_no_pytz() + import pytz + + us_eastern = pytz.timezone('US/Eastern') + arr = np.array([us_eastern.localize(datetime(year=2000, month=1, day=1, hour=3, minute=0)), + us_eastern.localize(datetime(year=2000, month=6, day=1, hour=3, minute=0))],dtype=object) + result = pd.to_datetime(arr, utc=True) + expected = DatetimeIndex(['2000-01-01 08:00:00+00:00', '2000-06-01 07:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) + tm.assert_index_equal(result, expected) + + def test_to_datetime_tz_psycopg2(self): + + # xref 8260 + try: + import psycopg2 + except ImportError: + raise nose.SkipTest("no psycopg2 installed") + + # misc cases + arr = np.array([ datetime(2000, 1, 1, 3, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None)), + datetime(2000, 6, 1, 3, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=-240, name=None))], dtype=object) + + result = pd.to_datetime(arr, errors='coerce', utc=True) + expected = DatetimeIndex(['2000-01-01 08:00:00+00:00', '2000-06-01 07:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) + tm.assert_index_equal(result, expected) + + # dtype coercion + i = pd.DatetimeIndex(['2000-01-01 08:00:00+00:00'],tz=psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None)) + self.assertFalse(com.is_datetime64_ns_dtype(i)) + + # tz coercion + result = pd.to_datetime(i, errors='coerce') + tm.assert_index_equal(result, i) + + result = pd.to_datetime(i, errors='coerce', utc=True) + expected = pd.DatetimeIndex(['2000-01-01 13:00:00']) + tm.assert_index_equal(result, expected) + def test_index_to_datetime(self): idx = Index(['1/1/2000', '1/2/2000', '1/3/2000']) @@ -2471,6 +2525,15 @@ def test_datetime64_with_DateOffset(self): exp = klass(date_range('1999-01-01', '1999-01-31')) assert_func(result, exp) + s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), + pd.Timestamp('2000-02-15', tz='US/Central')]) + result = s + pd.offsets.Day() + result2 = pd.offsets.Day() + s + exp = klass([Timestamp('2000-01-16 00:15:00', tz='US/Central'), + Timestamp('2000-02-16', tz='US/Central')]) + assert_func(result, exp) + assert_func(result2, exp) + s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), pd.Timestamp('2000-02-15', tz='US/Central')]) result = s + pd.offsets.MonthEnd() diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index aa655efc08ca5..9306175f995c5 100644 --- a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -17,7 +17,7 @@ from pytz import NonExistentTimeError import pandas.util.testing as tm - +from pandas.core.dtypes import DatetimeTZDtype from pandas.util.testing import assert_frame_equal from pandas.compat import lrange, zip @@ -667,7 +667,8 @@ def test_frame_no_datetime64_dtype(self): dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI') dr_tz = dr.tz_localize(self.tzstr('US/Eastern')) e = DataFrame({'A': 'foo', 'B': dr_tz}, index=dr) - self.assertEqual(e['B'].dtype, 'O') + tz_expected = DatetimeTZDtype('ns',dr_tz.tzinfo) + self.assertEqual(e['B'].dtype, tz_expected) # GH 2810 (with timezones) datetimes_naive = [ ts.to_pydatetime() for ts in dr ] @@ -675,8 +676,8 @@ def test_frame_no_datetime64_dtype(self): df = DataFrame({'dr' : dr, 'dr_tz' : dr_tz, 'datetimes_naive': datetimes_naive, 'datetimes_with_tz' : datetimes_with_tz }) - result = df.get_dtype_counts() - expected = Series({ 'datetime64[ns]' : 2, 'object' : 2 }) + result = df.get_dtype_counts().sort_index() + expected = Series({ 'datetime64[ns]' : 2, str(tz_expected) : 2 }).sort_index() tm.assert_series_equal(result, expected) def test_hongkong_tz_convert(self): diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index 85bae42e7a492..d67ff9d0e397e 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -943,6 +943,12 @@ def compare_local_to_utc(tz_didx, utc_didx): tslib.maybe_get_tz('Asia/Tokyo')) self.assert_numpy_array_equal(result, np.array([], dtype=np.int64)) + # Check all-NaT array + result = tslib.tz_convert(np.array([tslib.iNaT], dtype=np.int64), + tslib.maybe_get_tz('US/Eastern'), + tslib.maybe_get_tz('Asia/Tokyo')) + self.assert_numpy_array_equal(result, np.array([tslib.iNaT], dtype=np.int64)) + class TestTimestampOps(tm.TestCase): def test_timestamp_and_datetime(self): self.assertEqual((Timestamp(datetime.datetime(2013, 10, 13)) - datetime.datetime(2013, 10, 12)).days, 1) diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index 6f08448b47b1e..b53f59a621be2 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -291,6 +291,14 @@ def _convert_listlike(arg, box, format): pass return arg + + elif com.is_datetime64tz_dtype(arg): + if not isinstance(arg, DatetimeIndex): + return DatetimeIndex(arg, tz='utc' if utc else None) + if utc: + arg = arg.tz_convert(None) + return arg + elif format is None and com.is_integer_dtype(arg) and unit=='ns': result = arg.astype('datetime64[ns]') if box: diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 369993b4c54d1..ae440f67ce96a 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -1962,12 +1962,15 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', for i in range(n): val = values[i] - # set as nan if is even a datetime NaT + # set as nan except if it's a NaT if _checknull_with_nat(val): - oresult[i] = np.nan - elif util.is_datetime64_object(val): if val is np_NaT or val.view('i8') == iNaT: + oresult[i] = NaT + else: oresult[i] = np.nan + elif util.is_datetime64_object(val): + if val is np_NaT or val.view('i8') == iNaT: + oresult[i] = NaT else: oresult[i] = val.item() else: @@ -3296,7 +3299,7 @@ except: def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): cdef: - ndarray[int64_t] utc_dates, result, trans, deltas + ndarray[int64_t] utc_dates, tt, result, trans, deltas Py_ssize_t i, pos, n = len(vals) int64_t v, offset pandas_datetimestruct dts @@ -3315,27 +3318,38 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): if _is_tzlocal(tz1): for i in range(n): v = vals[i] - pandas_datetime_to_datetimestruct(v, PANDAS_FR_ns, &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz1) - delta = (int(total_seconds(_get_utcoffset(tz1, dt))) - * 1000000000) - utc_dates[i] = v - delta + if v == iNaT: + utc_dates[i] = iNaT + else: + pandas_datetime_to_datetimestruct(v, PANDAS_FR_ns, &dts) + dt = datetime(dts.year, dts.month, dts.day, dts.hour, + dts.min, dts.sec, dts.us, tz1) + delta = (int(total_seconds(_get_utcoffset(tz1, dt))) + * 1000000000) + utc_dates[i] = v - delta else: trans, deltas, typ = _get_dst_info(tz1) + # all-NaT + tt = vals[vals!=iNaT] + if not len(tt): + return vals + trans_len = len(trans) - pos = trans.searchsorted(vals[0]) - 1 + pos = trans.searchsorted(tt[0]) - 1 if pos < 0: raise ValueError('First time before start of DST info') offset = deltas[pos] for i in range(n): v = vals[i] - while pos + 1 < trans_len and v >= trans[pos + 1]: - pos += 1 - offset = deltas[pos] - utc_dates[i] = v - offset + if v == iNaT: + utc_dates[i] = iNaT + else: + while pos + 1 < trans_len and v >= trans[pos + 1]: + pos += 1 + offset = deltas[pos] + utc_dates[i] = v - offset else: utc_dates = vals @@ -3346,18 +3360,26 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): if _is_tzlocal(tz2): for i in range(n): v = utc_dates[i] + if v == iNaT: + result[i] = iNaT + else: + pandas_datetime_to_datetimestruct(v, PANDAS_FR_ns, &dts) + dt = datetime(dts.year, dts.month, dts.day, dts.hour, + dts.min, dts.sec, dts.us, tz2) + delta = int(total_seconds(_get_utcoffset(tz2, dt))) * 1000000000 + result[i] = v + delta - pandas_datetime_to_datetimestruct(v, PANDAS_FR_ns, &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz2) - delta = int(total_seconds(_get_utcoffset(tz2, dt))) * 1000000000 - result[i] = v + delta return result # Convert UTC to other timezone trans, deltas, typ = _get_dst_info(tz2) trans_len = len(trans) - pos = trans.searchsorted(utc_dates[0]) - 1 + # use first non-NaT element + # if all-NaT, return all-NaT + if (utc_dates==iNaT).all(): + return utc_dates + + pos = trans.searchsorted(utc_dates[utc_dates!=iNaT][0]) - 1 if pos < 0: raise ValueError('First time before start of DST info') @@ -3365,7 +3387,7 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): offset = deltas[pos] for i in range(n): v = utc_dates[i] - if vals[i] == NPY_NAT: + if vals[i] == iNaT: result[i] = vals[i] else: while pos + 1 < trans_len and v >= trans[pos + 1]: @@ -3412,6 +3434,7 @@ def tz_convert_single(int64_t val, object tz1, object tz2): dts.min, dts.sec, dts.us, tz2) delta = int(total_seconds(_get_utcoffset(tz2, dt))) * 1000000000 return utc_date + delta + # Convert UTC to other timezone trans, deltas, typ = _get_dst_info(tz2) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 4b7c8d4540e0f..c72f964b8d247 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -23,9 +23,11 @@ import numpy as np import pandas as pd -from pandas.core.common import (is_sequence, array_equivalent, is_list_like, +from pandas.core.common import (is_sequence, array_equivalent, is_list_like, is_number, is_datetimelike_v_numeric, is_datetimelike_v_object, - is_number, pprint_thing, take_1d) + is_number, pprint_thing, take_1d, + needs_i8_conversion) + import pandas.compat as compat from pandas.compat
import( filter, map, zip, range, unichr, lrange, lmap, lzip, u, callable, Counter, @@ -864,7 +866,7 @@ def assert_series_equal(left, right, check_dtype=True, elif check_datetimelike_compat: # we want to check only if we have compat dtypes # e.g. integer and M|m are NOT compat, but we can simply check the values in that case - if is_datetimelike_v_numeric(left, right) or is_datetimelike_v_object(left, right): + if is_datetimelike_v_numeric(left, right) or is_datetimelike_v_object(left, right) or needs_i8_conversion(left) or needs_i8_conversion(right): # datetimelike may have different objects (e.g. datetime.datetime vs Timestamp) but will compare equal if not Index(left.values).equals(Index(right.values)): diff --git a/vb_suite/binary_ops.py b/vb_suite/binary_ops.py index db9a6b730064e..6c268e7602c1c 100644 --- a/vb_suite/binary_ops.py +++ b/vb_suite/binary_ops.py @@ -172,3 +172,28 @@ start_date=datetime(2013, 1, 1)) timestamp_ops_diff2 = Benchmark("s-s.shift()", setup, start_date=datetime(2013, 1, 1)) + +#---------------------------------------------------------------------- +# timeseries with tz + +setup = common_setup + """ +N = 10000 +halfway = N // 2 - 1 +s = Series(date_range('20010101', periods=N, freq='T', tz='US/Eastern')) +ts = s[halfway] +""" + +timestamp_tz_series_compare = Benchmark("ts >= s", setup, + start_date=datetime(2013, 9, 27)) +series_timestamp_tz_compare = Benchmark("s <= ts", setup, + start_date=datetime(2012, 2, 21)) + +setup = common_setup + """ +N = 10000 +s = Series(date_range('20010101', periods=N, freq='s', tz='US/Eastern')) +""" + +timestamp_tz_ops_diff1 = Benchmark("s.diff()", setup, + start_date=datetime(2013, 1, 1)) +timestamp_tz_ops_diff2 = Benchmark("s-s.shift()", setup, + start_date=datetime(2013, 1, 1)) diff --git a/vb_suite/timeseries.py b/vb_suite/timeseries.py index 151777add104a..2273623927c51 100644 --- a/vb_suite/timeseries.py +++ b/vb_suite/timeseries.py @@ -105,7 +105,7 @@ def date_range(start=None, end=None, periods=None, freq=None): start_date=datetime(2012, 4, 27)) #---------------------------------------------------------------------- -# Time zone stuff +# Time zone setup = common_setup + """ rng = date_range('1/1/2000', '3/1/2000', tz='US/Eastern')
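# A minimal, self-contained sketch of the tz-aware arithmetic the new
# binary_ops benchmarks above exercise; assumes pandas >= 0.17.0 so the
# series carries the datetime64[ns, US/Eastern] dtype
import pandas as pd
from pandas import Series, date_range

s = Series(date_range('20010101', periods=10, freq='T', tz='US/Eastern'))
ts = s[5]
s >= ts          # vectorized comparison against a tz-aware Timestamp scalar
s.diff()         # timedelta64[ns] result; the first element is NaT
s - s.shift()    # the equivalent elementwise difference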