Skip to content

Commit

Permalink
API: add DatetimeBlockTZ pandas-dev#8260
Browse files Browse the repository at this point in the history
fix scalar comparisons vs None generally

fix NaT formatting in Series

TST: skip postgresql test with tz's

update for msgpack

Conflicts:
	pandas/core/base.py
	pandas/core/categorical.py
	pandas/core/format.py
	pandas/tests/test_base.py
	pandas/util/testing.py

full interop for tz-aware Series & timedeltas pandas-dev#10763
  • Loading branch information
jreback authored and Nick Eubank committed Sep 29, 2015
1 parent 1160b6e commit 1cbdcb5
Show file tree
Hide file tree
Showing 60 changed files with 2,523 additions and 1,089 deletions.
6 changes: 3 additions & 3 deletions asv_bench/asv.conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@
// If missing or the empty string, the tool will be automatically
// determined by looking for tools on the PATH environment
// variable.
"environment_type": "",
"environment_type": "conda",

// the base URL to show a commit for the project.
"show_commit_url": "https://github.com/pydata/pandas/commit/",

// The Pythons you'd like to test against. If not provided, defaults
// to the current version of Python used to run `asv`.
// "pythons": ["2.7", "3.4"],
"pythons": ["2.7", "3.4"],
"pythons": ["2.7"],

// The matrix of dependencies to test. Each key is the name of a
// package (in PyPI) and the values are version numbers. An empty
Expand All @@ -41,7 +41,7 @@
"sqlalchemy": [],
"scipy": [],
"numexpr": [],
"tables": [],
"pytables": [],
"openpyxl": [],
"xlrd": [],
"xlwt": []
Expand Down
39 changes: 32 additions & 7 deletions asv_bench/benchmarks/binary_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,34 +203,59 @@ def time_series_timestamp_compare(self):

class timestamp_ops_diff1(object):
goal_time = 0.2
N = 1000000

def setup(self):
self.N = 1000000
self.s = Series(date_range('20010101', periods=self.N, freq='s'))
self.s = self.create()

def create(self):
return Series(date_range('20010101', periods=self.N, freq='s'))

def time_timestamp_ops_diff1(self):
self.s.diff()

class timestamp_tz_ops_diff1(timestamp_ops_diff1):
N = 10000

def create(self):
return Series(date_range('20010101', periods=self.N, freq='s', tz='US/Eastern'))

class timestamp_ops_diff2(object):
goal_time = 0.2
N = 1000000

def setup(self):
self.N = 1000000
self.s = Series(date_range('20010101', periods=self.N, freq='s'))
self.s = self.create()

def create(self):
return Series(date_range('20010101', periods=self.N, freq='s'))

def time_timestamp_ops_diff2(self):
(self.s - self.s.shift())

class timestamp_tz_ops_diff2(timestamp_ops_diff2):
N = 10000

def create(self):
return Series(date_range('20010101', periods=self.N, freq='s', tz='US/Eastern'))

class timestamp_series_compare(object):
goal_time = 0.2
N = 1000000

def setup(self):
self.N = 1000000
self.halfway = ((self.N // 2) - 1)
self.s = Series(date_range('20010101', periods=self.N, freq='T'))
self.s = self.create()
self.ts = self.s[self.halfway]

def create(self):
return Series(date_range('20010101', periods=self.N, freq='T'))

def time_timestamp_series_compare(self):
(self.ts >= self.s)
(self.ts >= self.s)

class timestamp_tz_series_compare(timestamp_series_compare):
N = 10000

def create(self):
return Series(date_range('20010101', periods=self.N, freq='T', tz='US/Eastern'))
15 changes: 11 additions & 4 deletions doc/source/basics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1590,9 +1590,10 @@ dtypes
------

The main types stored in pandas objects are ``float``, ``int``, ``bool``,
``datetime64[ns]``, ``timedelta[ns]`` and ``object``. In addition these dtypes
have item sizes, e.g. ``int64`` and ``int32``. A convenient :attr:`~DataFrame.dtypes``
attribute for DataFrames returns a Series with the data type of each column.
``datetime64[ns]`` and ``datetime64[ns, tz]`` (in >= 0.17.0), ``timedelta[ns]``, ``category`` (in >= 0.15.0), and ``object``. In addition these dtypes
have item sizes, e.g. ``int64`` and ``int32``. See :ref:`Series with TZ <timeseries.timezone_series>` for more detail on ``datetime64[ns, tz]`` dtypes.

A convenient :attr:`~DataFrame.dtypes` attribute for DataFrames returns a Series with the data type of each column.

.. ipython:: python
Expand Down Expand Up @@ -1814,8 +1815,14 @@ dtypes:
df['tdeltas'] = df.dates.diff()
df['uint64'] = np.arange(3, 6).astype('u8')
df['other_dates'] = pd.date_range('20130101', periods=3).values
df['tz_aware_dates'] = pd.date_range('20130101', periods=3, tz='US/Eastern')
df
And the dtypes

.. ipython:: python
df.dtypes
:meth:`~DataFrame.select_dtypes` has two parameters ``include`` and ``exclude`` that allow you to
say "give me the columns WITH these dtypes" (``include``) and/or "give the
Expand Down Expand Up @@ -1868,7 +1875,7 @@ All numpy dtypes are subclasses of ``numpy.generic``:
.. note::

Pandas also defines an additional ``category`` dtype, which is not integrated into the normal
Pandas also defines the types ``category``, and ``datetime64[ns, tz]``, which are not integrated into the normal
numpy hierarchy and won't show up with the above function.

.. note::
Expand Down
1 change: 1 addition & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ Highlights include:

- Release the Global Interpreter Lock (GIL) on some cython operations, see :ref:`here <whatsnew_0170.gil>`
- The sorting API has been revamped to remove some long-time inconsistencies, see :ref:`here <whatsnew_0170.api_breaking.sorting>`
- Support for a ``datetime64[ns]`` with timezones as a first-class dtype, see :ref:`here <whatsnew_0170.tz>`
- The default for ``to_datetime`` will now be to ``raise`` when presented with unparseable formats,
previously this would return the original input, see :ref:`here <whatsnew_0170.api_breaking.to_datetime>`
- The default for ``dropna`` in ``HDFStore`` has changed to ``False``, to store by default all rows even
Expand Down
27 changes: 27 additions & 0 deletions doc/source/timeseries.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1745,3 +1745,30 @@ constructor as well as ``tz_localize``.
# tz_convert(None) is identical with tz_convert('UTC').tz_localize(None)
didx.tz_convert('UCT').tz_localize(None)
.. _timeseries.timezone_series:

TZ aware Dtypes
~~~~~~~~~~~~~~~

.. versionadded:: 0.17.0

``Series/DatetimeIndex`` with a timezone naive value are represented with a dtype of ``datetime64[ns]``.

.. ipython:: python
dr = pd.date_range('20130101',periods=3)
dr
s = Series(dr)
s
``Series/DatetimeIndex`` with a timezone aware value are represented with a dtype of ``datetime64[ns, tz]``.

.. ipython:: python
dr = pd.date_range('20130101',periods=3,tz='US/Eastern')
dr
s = Series(dr)
s
Both of these ``Series`` can be manipulated via the ``.dt`` accessor, see the :ref:`docs <basics.dt_accessors>` as well.
56 changes: 56 additions & 0 deletions doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Highlights include:

- Release the Global Interpreter Lock (GIL) on some cython operations, see :ref:`here <whatsnew_0170.gil>`
- The sorting API has been revamped to remove some long-time inconsistencies, see :ref:`here <whatsnew_0170.api_breaking.sorting>`
- Support for a ``datetime64[ns]`` with timezones as a first-class dtype, see :ref:`here <whatsnew_0170.tz>`
- The default for ``to_datetime`` will now be to ``raise`` when presented with unparseable formats,
previously this would return the original input, see :ref:`here <whatsnew_0170.api_breaking.to_datetime>`
- The default for ``dropna`` in ``HDFStore`` has changed to ``False``, to store by default all rows even
Expand Down Expand Up @@ -437,6 +438,58 @@ To keep the previous behaviour, you can use ``errors='ignore'``:
Furthermore, ``pd.to_timedelta`` has gained a similar API, of ``errors='raise'|'ignore'|'coerce'``, and the ``coerce`` keyword
has been deprecated in favor of ``errors='coerce'``.

.. _whatsnew_0170.tz:

Datetime with TZ
~~~~~~~~~~~~~~~~

We are adding an implementation that natively supports datetime with timezones. A ``Series`` or a ``DataFrame`` column previously
*could* be assigned a datetime with timezones, and would work as an ``object`` dtype. This had performance issues with a large
number of rows. (:issue:`8260`, :issue:`10763`)

The new implementation allows for having a single-timezone across all rows, and operating on it in a performant manner.

.. ipython:: python

df = DataFrame({'A' : date_range('20130101',periods=3),
'B' : date_range('20130101',periods=3,tz='US/Eastern'),
'C' : date_range('20130101',periods=3,tz='CET')})
df
df.dtypes

.. ipython:: python

df.B
df.B.dt.tz_localize(None)

This uses a new-dtype representation as well, that is very similar in look-and-feel to its numpy cousin ``datetime64[ns]``

.. ipython:: python

df['B'].dtype
type(df['B'].dtype)

.. note::

There is a slightly different string repr for the underlying ``DatetimeIndex`` as a result of the dtype changes, but
functionally these are the same.

.. code-block:: python

In [1]: pd.date_range('20130101',periods=3,tz='US/Eastern')
Out[1]: DatetimeIndex(['2013-01-01 00:00:00-05:00', '2013-01-02 00:00:00-05:00',
'2013-01-03 00:00:00-05:00'],
dtype='datetime64[ns]', freq='D', tz='US/Eastern')

In [2]: pd.date_range('20130101',periods=3,tz='US/Eastern').dtype
Out[2]: dtype('<M8[ns]')

.. ipython:: python

pd.date_range('20130101',periods=3,tz='US/Eastern')
pd.date_range('20130101',periods=3,tz='US/Eastern').dtype


.. _whatsnew_0170.api_breaking.convert_objects:

Changes to convert_objects
Expand Down Expand Up @@ -844,6 +897,9 @@ Bug Fixes
- Bug in incorrect computation of ``.mean()`` on ``timedelta64[ns]`` because of overflow (:issue:`9442`)
- Bug in ``DataFrame.to_html(index=False)`` renders unnecessary ``name`` row (:issue:`10344`)
- Bug in ``DataFrame.to_latex()`` the ``column_format`` argument could not be passed (:issue:`9402`)
- Bug in ``DatetimeIndex`` when localizing with ``NaT`` (:issue:`10477`)
- Bug in ``Series.dt`` ops in preserving meta-data (:issue:`10477`)
- Bug in preserving ``NaT`` when passed in an otherwise invalid ``to_datetime`` construction (:issue:`10477`)
- Bug in ``DataFrame.apply`` when function returns categorical series. (:issue:`9573`)
- Bug in ``to_datetime`` with invalid dates and formats supplied (:issue:`10154`)
- Bug in ``Index.drop_duplicates`` dropping name(s) (:issue:`10115`)
Expand Down
22 changes: 17 additions & 5 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
"""
from pandas.core.series import Series
from pandas.tools.tile import cut
from pandas.tseries.period import PeriodIndex
from pandas import Index, PeriodIndex, DatetimeIndex

name = getattr(values, 'name', None)
values = Series(values).values
Expand All @@ -225,11 +225,15 @@ def value_counts(values, sort=True, ascending=False, normalize=False,

dtype = values.dtype
is_period = com.is_period_arraylike(values)
is_datetimetz = com.is_datetimetz(values)

if com.is_datetime_or_timedelta_dtype(dtype) or is_period:
if com.is_datetime_or_timedelta_dtype(dtype) or is_period or is_datetimetz:

if is_period:
values = PeriodIndex(values, name=name)
values = PeriodIndex(values)
elif is_datetimetz:
tz = getattr(values, 'tz', None)
values = DatetimeIndex(values).tz_localize(None)

values = values.view(np.int64)
keys, counts = htable.value_count_scalar64(values, dropna)
Expand All @@ -239,8 +243,14 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
msk = keys != iNaT
keys, counts = keys[msk], counts[msk]

# localize to the original tz if necessary
if is_datetimetz:
keys = DatetimeIndex(keys).tz_localize(tz)

# convert the keys back to the dtype we came in
keys = keys.astype(dtype)
else:
keys = keys.astype(dtype)


elif com.is_integer_dtype(dtype):
values = com._ensure_int64(values)
Expand All @@ -257,7 +267,9 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
keys = np.insert(keys, 0, np.NaN)
counts = np.insert(counts, 0, mask.sum())

result = Series(counts, index=com._values_from_object(keys), name=name)
if not isinstance(keys, Index):
keys = Index(keys)
result = Series(counts, index=keys, name=name)

if bins is not None:
# TODO: This next line should be more efficient
Expand Down
15 changes: 14 additions & 1 deletion pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,11 @@ def base(self):
""" return the base object if the memory of the underlying data is shared """
return self.values.base

@property
def _values(self):
""" the internal implementation """
return self.values

def max(self):
""" The maximum value of the object """
return nanops.nanmax(self.values)
Expand Down Expand Up @@ -397,6 +402,14 @@ def hasnans(self):
""" return if I have any nans; enables various perf speedups """
return com.isnull(self).any()

def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
filter_type=None, **kwds):
""" perform the reduction type operation if we can """
func = getattr(self,name,None)
if func is None:
raise TypeError("{klass} cannot perform the operation {op}".format(klass=self.__class__.__name__,op=name))
return func(**kwds)

def value_counts(self, normalize=False, sort=True, ascending=False,
bins=None, dropna=True):
"""
Expand Down Expand Up @@ -586,7 +599,7 @@ def drop_duplicates(self, keep='first', inplace=False):
@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs)
def duplicated(self, keep='first'):
keys = com._ensure_object(self.values)
keys = com._values_from_object(com._ensure_object(self.values))
duplicated = lib.duplicated(keys, keep=keep)
try:
return self._constructor(duplicated,
Expand Down
7 changes: 4 additions & 3 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,14 @@
import pandas.core.common as com
from pandas.util.decorators import cache_readonly, deprecate_kwarg

from pandas.core.common import (CategoricalDtype, ABCSeries, ABCIndexClass, ABCCategoricalIndex,
from pandas.core.common import (ABCSeries, ABCIndexClass, ABCPeriodIndex, ABCCategoricalIndex,
isnull, notnull, is_dtype_equal,
is_categorical_dtype, is_integer_dtype, is_object_dtype,
_possibly_infer_to_datetimelike, get_dtype_kinds,
is_list_like, is_sequence, is_null_slice, is_bool,
_ensure_platform_int, _ensure_object, _ensure_int64,
_coerce_indexer_dtype, take_1d)
from pandas.core.dtypes import CategoricalDtype
from pandas.util.terminal import get_terminal_size
from pandas.core.config import get_option

Expand Down Expand Up @@ -85,7 +86,7 @@ def f(self, other):
def maybe_to_categorical(array):
""" coerce to a categorical if a series is given """
if isinstance(array, (ABCSeries, ABCCategoricalIndex)):
return array.values
return array._values
return array

_codes_doc = """The category codes of this categorical.
Expand Down Expand Up @@ -231,7 +232,7 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F

# we are either a Series or a CategoricalIndex
if isinstance(values, (ABCSeries, ABCCategoricalIndex)):
values = values.values
values = values._values

if ordered is None:
ordered = values.ordered
Expand Down
Loading

0 comments on commit 1cbdcb5

Please sign in to comment.