Skip to content

Commit

Permalink
Merge pull request #11216 from jreback/datetime_with_tz
Browse files Browse the repository at this point in the history
BUG: edge case when reading from postgresql with read_sql_query and datetime with tz and chunksize
  • Loading branch information
jreback committed Oct 3, 2015
2 parents d6c7a3a + bd26dec commit 071cffd
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 28 deletions.
39 changes: 28 additions & 11 deletions pandas/io/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from pandas.core.api import DataFrame, Series
from pandas.core.common import isnull
from pandas.core.base import PandasObject
from pandas.core.dtypes import DatetimeTZDtype
from pandas.tseries.tools import to_datetime
from pandas.util.decorators import Appender

Expand Down Expand Up @@ -89,6 +90,10 @@ def _handle_date_column(col, format=None):
# parse dates as timestamp
format = 's' if format is None else format
return to_datetime(col, errors='coerce', unit=format, utc=True)
elif com.is_datetime64tz_dtype(col):
# coerce to UTC timezone
# GH11216
return to_datetime(col,errors='coerce').astype('datetime64[ns, UTC]')
else:
return to_datetime(col, errors='coerce', format=format, utc=True)

Expand All @@ -113,6 +118,14 @@ def _parse_date_columns(data_frame, parse_dates):
fmt = None
data_frame[col_name] = _handle_date_column(df_col, format=fmt)


# we want to coerce datetime64_tz dtypes for now
# we could in theory do a 'nice' conversion from a FixedOffset tz
# GH11216
for col_name, df_col in data_frame.iteritems():
if com.is_datetime64tz_dtype(df_col):
data_frame[col_name] = _handle_date_column(df_col)

return data_frame


Expand Down Expand Up @@ -366,7 +379,7 @@ def read_sql_query(sql, con, index_col=None, coerce_float=True, params=None,
----------
sql : string SQL query or SQLAlchemy Selectable (select or text object)
to be executed.
con : SQLAlchemy connectable(engine/connection) or database string URI
con : SQLAlchemy connectable(engine/connection) or database string URI
or sqlite3 DBAPI2 connection
Using SQLAlchemy makes it possible to use any DB supported by that
library.
Expand Down Expand Up @@ -898,11 +911,10 @@ def _harmonize_columns(self, parse_dates=None):
try:
df_col = self.frame[col_name]
# the type the dataframe column should have
col_type = self._numpy_type(sql_col.type)
col_type = self._get_dtype(sql_col.type)

if col_type is datetime or col_type is date:
if not issubclass(df_col.dtype.type, np.datetime64):
self.frame[col_name] = _handle_date_column(df_col)
if col_type is datetime or col_type is date or col_type is DatetimeTZDtype:
self.frame[col_name] = _handle_date_column(df_col)

elif col_type is float:
# floats support NA, can always convert!
Expand Down Expand Up @@ -982,20 +994,25 @@ def _sqlalchemy_type(self, col):

return Text

def _numpy_type(self, sqltype):
from sqlalchemy.types import Integer, Float, Boolean, DateTime, Date
def _get_dtype(self, sqltype):
from sqlalchemy.types import Integer, Float, Boolean, DateTime, Date, TIMESTAMP

if isinstance(sqltype, Float):
return float
if isinstance(sqltype, Integer):
elif isinstance(sqltype, Integer):
# TODO: Refine integer size.
return np.dtype('int64')
if isinstance(sqltype, DateTime):
elif isinstance(sqltype, TIMESTAMP):
# we have a timezone capable type
if not sqltype.timezone:
return datetime
return DatetimeTZDtype
elif isinstance(sqltype, DateTime):
# Caution: np.datetime64 is also a subclass of np.number.
return datetime
if isinstance(sqltype, Date):
elif isinstance(sqltype, Date):
return date
if isinstance(sqltype, Boolean):
elif isinstance(sqltype, Boolean):
return bool
return object

Expand Down
79 changes: 62 additions & 17 deletions pandas/io/tests/test_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,15 @@
import nose
import warnings
import numpy as np
import pandas as pd

from datetime import datetime, date, time

from pandas import DataFrame, Series, Index, MultiIndex, isnull, concat
from pandas import date_range, to_datetime, to_timedelta, Timestamp
import pandas.compat as compat
from pandas.compat import StringIO, range, lrange, string_types
from pandas.core import common as com
from pandas.core.datetools import format as date_format

import pandas.io.sql as sql
Expand Down Expand Up @@ -1248,6 +1250,66 @@ def test_default_date_load(self):
self.assertTrue(issubclass(df.DateCol.dtype.type, np.datetime64),
"DateCol loaded with incorrect type")

def test_datetime_with_timezone(self):
    """Check how "datetime with time zone" columns are loaded (GH11216).

    postgresql can hand such columns back as
    datetime64[ns, psycopg2.tz.FixedOffsetTimezone..]. That is valid but
    not natural to work with, so the readers are expected to return the
    values expressed in UTC: either as naive datetime64[ns] or as
    datetime64[ns, UTC].

    Three read paths are exercised: ``read_sql_query`` (with and without
    ``parse_dates``), ``read_sql_query`` with ``chunksize`` and
    ``read_sql_table``.

    Raises
    ------
    nose.SkipTest
        If the query result has no ``DateColWithTz`` column.
    """
    query = "select * from types_test_data"
    bad_dtype = "DateCol loaded with incorrect type -> {0}"

    def check(col):
        # The column must be either datetime64[ns] (naive, tz=None) or
        # datetime64[ns, UTC]; anything else is a failure.
        if com.is_datetime64_dtype(col.dtype):
            tz = None
        elif com.is_datetime64tz_dtype(col.dtype):
            self.assertEqual(str(col.dt.tz), 'UTC')
            tz = 'UTC'
        else:
            raise AssertionError(bad_dtype.format(col.dtype))

        # "2000-01-01 00:00:00-08:00" should convert to
        # "2000-01-01 08:00:00"
        self.assertEqual(col[0], Timestamp('2000-01-01 08:00:00', tz=tz))

        # "2000-06-01 00:00:00-07:00" should convert to
        # "2000-06-01 07:00:00"
        self.assertEqual(col[1], Timestamp('2000-06-01 07:00:00', tz=tz))

    # GH11216
    df = pd.read_sql_query(query, self.conn)
    if not hasattr(df, 'DateColWithTz'):
        raise nose.SkipTest("no column with datetime with time zone")

    # this is parsed on Travis (linux), but not on macosx for some reason
    # even with the same versions of psycopg2 & sqlalchemy, possibly a
    # PostgreSQL server version difference
    col = df.DateColWithTz
    self.assertTrue(com.is_object_dtype(col.dtype) or
                    com.is_datetime64_dtype(col.dtype) or
                    com.is_datetime64tz_dtype(col.dtype),
                    bad_dtype.format(col.dtype))

    # explicitly asking for date parsing must yield UTC-consistent values
    df = pd.read_sql_query(query, self.conn, parse_dates=['DateColWithTz'])
    if not hasattr(df, 'DateColWithTz'):
        raise nose.SkipTest("no column with datetime with time zone")
    check(df.DateColWithTz)

    # reading one row per chunk and concatenating must give a tz-aware
    # UTC column that matches what read_sql_table returns
    chunks = pd.read_sql_query(query, self.conn, chunksize=1)
    df = pd.concat(list(chunks), ignore_index=True)
    col = df.DateColWithTz
    self.assertTrue(com.is_datetime64tz_dtype(col.dtype),
                    bad_dtype.format(col.dtype))
    self.assertEqual(str(col.dt.tz), 'UTC')
    expected = sql.read_sql_table("types_test_data", self.conn)
    tm.assert_series_equal(
        df.DateColWithTz,
        expected.DateColWithTz.astype('datetime64[ns, UTC]'))

    # xref #7139
    # this might or might not be converted depending on the postgres driver
    df = sql.read_sql_table("types_test_data", self.conn)
    check(df.DateColWithTz)

def test_date_parsing(self):
# No Parsing
df = sql.read_sql_table("types_test_data", self.conn)
Expand Down Expand Up @@ -1746,23 +1808,6 @@ def test_schema_support(self):
res2 = pdsql.read_table('test_schema_other2')
tm.assert_frame_equal(res1, res2)

def test_datetime_with_time_zone(self):
    # GH #7139: a date column carrying timezone information should be
    # read back converted to UTC and stored as np.datetime64.
    frame = sql.read_sql_table("types_test_data", self.conn)
    tz_col = frame.DateColWithTz

    loaded_as_datetime64 = issubclass(tz_col.dtype.type, np.datetime64)
    self.assertTrue(
        loaded_as_datetime64,
        "DateColWithTz loaded with incorrect type -> {0}".format(tz_col.dtype))

    # Stored values and the UTC wall time each row should convert to:
    #   row 0: "2000-01-01 00:00:00-08:00" -> "2000-01-01 08:00:00"
    #   row 1: "2000-06-01 00:00:00-07:00" -> "2000-06-01 07:00:00"
    expected_utc = ('2000-01-01 08:00:00', '2000-06-01 07:00:00')
    for position, utc_text in enumerate(expected_utc):
        self.assertEqual(tz_col[position], Timestamp(utc_text))


class TestMySQLAlchemy(_TestMySQLAlchemy, _TestSQLAlchemy):
pass

Expand Down

0 comments on commit 071cffd

Please sign in to comment.