-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve date/time hint handling for Pandas based on better testing #148
Changes from all commits
04b0166
fffbe35
6b804b4
c38477f
92e7d63
8057981
75c4825
e7852af
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
93.3500 | ||
93.4300 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
92.5900 | ||
92.6000 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,7 @@ | ||
from .types import HintCompression, HintEncoding, HintDateFormat, HintTimeOnlyFormat, HintQuoting | ||
import logging | ||
import csv | ||
from typing import Optional, Dict, Union | ||
from typing_extensions import Literal | ||
from typing import Optional, Dict | ||
|
||
|
||
logger = logging.getLogger(__name__) | ||
|
@@ -39,16 +38,19 @@ | |
# account for MM/DD time. | ||
# | ||
# https://github.com/bluelabsio/records-mover/issues/75 | ||
# | ||
python_date_format_from_hints: Dict[Union[HintDateFormat, Literal['DD/MM/YY']], str] = { | ||
python_date_format_from_hints: Dict[HintDateFormat, str] = { | ||
'DD-MM-YYYY': '%d-%m-%Y', | ||
'MM-DD-YYYY': '%m-%d-%Y', | ||
'YYYY-MM-DD': '%Y-%m-%d', | ||
'MM/DD/YY': '%m/%d/%Y', | ||
'DD/MM/YY': '%d/%m/%Y', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These were wrong! %Y is a four digit year, %y is a two digit year. |
||
'MM/DD/YY': '%m/%d/%y', | ||
'DD/MM/YY': '%d/%m/%y', | ||
'DD-MM-YY': '%d-%m-%y', | ||
} | ||
|
||
python_time_format_from_hints: Dict[HintTimeOnlyFormat, str] = { | ||
'HH24:MI:SS': '%H:%M:%S', | ||
'HH12:MI AM': '%I:%M:%S %p', | ||
'HH:MI:SS': '%H:%M:%S', | ||
'HH12:MI AM': '%I:%M %p', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We were adding seconds here against what the user was telling us. We have a similar bug with datetime/datetimetz types - see #143 Once we have more solid tests here we can go back and generalize to_csv_options() and fix those along the way - to fix it with the current design would be basically remove (mostly-working) functionality. |
||
} | ||
|
||
hint_encoding_from_pandas: Dict[str, HintEncoding] = { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -63,28 +63,23 @@ def pandas_to_csv_options(records_format: DelimitedRecordsFormat, | |
pandas_options['header'] = hints.header_row | ||
quiet_remove(unhandled_hints, 'header-row') | ||
|
||
# Note the limitation on Pandas export with BigQuery around | ||
# datetimeformattz: | ||
# | ||
# https://github.com/bluelabsio/records-mover/issues/95 | ||
|
||
# The current datetimefomat/datetimeformattz hint support in | ||
# to_csv_options.py is limited to ISO format driven by the | ||
# dateformat hint. | ||
# | ||
# This could be generalized to a smarter function which translates | ||
# to from the Records Spec language into the | ||
# Python/Pandas/strftime format, and reject fewer hints as a | ||
# result. | ||
# | ||
# https://github.com/bluelabsio/records-mover/issues/143 | ||
if hints.dateformat is None: | ||
if hints.datetimeformattz == hints.datetimeformat: | ||
# BigQuery requires that timezone offsets have a colon; | ||
# Python (and thus Pandas) doesn't support adding the | ||
# colon with strftime. However, we can specify things | ||
# without a timezone delimiter just fine. | ||
# | ||
# Unfortunately Python/Pandas will drop the timezone info | ||
# instead of converting the timestamp to UTC. This | ||
# corrupts the time, as BigQuery assumes what it gets in | ||
# is UTC format. Boo. | ||
# | ||
# $ python3 | ||
# >>> import pytz | ||
# >>> us_eastern = pytz.timezone('US/Eastern') | ||
# >>> import datetime | ||
# >>> us_eastern.localize(datetime.datetime(2000, 1, 2, 12, 34, 56, 789012)) | ||
# .strftime('%Y-%m-%d %H:%M:%S.%f') | ||
# '2000-01-02 12:34:56.789012' | ||
# >>> | ||
# | ||
# https://github.com/bluelabsio/records-mover/issues/95 | ||
pandas_options['date_format'] = '%Y-%m-%d %H:%M:%S.%f' | ||
else: | ||
pandas_options['date_format'] = '%Y-%m-%d %H:%M:%S.%f%z' | ||
|
@@ -108,14 +103,22 @@ def pandas_to_csv_options(records_format: DelimitedRecordsFormat, | |
pandas_options['date_format'] = '%m/%d/%y %H:%M:%S.%f' | ||
else: | ||
pandas_options['date_format'] = '%m/%d/%y %H:%M:%S.%f%z' | ||
elif hints.dateformat == 'DD/MM/YY': | ||
if hints.datetimeformattz == hints.datetimeformat: | ||
pandas_options['date_format'] = '%d/%m/%y %H:%M:%S.%f' | ||
else: | ||
pandas_options['date_format'] = '%d/%m/%y %H:%M:%S.%f%z' | ||
elif hints.dateformat == 'DD-MM-YY': | ||
if hints.datetimeformattz == hints.datetimeformat: | ||
pandas_options['date_format'] = '%d-%m-%y %H:%M:%S.%f' | ||
else: | ||
pandas_options['date_format'] = '%d-%m-%y %H:%M:%S.%f%z' | ||
else: | ||
cant_handle_hint(fail_if_cant_handle_hint, 'dateformat', hints) | ||
quiet_remove(unhandled_hints, 'dateformat') | ||
|
||
# pandas can't seem to export a date and time together :( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This comment isn't right; it certainly can. What it can't do is export a time without a date, or a date without a time. That's what we have prep_for_csv.py for. |
||
# | ||
# might be nice someday to only emit the errors if the actual data | ||
# being moved is affected by whatever limitation... | ||
# It might be nice someday to only emit the errors if the actual | ||
# data being moved is affected by whatever limitation... | ||
if (hints.datetimeformattz not in (f"{hints.dateformat} HH24:MI:SSOF", | ||
f"{hints.dateformat} HH:MI:SSOF", | ||
f"{hints.dateformat} HH24:MI:SS", | ||
|
@@ -137,7 +140,7 @@ def pandas_to_csv_options(records_format: DelimitedRecordsFormat, | |
cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformat', hints) | ||
quiet_remove(unhandled_hints, 'datetimeformat') | ||
|
||
if hints.timeonlyformat != 'HH24:MI:SS': | ||
if hints.timeonlyformat not in ['HH24:MI:SS', 'HH:MI:SS']: | ||
cant_handle_hint(fail_if_cant_handle_hint, 'timeonlyformat', hints) | ||
quiet_remove(unhandled_hints, 'timeonlyformat') | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since this is now a str alias, no reason to add more things to it.