Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve date/time hint handling for Pandas based on better testing #148

Merged
merged 8 commits into from
Dec 10, 2020
2 changes: 1 addition & 1 deletion metrics/coverage_high_water_mark
Original file line number Diff line number Diff line change
@@ -1 +1 @@
93.3500
93.4300
2 changes: 1 addition & 1 deletion metrics/mypy_high_water_mark
Original file line number Diff line number Diff line change
@@ -1 +1 @@
92.5900
92.6000
16 changes: 9 additions & 7 deletions records_mover/records/delimited/conversions.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from .types import HintCompression, HintEncoding, HintDateFormat, HintTimeOnlyFormat, HintQuoting
import logging
import csv
from typing import Optional, Dict, Union
from typing_extensions import Literal
from typing import Optional, Dict


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -39,16 +38,19 @@
# account for MM/DD time.
#
# https://github.com/bluelabsio/records-mover/issues/75
#
python_date_format_from_hints: Dict[Union[HintDateFormat, Literal['DD/MM/YY']], str] = {
python_date_format_from_hints: Dict[HintDateFormat, str] = {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this is now a str alias, no reason to add more things to it.

'DD-MM-YYYY': '%d-%m-%Y',
'MM-DD-YYYY': '%m-%d-%Y',
'YYYY-MM-DD': '%Y-%m-%d',
'MM/DD/YY': '%m/%d/%Y',
'DD/MM/YY': '%d/%m/%Y',
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These were wrong! %Y is a four digit year, %y is a two digit year.

'MM/DD/YY': '%m/%d/%y',
'DD/MM/YY': '%d/%m/%y',
'DD-MM-YY': '%d-%m-%y',
}

python_time_format_from_hints: Dict[HintTimeOnlyFormat, str] = {
'HH24:MI:SS': '%H:%M:%S',
'HH12:MI AM': '%I:%M:%S %p',
'HH:MI:SS': '%H:%M:%S',
'HH12:MI AM': '%I:%M %p',
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We were adding seconds here against what the user was telling us. We have a similar bug with datetime/datetimetz types - see #143

Once we have more solid tests here we can go back and generalize to_csv_options() and fix those along the way - to fix it with the current design would be basically remove (mostly-working) functionality.

}

hint_encoding_from_pandas: Dict[str, HintEncoding] = {
Expand Down
53 changes: 28 additions & 25 deletions records_mover/records/pandas/to_csv_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,28 +63,23 @@ def pandas_to_csv_options(records_format: DelimitedRecordsFormat,
pandas_options['header'] = hints.header_row
quiet_remove(unhandled_hints, 'header-row')

# Note the limitation on Pandas export with BigQuery around
# datetimeformattz:
#
# https://github.com/bluelabsio/records-mover/issues/95

# The current datetimefomat/datetimeformattz hint support in
# to_csv_options.py is limited to ISO format driven by the
# dateformat hint.
#
# This could be generalized to a smarter function which translates
# to from the Records Spec language into the
# Python/Pandas/strftime format, and reject fewer hints as a
# result.
#
# https://github.com/bluelabsio/records-mover/issues/143
if hints.dateformat is None:
if hints.datetimeformattz == hints.datetimeformat:
# BigQuery requires that timezone offsets have a colon;
# Python (and thus Pandas) doesn't support adding the
# colon with strftime. However, we can specify things
# without a timezone delimiter just fine.
#
# Unfortunately Python/Pandas will drop the timezone info
# instead of converting the timestamp to UTC. This
# corrupts the time, as BigQuery assumes what it gets in
# is UTC format. Boo.
#
# $ python3
# >>> import pytz
# >>> us_eastern = pytz.timezone('US/Eastern')
# >>> import datetime
# >>> us_eastern.localize(datetime.datetime(2000, 1, 2, 12, 34, 56, 789012))
# .strftime('%Y-%m-%d %H:%M:%S.%f')
# '2000-01-02 12:34:56.789012'
# >>>
#
# https://github.com/bluelabsio/records-mover/issues/95
pandas_options['date_format'] = '%Y-%m-%d %H:%M:%S.%f'
else:
pandas_options['date_format'] = '%Y-%m-%d %H:%M:%S.%f%z'
Expand All @@ -108,14 +103,22 @@ def pandas_to_csv_options(records_format: DelimitedRecordsFormat,
pandas_options['date_format'] = '%m/%d/%y %H:%M:%S.%f'
else:
pandas_options['date_format'] = '%m/%d/%y %H:%M:%S.%f%z'
elif hints.dateformat == 'DD/MM/YY':
if hints.datetimeformattz == hints.datetimeformat:
pandas_options['date_format'] = '%d/%m/%y %H:%M:%S.%f'
else:
pandas_options['date_format'] = '%d/%m/%y %H:%M:%S.%f%z'
elif hints.dateformat == 'DD-MM-YY':
if hints.datetimeformattz == hints.datetimeformat:
pandas_options['date_format'] = '%d-%m-%y %H:%M:%S.%f'
else:
pandas_options['date_format'] = '%d-%m-%y %H:%M:%S.%f%z'
else:
cant_handle_hint(fail_if_cant_handle_hint, 'dateformat', hints)
quiet_remove(unhandled_hints, 'dateformat')

# pandas can't seem to export a date and time together :(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment isn't right; it certainly can. What it can't do is export a time without a date, or a date without a time. That's what we have prep_for_csv.py for.

#
# might be nice someday to only emit the errors if the actual data
# being moved is affected by whatever limitation...
# It might be nice someday to only emit the errors if the actual
# data being moved is affected by whatever limitation...
if (hints.datetimeformattz not in (f"{hints.dateformat} HH24:MI:SSOF",
f"{hints.dateformat} HH:MI:SSOF",
f"{hints.dateformat} HH24:MI:SS",
Expand All @@ -137,7 +140,7 @@ def pandas_to_csv_options(records_format: DelimitedRecordsFormat,
cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformat', hints)
quiet_remove(unhandled_hints, 'datetimeformat')

if hints.timeonlyformat != 'HH24:MI:SS':
if hints.timeonlyformat not in ['HH24:MI:SS', 'HH:MI:SS']:
cant_handle_hint(fail_if_cant_handle_hint, 'timeonlyformat', hints)
quiet_remove(unhandled_hints, 'timeonlyformat')

Expand Down
165 changes: 165 additions & 0 deletions tests/component/records/pandas/test_prep_for_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
from records_mover.records.pandas import prep_df_for_csv_output
from records_mover.records.schema import RecordsSchema
from records_mover.records import DelimitedRecordsFormat, ProcessingInstructions
from ..datetime_cases import (
DATE_CASES, DATETIMETZ_CASES, DATETIME_CASES, TIMEONLY_CASES,
create_sample, SAMPLE_YEAR, SAMPLE_MONTH, SAMPLE_DAY, SAMPLE_HOUR, SAMPLE_MINUTE, SAMPLE_SECOND
)


class TestPrepForCsv(unittest.TestCase):
Expand Down Expand Up @@ -120,3 +124,164 @@ def test_prep_df_for_csv_output_include_index(self):
self.assertEqual(new_df['time'][0], '12:33:53')
# self.assertEqual(new_df['timetz'][0], '12:33:53-05')
self.assertIsNotNone(new_df)

def test_dateformat(self):
schema_data = {
'schema': "bltypes/v1",
'fields': {
"date": {
"type": "date",
"index": 1,
},
}
}
records_schema = RecordsSchema.from_data(schema_data)
processing_instructions = ProcessingInstructions()
for dateformat in DATE_CASES:
records_format = DelimitedRecordsFormat(variant='bluelabs',
hints={
'dateformat': dateformat
})
# us_eastern = pytz.timezone('US/Eastern')
data = {
'date': [
pd.Timestamp(year=SAMPLE_YEAR, month=SAMPLE_MONTH, day=SAMPLE_DAY)
],
}
df = pd.DataFrame(data, columns=['date'])

new_df = prep_df_for_csv_output(df=df,
include_index=False,
records_schema=records_schema,
records_format=records_format,
processing_instructions=processing_instructions)
self.assertEqual(new_df['date'][0],
create_sample(dateformat))
# self.assertEqual(new_df['timetz'][0], '12:33:53-05')
self.assertIsNotNone(new_df)

def test_datetimeformattz(self):
schema_data = {
'schema': "bltypes/v1",
'fields': {
"datetimetz": {
"type": "datetimetz",
"index": 1,
},
}
}
records_schema = RecordsSchema.from_data(schema_data)
processing_instructions = ProcessingInstructions()
for datetimeformattz in DATETIMETZ_CASES:
records_format = DelimitedRecordsFormat(variant='bluelabs',
hints={
'datetimeformattz': datetimeformattz
})
# us_eastern = pytz.timezone('US/Eastern')
timestamp = pd.Timestamp(year=SAMPLE_YEAR, month=SAMPLE_MONTH, day=SAMPLE_DAY,
hour=SAMPLE_HOUR, minute=SAMPLE_MINUTE,
second=SAMPLE_SECOND)

data = {
'datetimetz': [
timestamp
],
}
df = pd.DataFrame(data, columns=['datetimetz'])

new_df = prep_df_for_csv_output(df=df,
include_index=False,
records_schema=records_schema,
records_format=records_format,
processing_instructions=processing_instructions)
# No conversion is done of datetimetz as pandas' CSV
# outputter handles it properly, so we should expect the
# original again
self.assertEqual(new_df['datetimetz'][0],
timestamp,
create_sample(datetimeformattz))
# self.assertEqual(new_df['timetz'][0], '12:33:53-05')
self.assertIsNotNone(new_df)

def test_datetimeformat(self):
schema_data = {
'schema': "bltypes/v1",
'fields': {
"datetimez": {
"type": "datetime",
"index": 1,
},
}
}
records_schema = RecordsSchema.from_data(schema_data)
processing_instructions = ProcessingInstructions()
for datetimeformat in DATETIME_CASES:
records_format = DelimitedRecordsFormat(variant='bluelabs',
hints={
'datetimeformat': datetimeformat
})
# us_eastern = pytz.timezone('US/Eastern')
timestamp = pd.Timestamp(year=SAMPLE_YEAR, month=SAMPLE_MONTH, day=SAMPLE_DAY,
hour=SAMPLE_HOUR, minute=SAMPLE_MINUTE,
second=SAMPLE_SECOND)

data = {
'datetime': [
timestamp
],
}
df = pd.DataFrame(data, columns=['datetime'])

new_df = prep_df_for_csv_output(df=df,
include_index=False,
records_schema=records_schema,
records_format=records_format,
processing_instructions=processing_instructions)
# No conversion is done of datetime as pandas' CSV
# outputter handles it properly, so we should expect the
# original again
self.assertEqual(new_df['datetime'][0],
timestamp,
create_sample(datetimeformat))
# self.assertEqual(new_df['timetz'][0], '12:33:53-05')
self.assertIsNotNone(new_df)

def test_timeonlyformat(self):
schema_data = {
'schema': "bltypes/v1",
'fields': {
"datetimez": {
"type": "time",
"index": 1,
},
}
}
records_schema = RecordsSchema.from_data(schema_data)
processing_instructions = ProcessingInstructions()
for timeonlyformat in TIMEONLY_CASES:
records_format = DelimitedRecordsFormat(variant='bluelabs',
hints={
'timeonlyformat': timeonlyformat
})
# us_eastern = pytz.timezone('US/Eastern')
timestamp = pd.Timestamp(year=SAMPLE_YEAR, month=SAMPLE_MONTH, day=SAMPLE_DAY,
hour=SAMPLE_HOUR, minute=SAMPLE_MINUTE,
second=SAMPLE_SECOND)

data = {
'time': [
timestamp
],
}
df = pd.DataFrame(data, columns=['time'])

new_df = prep_df_for_csv_output(df=df,
include_index=False,
records_schema=records_schema,
records_format=records_format,
processing_instructions=processing_instructions)
self.assertEqual(new_df['time'][0],
create_sample(timeonlyformat),
timeonlyformat)
# self.assertEqual(new_df['timetz'][0], '12:33:53-05')
self.assertIsNotNone(new_df)
Loading