-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Improve date/time hint handling for Redshift based on better testing
Based on the new testing approach in #144, this PR adds component tests for date/time types while loading/exporting CSV files in Redshift databases. Along the way some places for improvement were found!
- Loading branch information
1 parent
080b970
commit 2f68b2c
Showing
3 changed files
with
282 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,163 @@ | ||
import unittest | ||
from records_mover.db.redshift.records_copy import redshift_copy_options | ||
from records_mover.records import DelimitedRecordsFormat, AvroRecordsFormat | ||
from sqlalchemy_redshift.commands import Encoding, Format | ||
from ...records.datetime_cases import ( | ||
DATE_CASES, DATETIMETZ_CASES, DATETIME_CASES, TIMEONLY_CASES | ||
) | ||
|
||
|
||
class TestRecordsCopy(unittest.TestCase): | ||
def test_redshift_copy_options_encodings(self): | ||
tests = { | ||
'UTF16': Encoding.utf16, | ||
'UTF16LE': Encoding.utf16le, | ||
'UTF16BE': Encoding.utf16be | ||
} | ||
for hint_spelling, redshift_sqlalchemy_spelling in tests.items(): | ||
|
||
records_format =\ | ||
DelimitedRecordsFormat(variant='bluelabs', | ||
hints={ | ||
'encoding': hint_spelling | ||
}) | ||
unhandled_hints = set(records_format.hints.keys()) | ||
out = redshift_copy_options(unhandled_hints, | ||
records_format, | ||
fail_if_cant_handle_hint=True, | ||
fail_if_row_invalid=True, | ||
max_failure_rows=0) | ||
self.assertIs(out['encoding'], redshift_sqlalchemy_spelling) | ||
|
||
def test_redshift_copy_options_avro(self): | ||
records_format = AvroRecordsFormat() | ||
unhandled_hints = set() | ||
out = redshift_copy_options(unhandled_hints, | ||
records_format, | ||
fail_if_cant_handle_hint=True, | ||
fail_if_row_invalid=True, | ||
max_failure_rows=0) | ||
self.assertIs(out['format'], Format.avro) | ||
|
||
def test_redshift_copy_options_dateformat(self): | ||
# The records spec's date/time formats are based on Redshift's | ||
# spec originally, so it's expected that everything here would | ||
# be accepted as-is, but please double-check with Redshift's | ||
# docs as new test cases are added | ||
accept_as_is = { | ||
'YYYY-MM-DD': True, | ||
'MM-DD-YYYY': True, | ||
'DD-MM-YYYY': True, | ||
'MM/DD/YY': True, | ||
'DD/MM/YY': True, | ||
'DD-MM-YY': True, | ||
} | ||
for dateformat in DATE_CASES: | ||
records_format =\ | ||
DelimitedRecordsFormat(variant='bluelabs', | ||
hints={ | ||
'dateformat': dateformat | ||
}) | ||
unhandled_hints = set(records_format.hints.keys()) | ||
out = redshift_copy_options(unhandled_hints, | ||
records_format, | ||
fail_if_cant_handle_hint=True, | ||
fail_if_row_invalid=True, | ||
max_failure_rows=0) | ||
if accept_as_is[dateformat]: | ||
self.assertIs(out['date_format'], dateformat) | ||
else: | ||
self.fail('define what to expect here') | ||
|
||
def test_redshift_copy_options_datetimeformattz(self): | ||
# Redshift's time_format doesn't support separate | ||
# configuration for datetimeformat vs datetimeformattz, but | ||
# the 'auto' flag seems to work with specific things (see | ||
# tests run in records_copy.py). | ||
# | ||
# Please verify new formats have a test run | ||
# and documented in records_copy.py before putting an entry in | ||
# here. | ||
expectations = { | ||
'YYYY-MM-DD HH:MI:SSOF': 'auto', | ||
'YYYY-MM-DD HH:MI:SS': 'YYYY-MM-DD HH:MI:SS', | ||
'YYYY-MM-DD HH24:MI:SSOF': 'auto', | ||
'MM/DD/YY HH24:MIOF': 'auto', | ||
'MM/DD/YY HH24:MI': 'MM/DD/YY HH24:MI', | ||
} | ||
for datetimeformattz in DATETIMETZ_CASES: | ||
hints = { | ||
'datetimeformattz': datetimeformattz, | ||
'datetimeformat': datetimeformattz.replace('OF', '') | ||
} | ||
records_format =\ | ||
DelimitedRecordsFormat(variant='bluelabs', | ||
hints=hints) | ||
unhandled_hints = set(records_format.hints.keys()) | ||
out = redshift_copy_options(unhandled_hints, | ||
records_format, | ||
fail_if_cant_handle_hint=True, | ||
fail_if_row_invalid=True, | ||
max_failure_rows=0) | ||
self.assertEquals(out['time_format'], expectations[datetimeformattz]) | ||
|
||
def test_redshift_copy_options_datetimeformat(self): | ||
# Redshift's time_format doesn't support separate | ||
# configuration for datetimeformat vs datetimeformattz, but | ||
# the 'auto' flag seems to work with specific things (see | ||
# tests run in records_copy.py). | ||
# | ||
# Please verify new formats have a test run | ||
# and documented in records_copy.py before putting an entry in | ||
# here. | ||
for datetimeformat in DATETIME_CASES: | ||
hints = { | ||
'datetimeformattz': f"{datetimeformat}OF", | ||
'datetimeformat': datetimeformat, | ||
} | ||
records_format =\ | ||
DelimitedRecordsFormat(variant='bluelabs', | ||
hints=hints) | ||
unhandled_hints = set(records_format.hints.keys()) | ||
out = redshift_copy_options(unhandled_hints, | ||
records_format, | ||
fail_if_cant_handle_hint=True, | ||
fail_if_row_invalid=True, | ||
max_failure_rows=0) | ||
self.assertEqual(out['time_format'], 'auto') | ||
for datetimeformat in DATETIME_CASES: | ||
hints = { | ||
'datetimeformattz': datetimeformat, | ||
'datetimeformat': datetimeformat, | ||
} | ||
records_format =\ | ||
DelimitedRecordsFormat(variant='bluelabs', | ||
hints=hints) | ||
unhandled_hints = set(records_format.hints.keys()) | ||
out = redshift_copy_options(unhandled_hints, | ||
records_format, | ||
fail_if_cant_handle_hint=True, | ||
fail_if_row_invalid=True, | ||
max_failure_rows=0) | ||
self.assertEqual(out['time_format'], datetimeformat) | ||
|
||
def test_redshift_copy_options_timeonlyformat(self): | ||
# Redshift didn't used to have a time only format, so Records | ||
# Mover doesn't yet support anything here and just treats | ||
# these as strings. Nothing should raise. | ||
# | ||
# https://github.com/bluelabsio/records-mover/issues/141 | ||
for timeonlyformat in TIMEONLY_CASES: | ||
hints = { | ||
'timeonlyformat': timeonlyformat, | ||
} | ||
records_format =\ | ||
DelimitedRecordsFormat(variant='bluelabs', | ||
hints=hints) | ||
unhandled_hints = set(records_format.hints.keys()) | ||
redshift_copy_options(unhandled_hints, | ||
records_format, | ||
fail_if_cant_handle_hint=True, | ||
fail_if_row_invalid=True, | ||
max_failure_rows=0) | ||
# hurray, we didn't raise! |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
import unittest | ||
from typing import Set | ||
from records_mover.db.redshift.records_unload import redshift_unload_options | ||
from records_mover.records import DelimitedRecordsFormat | ||
from ...records.datetime_cases import ( | ||
DATE_CASES, DATETIMETZ_CASES, DATETIME_CASES, TIMEONLY_CASES | ||
) | ||
|
||
|
||
class TestRecordsUnload(unittest.TestCase): | ||
def test_redshift_unload_options_datetimeformattz(self): | ||
# Redshift offers no options and only unloads YYYY-MM-DD | ||
# HH:MI:SSOF, so we should reject everything else. Double | ||
# check with the docs just in case, though--maybe that's | ||
# changed! | ||
expected_failures = { | ||
'YYYY-MM-DD HH:MI:SS', | ||
'MM/DD/YY HH24:MI', | ||
} | ||
for datetimeformattz in DATETIMETZ_CASES: | ||
hints = { | ||
'datetimeformattz': datetimeformattz, | ||
} | ||
records_format =\ | ||
DelimitedRecordsFormat(variant='bluelabs', | ||
hints=hints) | ||
unhandled_hints = set(records_format.hints.keys()) | ||
try: | ||
redshift_unload_options(unhandled_hints, | ||
records_format, | ||
fail_if_cant_handle_hint=True) | ||
except NotImplementedError: | ||
if datetimeformattz in expected_failures: | ||
continue | ||
else: | ||
raise | ||
self.assertNotIn(datetimeformattz, expected_failures) | ||
|
||
def test_redshift_unload_options_datetimeformat(self): | ||
# Redshift offers no options and only unloads YYYY-MM-DD | ||
# HH:MI:SS, so we should reject everything else. Double | ||
# check with the docs just in case, though--maybe that's | ||
# changed! | ||
expected_failures = { | ||
'YYYY-MM-DD HH12:MI AM', | ||
'MM/DD/YY HH24:MI', | ||
} | ||
for datetimeformat in DATETIME_CASES: | ||
hints = { | ||
'datetimeformat': datetimeformat, | ||
} | ||
records_format =\ | ||
DelimitedRecordsFormat(variant='bluelabs', | ||
hints=hints) | ||
unhandled_hints = set(records_format.hints.keys()) | ||
try: | ||
redshift_unload_options(unhandled_hints, | ||
records_format, | ||
fail_if_cant_handle_hint=True) | ||
except NotImplementedError: | ||
if datetimeformat in expected_failures: | ||
continue | ||
else: | ||
raise | ||
self.assertNotIn(datetimeformat, expected_failures) | ||
|
||
def test_redshift_unload_options_timeonlyformat(self): | ||
# Redshift didn't used to have a time only format, so Records | ||
# Mover doesn't yet support anything here and just treats | ||
# these as strings. Nothing should raise. | ||
expected_failures: Set[str] = set() | ||
for timeonlyformat in TIMEONLY_CASES: | ||
hints = { | ||
'timeonlyformat': timeonlyformat, | ||
} | ||
records_format =\ | ||
DelimitedRecordsFormat(variant='bluelabs', | ||
hints=hints) | ||
unhandled_hints = set(records_format.hints.keys()) | ||
try: | ||
redshift_unload_options(unhandled_hints, | ||
records_format, | ||
fail_if_cant_handle_hint=True) | ||
except NotImplementedError: | ||
if timeonlyformat in expected_failures: | ||
continue | ||
else: | ||
raise | ||
self.assertNotIn(timeonlyformat, expected_failures) | ||
|
||
def test_redshift_unload_options_dateformat(self): | ||
# Redshift offers no options and only unloads YYYY-MM-DD, so | ||
# we should reject everything else. Double check with the | ||
# docs just in case, though--maybe that's changed! | ||
expected_failures = { | ||
'MM-DD-YYYY', | ||
'DD-MM-YYYY', | ||
'MM/DD/YY', | ||
'DD/MM/YY', | ||
'DD-MM-YY', | ||
} | ||
for dateformat in DATE_CASES: | ||
hints = { | ||
'dateformat': dateformat, | ||
} | ||
records_format =\ | ||
DelimitedRecordsFormat(variant='bluelabs', | ||
hints=hints) | ||
unhandled_hints = set(records_format.hints.keys()) | ||
try: | ||
redshift_unload_options(unhandled_hints, | ||
records_format, | ||
fail_if_cant_handle_hint=True) | ||
except NotImplementedError: | ||
if dateformat in expected_failures: | ||
continue | ||
else: | ||
raise | ||
self.assertNotIn(dateformat, expected_failures) |