From 2f68b2c1dbb438ea66ab6edb25c4a98113b4bd69 Mon Sep 17 00:00:00 2001 From: Vince Broz Date: Wed, 9 Dec 2020 17:45:56 -0500 Subject: [PATCH] Improve date/time hint handling for Redshift based on better testing Based on the new testing approach in #144, this PR adds component tests for date/time types while loading/exporting CSV files in Redshift databases. Along the way some places for improvement were found! --- tests/component/db/redshift/__init__.py | 0 .../db/redshift/test_records_copy.py | 163 ++++++++++++++++++ .../db/redshift/test_records_unload.py | 119 +++++++++++++ 3 files changed, 282 insertions(+) create mode 100644 tests/component/db/redshift/__init__.py create mode 100644 tests/component/db/redshift/test_records_copy.py create mode 100644 tests/component/db/redshift/test_records_unload.py diff --git a/tests/component/db/redshift/__init__.py b/tests/component/db/redshift/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/component/db/redshift/test_records_copy.py b/tests/component/db/redshift/test_records_copy.py new file mode 100644 index 000000000..33635cdc0 --- /dev/null +++ b/tests/component/db/redshift/test_records_copy.py @@ -0,0 +1,163 @@ +import unittest +from records_mover.db.redshift.records_copy import redshift_copy_options +from records_mover.records import DelimitedRecordsFormat, AvroRecordsFormat +from sqlalchemy_redshift.commands import Encoding, Format +from ...records.datetime_cases import ( + DATE_CASES, DATETIMETZ_CASES, DATETIME_CASES, TIMEONLY_CASES +) + + +class TestRecordsCopy(unittest.TestCase): + def test_redshift_copy_options_encodings(self): + tests = { + 'UTF16': Encoding.utf16, + 'UTF16LE': Encoding.utf16le, + 'UTF16BE': Encoding.utf16be + } + for hint_spelling, redshift_sqlalchemy_spelling in tests.items(): + + records_format =\ + DelimitedRecordsFormat(variant='bluelabs', + hints={ + 'encoding': hint_spelling + }) + unhandled_hints = set(records_format.hints.keys()) + out = redshift_copy_options(unhandled_hints, + records_format, + fail_if_cant_handle_hint=True, + fail_if_row_invalid=True, + max_failure_rows=0) + self.assertIs(out['encoding'], redshift_sqlalchemy_spelling) + + def test_redshift_copy_options_avro(self): + records_format = AvroRecordsFormat() + unhandled_hints = set() + out = redshift_copy_options(unhandled_hints, + records_format, + fail_if_cant_handle_hint=True, + fail_if_row_invalid=True, + max_failure_rows=0) + self.assertIs(out['format'], Format.avro) + + def test_redshift_copy_options_dateformat(self): + # The records spec's date/time formats are based on Redshift's + # spec originally, so it's expected that everything here would + # be accepted as-is, but please double-check with Redshift's + # docs as new test cases are added + accept_as_is = { + 'YYYY-MM-DD': True, + 'MM-DD-YYYY': True, + 'DD-MM-YYYY': True, + 'MM/DD/YY': True, + 'DD/MM/YY': True, + 'DD-MM-YY': True, + } + for dateformat in DATE_CASES: + records_format =\ + DelimitedRecordsFormat(variant='bluelabs', + hints={ + 'dateformat': dateformat + }) + unhandled_hints = set(records_format.hints.keys()) + out = redshift_copy_options(unhandled_hints, + records_format, + fail_if_cant_handle_hint=True, + fail_if_row_invalid=True, + max_failure_rows=0) + if accept_as_is[dateformat]: + self.assertIs(out['date_format'], dateformat) + else: + self.fail('define what to expect here') + + def test_redshift_copy_options_datetimeformattz(self): + # Redshift's time_format doesn't support separate + # configuration for datetimeformat vs datetimeformattz, but + # the 'auto' flag seems to work with specific things (see + # tests run in records_copy.py). + # + # Please verify new formats have a test run + # and documented in records_copy.py before putting an entry in + # here. + expectations = { + 'YYYY-MM-DD HH:MI:SSOF': 'auto', + 'YYYY-MM-DD HH:MI:SS': 'YYYY-MM-DD HH:MI:SS', + 'YYYY-MM-DD HH24:MI:SSOF': 'auto', + 'MM/DD/YY HH24:MIOF': 'auto', + 'MM/DD/YY HH24:MI': 'MM/DD/YY HH24:MI', + } + for datetimeformattz in DATETIMETZ_CASES: + hints = { + 'datetimeformattz': datetimeformattz, + 'datetimeformat': datetimeformattz.replace('OF', '') + } + records_format =\ + DelimitedRecordsFormat(variant='bluelabs', + hints=hints) + unhandled_hints = set(records_format.hints.keys()) + out = redshift_copy_options(unhandled_hints, + records_format, + fail_if_cant_handle_hint=True, + fail_if_row_invalid=True, + max_failure_rows=0) + self.assertEquals(out['time_format'], expectations[datetimeformattz]) + + def test_redshift_copy_options_datetimeformat(self): + # Redshift's time_format doesn't support separate + # configuration for datetimeformat vs datetimeformattz, but + # the 'auto' flag seems to work with specific things (see + # tests run in records_copy.py). + # + # Please verify new formats have a test run + # and documented in records_copy.py before putting an entry in + # here. + for datetimeformat in DATETIME_CASES: + hints = { + 'datetimeformattz': f"{datetimeformat}OF", + 'datetimeformat': datetimeformat, + } + records_format =\ + DelimitedRecordsFormat(variant='bluelabs', + hints=hints) + unhandled_hints = set(records_format.hints.keys()) + out = redshift_copy_options(unhandled_hints, + records_format, + fail_if_cant_handle_hint=True, + fail_if_row_invalid=True, + max_failure_rows=0) + self.assertEqual(out['time_format'], 'auto') + for datetimeformat in DATETIME_CASES: + hints = { + 'datetimeformattz': datetimeformat, + 'datetimeformat': datetimeformat, + } + records_format =\ + DelimitedRecordsFormat(variant='bluelabs', + hints=hints) + unhandled_hints = set(records_format.hints.keys()) + out = redshift_copy_options(unhandled_hints, + records_format, + fail_if_cant_handle_hint=True, + fail_if_row_invalid=True, + max_failure_rows=0) + self.assertEqual(out['time_format'], datetimeformat) + + def test_redshift_copy_options_timeonlyformat(self): + # Redshift didn't used to have a time only format, so Records + # Mover doesn't yet support anything here and just treats + # these as strings. Nothing should raise. + # + # https://github.com/bluelabsio/records-mover/issues/141 + for timeonlyformat in TIMEONLY_CASES: + hints = { + 'timeonlyformat': timeonlyformat, + } + records_format =\ + DelimitedRecordsFormat(variant='bluelabs', + hints=hints) + unhandled_hints = set(records_format.hints.keys()) + redshift_copy_options(unhandled_hints, + records_format, + fail_if_cant_handle_hint=True, + fail_if_row_invalid=True, + max_failure_rows=0) + # hurray, we didn't raise! diff --git a/tests/component/db/redshift/test_records_unload.py b/tests/component/db/redshift/test_records_unload.py new file mode 100644 index 000000000..2368efa4d --- /dev/null +++ b/tests/component/db/redshift/test_records_unload.py @@ -0,0 +1,119 @@ +import unittest +from typing import Set +from records_mover.db.redshift.records_unload import redshift_unload_options +from records_mover.records import DelimitedRecordsFormat +from ...records.datetime_cases import ( + DATE_CASES, DATETIMETZ_CASES, DATETIME_CASES, TIMEONLY_CASES +) + + +class TestRecordsUnload(unittest.TestCase): + def test_redshift_unload_options_datetimeformattz(self): + # Redshift offers no options and only unloads YYYY-MM-DD + # HH:MI:SSOF, so we should reject everything else. Double + # check with the docs just in case, though--maybe that's + # changed! + expected_failures = { + 'YYYY-MM-DD HH:MI:SS', + 'MM/DD/YY HH24:MI', + } + for datetimeformattz in DATETIMETZ_CASES: + hints = { + 'datetimeformattz': datetimeformattz, + } + records_format =\ + DelimitedRecordsFormat(variant='bluelabs', + hints=hints) + unhandled_hints = set(records_format.hints.keys()) + try: + redshift_unload_options(unhandled_hints, + records_format, + fail_if_cant_handle_hint=True) + except NotImplementedError: + if datetimeformattz in expected_failures: + continue + else: + raise + self.assertNotIn(datetimeformattz, expected_failures) + + def test_redshift_unload_options_datetimeformat(self): + # Redshift offers no options and only unloads YYYY-MM-DD + # HH:MI:SS, so we should reject everything else. Double + # check with the docs just in case, though--maybe that's + # changed! + expected_failures = { + 'YYYY-MM-DD HH12:MI AM', + 'MM/DD/YY HH24:MI', + } + for datetimeformat in DATETIME_CASES: + hints = { + 'datetimeformat': datetimeformat, + } + records_format =\ + DelimitedRecordsFormat(variant='bluelabs', + hints=hints) + unhandled_hints = set(records_format.hints.keys()) + try: + redshift_unload_options(unhandled_hints, + records_format, + fail_if_cant_handle_hint=True) + except NotImplementedError: + if datetimeformat in expected_failures: + continue + else: + raise + self.assertNotIn(datetimeformat, expected_failures) + + def test_redshift_unload_options_timeonlyformat(self): + # Redshift didn't used to have a time only format, so Records + # Mover doesn't yet support anything here and just treats + # these as strings. Nothing should raise. + expected_failures: Set[str] = set() + for timeonlyformat in TIMEONLY_CASES: + hints = { + 'timeonlyformat': timeonlyformat, + } + records_format =\ + DelimitedRecordsFormat(variant='bluelabs', + hints=hints) + unhandled_hints = set(records_format.hints.keys()) + try: + redshift_unload_options(unhandled_hints, + records_format, + fail_if_cant_handle_hint=True) + except NotImplementedError: + if timeonlyformat in expected_failures: + continue + else: + raise + self.assertNotIn(timeonlyformat, expected_failures) + + def test_redshift_unload_options_dateformat(self): + # Redshift offers no options and only unloads YYYY-MM-DD, so + # we should reject everything else. Double check with the + # docs just in case, though--maybe that's changed! + expected_failures = { + 'MM-DD-YYYY', + 'DD-MM-YYYY', + 'MM/DD/YY', + 'DD/MM/YY', + 'DD-MM-YY', + } + for dateformat in DATE_CASES: + hints = { + 'dateformat': dateformat, + } + records_format =\ + DelimitedRecordsFormat(variant='bluelabs', + hints=hints) + unhandled_hints = set(records_format.hints.keys()) + try: + redshift_unload_options(unhandled_hints, + records_format, + fail_if_cant_handle_hint=True) + except NotImplementedError: + if dateformat in expected_failures: + continue + else: + raise + self.assertNotIn(dateformat, expected_failures)