Skip to content

Commit

Permalink
Improve date/time hint handling for Redshift based on better testing
Browse files Browse the repository at this point in the history
Based on the new testing approach in #144, this PR adds component
tests for date/time types while loading/exporting CSV files in Redshift
databases.

Along the way some places for improvement were found!
  • Loading branch information
vinceatbluelabs committed Dec 9, 2020
1 parent 080b970 commit 2f68b2c
Show file tree
Hide file tree
Showing 3 changed files with 282 additions and 0 deletions.
Empty file.
163 changes: 163 additions & 0 deletions tests/component/db/redshift/test_records_copy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
import unittest
from records_mover.db.redshift.records_copy import redshift_copy_options
from records_mover.records import DelimitedRecordsFormat, AvroRecordsFormat
from sqlalchemy_redshift.commands import Encoding, Format
from ...records.datetime_cases import (
DATE_CASES, DATETIMETZ_CASES, DATETIME_CASES, TIMEONLY_CASES
)


class TestRecordsCopy(unittest.TestCase):
def test_redshift_copy_options_encodings(self):
tests = {
'UTF16': Encoding.utf16,
'UTF16LE': Encoding.utf16le,
'UTF16BE': Encoding.utf16be
}
for hint_spelling, redshift_sqlalchemy_spelling in tests.items():

records_format =\
DelimitedRecordsFormat(variant='bluelabs',
hints={
'encoding': hint_spelling
})
unhandled_hints = set(records_format.hints.keys())
out = redshift_copy_options(unhandled_hints,
records_format,
fail_if_cant_handle_hint=True,
fail_if_row_invalid=True,
max_failure_rows=0)
self.assertIs(out['encoding'], redshift_sqlalchemy_spelling)

def test_redshift_copy_options_avro(self):
records_format = AvroRecordsFormat()
unhandled_hints = set()
out = redshift_copy_options(unhandled_hints,
records_format,
fail_if_cant_handle_hint=True,
fail_if_row_invalid=True,
max_failure_rows=0)
self.assertIs(out['format'], Format.avro)

def test_redshift_copy_options_dateformat(self):
# The records spec's date/time formats are based on Redshift's
# spec originally, so it's expected that everything here would
# be accepted as-is, but please double-check with Redshift's
# docs as new test cases are added
accept_as_is = {
'YYYY-MM-DD': True,
'MM-DD-YYYY': True,
'DD-MM-YYYY': True,
'MM/DD/YY': True,
'DD/MM/YY': True,
'DD-MM-YY': True,
}
for dateformat in DATE_CASES:
records_format =\
DelimitedRecordsFormat(variant='bluelabs',
hints={
'dateformat': dateformat
})
unhandled_hints = set(records_format.hints.keys())
out = redshift_copy_options(unhandled_hints,
records_format,
fail_if_cant_handle_hint=True,
fail_if_row_invalid=True,
max_failure_rows=0)
if accept_as_is[dateformat]:
self.assertIs(out['date_format'], dateformat)
else:
self.fail('define what to expect here')

def test_redshift_copy_options_datetimeformattz(self):
# Redshift's time_format doesn't support separate
# configuration for datetimeformat vs datetimeformattz, but
# the 'auto' flag seems to work with specific things (see
# tests run in records_copy.py).
#
# Please verify new formats have a test run
# and documented in records_copy.py before putting an entry in
# here.
expectations = {
'YYYY-MM-DD HH:MI:SSOF': 'auto',
'YYYY-MM-DD HH:MI:SS': 'YYYY-MM-DD HH:MI:SS',
'YYYY-MM-DD HH24:MI:SSOF': 'auto',
'MM/DD/YY HH24:MIOF': 'auto',
'MM/DD/YY HH24:MI': 'MM/DD/YY HH24:MI',
}
for datetimeformattz in DATETIMETZ_CASES:
hints = {
'datetimeformattz': datetimeformattz,
'datetimeformat': datetimeformattz.replace('OF', '')
}
records_format =\
DelimitedRecordsFormat(variant='bluelabs',
hints=hints)
unhandled_hints = set(records_format.hints.keys())
out = redshift_copy_options(unhandled_hints,
records_format,
fail_if_cant_handle_hint=True,
fail_if_row_invalid=True,
max_failure_rows=0)
self.assertEquals(out['time_format'], expectations[datetimeformattz])

def test_redshift_copy_options_datetimeformat(self):
# Redshift's time_format doesn't support separate
# configuration for datetimeformat vs datetimeformattz, but
# the 'auto' flag seems to work with specific things (see
# tests run in records_copy.py).
#
# Please verify new formats have a test run
# and documented in records_copy.py before putting an entry in
# here.
for datetimeformat in DATETIME_CASES:
hints = {
'datetimeformattz': f"{datetimeformat}OF",
'datetimeformat': datetimeformat,
}
records_format =\
DelimitedRecordsFormat(variant='bluelabs',
hints=hints)
unhandled_hints = set(records_format.hints.keys())
out = redshift_copy_options(unhandled_hints,
records_format,
fail_if_cant_handle_hint=True,
fail_if_row_invalid=True,
max_failure_rows=0)
self.assertEqual(out['time_format'], 'auto')
for datetimeformat in DATETIME_CASES:
hints = {
'datetimeformattz': datetimeformat,
'datetimeformat': datetimeformat,
}
records_format =\
DelimitedRecordsFormat(variant='bluelabs',
hints=hints)
unhandled_hints = set(records_format.hints.keys())
out = redshift_copy_options(unhandled_hints,
records_format,
fail_if_cant_handle_hint=True,
fail_if_row_invalid=True,
max_failure_rows=0)
self.assertEqual(out['time_format'], datetimeformat)

def test_redshift_copy_options_timeonlyformat(self):
# Redshift didn't used to have a time only format, so Records
# Mover doesn't yet support anything here and just treats
# these as strings. Nothing should raise.
#
# https://github.com/bluelabsio/records-mover/issues/141
for timeonlyformat in TIMEONLY_CASES:
hints = {
'timeonlyformat': timeonlyformat,
}
records_format =\
DelimitedRecordsFormat(variant='bluelabs',
hints=hints)
unhandled_hints = set(records_format.hints.keys())
redshift_copy_options(unhandled_hints,
records_format,
fail_if_cant_handle_hint=True,
fail_if_row_invalid=True,
max_failure_rows=0)
# hurray, we didn't raise!
119 changes: 119 additions & 0 deletions tests/component/db/redshift/test_records_unload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import unittest
from typing import Set
from records_mover.db.redshift.records_unload import redshift_unload_options
from records_mover.records import DelimitedRecordsFormat
from ...records.datetime_cases import (
DATE_CASES, DATETIMETZ_CASES, DATETIME_CASES, TIMEONLY_CASES
)


class TestRecordsUnload(unittest.TestCase):
def test_redshift_unload_options_datetimeformattz(self):
# Redshift offers no options and only unloads YYYY-MM-DD
# HH:MI:SSOF, so we should reject everything else. Double
# check with the docs just in case, though--maybe that's
# changed!
expected_failures = {
'YYYY-MM-DD HH:MI:SS',
'MM/DD/YY HH24:MI',
}
for datetimeformattz in DATETIMETZ_CASES:
hints = {
'datetimeformattz': datetimeformattz,
}
records_format =\
DelimitedRecordsFormat(variant='bluelabs',
hints=hints)
unhandled_hints = set(records_format.hints.keys())
try:
redshift_unload_options(unhandled_hints,
records_format,
fail_if_cant_handle_hint=True)
except NotImplementedError:
if datetimeformattz in expected_failures:
continue
else:
raise
self.assertNotIn(datetimeformattz, expected_failures)

def test_redshift_unload_options_datetimeformat(self):
# Redshift offers no options and only unloads YYYY-MM-DD
# HH:MI:SS, so we should reject everything else. Double
# check with the docs just in case, though--maybe that's
# changed!
expected_failures = {
'YYYY-MM-DD HH12:MI AM',
'MM/DD/YY HH24:MI',
}
for datetimeformat in DATETIME_CASES:
hints = {
'datetimeformat': datetimeformat,
}
records_format =\
DelimitedRecordsFormat(variant='bluelabs',
hints=hints)
unhandled_hints = set(records_format.hints.keys())
try:
redshift_unload_options(unhandled_hints,
records_format,
fail_if_cant_handle_hint=True)
except NotImplementedError:
if datetimeformat in expected_failures:
continue
else:
raise
self.assertNotIn(datetimeformat, expected_failures)

def test_redshift_unload_options_timeonlyformat(self):
# Redshift didn't used to have a time only format, so Records
# Mover doesn't yet support anything here and just treats
# these as strings. Nothing should raise.
expected_failures: Set[str] = set()
for timeonlyformat in TIMEONLY_CASES:
hints = {
'timeonlyformat': timeonlyformat,
}
records_format =\
DelimitedRecordsFormat(variant='bluelabs',
hints=hints)
unhandled_hints = set(records_format.hints.keys())
try:
redshift_unload_options(unhandled_hints,
records_format,
fail_if_cant_handle_hint=True)
except NotImplementedError:
if timeonlyformat in expected_failures:
continue
else:
raise
self.assertNotIn(timeonlyformat, expected_failures)

def test_redshift_unload_options_dateformat(self):
# Redshift offers no options and only unloads YYYY-MM-DD, so
# we should reject everything else. Double check with the
# docs just in case, though--maybe that's changed!
expected_failures = {
'MM-DD-YYYY',
'DD-MM-YYYY',
'MM/DD/YY',
'DD/MM/YY',
'DD-MM-YY',
}
for dateformat in DATE_CASES:
hints = {
'dateformat': dateformat,
}
records_format =\
DelimitedRecordsFormat(variant='bluelabs',
hints=hints)
unhandled_hints = set(records_format.hints.keys())
try:
redshift_unload_options(unhandled_hints,
records_format,
fail_if_cant_handle_hint=True)
except NotImplementedError:
if dateformat in expected_failures:
continue
else:
raise
self.assertNotIn(dateformat, expected_failures)

0 comments on commit 2f68b2c

Please sign in to comment.