bluelabsio · vinceatbluelabs · Dec 10, 2020 · Dec 9, 2020 · Dec 9, 2020 · Dec 9, 2020
diff --git a/metrics/coverage_high_water_mark b/metrics/coverage_high_water_mark
@@ -1 +1 @@
-93.3500
+93.4300
diff --git a/metrics/mypy_high_water_mark b/metrics/mypy_high_water_mark
@@ -1 +1 @@
-92.5900
+92.6000
diff --git a/records_mover/records/delimited/conversions.py b/records_mover/records/delimited/conversions.py
@@ -1,8 +1,7 @@
 from .types import HintCompression, HintEncoding, HintDateFormat, HintTimeOnlyFormat, HintQuoting
 import logging
 import csv
-from typing import Optional, Dict, Union
-from typing_extensions import Literal
+from typing import Optional, Dict
 
 
 logger = logging.getLogger(__name__)
@@ -39,16 +38,19 @@
 # account for MM/DD time.
 #
 # https://github.com/bluelabsio/records-mover/issues/75
-#
-python_date_format_from_hints: Dict[Union[HintDateFormat, Literal['DD/MM/YY']], str] = {
+python_date_format_from_hints: Dict[HintDateFormat, str] = {
+    'DD-MM-YYYY': '%d-%m-%Y',
+    'MM-DD-YYYY': '%m-%d-%Y',
     'YYYY-MM-DD': '%Y-%m-%d',
-    'MM/DD/YY': '%m/%d/%Y',
-    'DD/MM/YY': '%d/%m/%Y',
+    'MM/DD/YY': '%m/%d/%y',
+    'DD/MM/YY': '%d/%m/%y',
+    'DD-MM-YY': '%d-%m-%y',
 }
 
 python_time_format_from_hints: Dict[HintTimeOnlyFormat, str] = {
     'HH24:MI:SS': '%H:%M:%S',
-    'HH12:MI AM': '%I:%M:%S %p',
+    'HH:MI:SS': '%H:%M:%S',
+    'HH12:MI AM': '%I:%M %p',
 }
 
 hint_encoding_from_pandas: Dict[str, HintEncoding] = {

diff --git a/records_mover/records/pandas/to_csv_options.py b/records_mover/records/pandas/to_csv_options.py
@@ -63,28 +63,23 @@ def pandas_to_csv_options(records_format: DelimitedRecordsFormat,
     pandas_options['header'] = hints.header_row
     quiet_remove(unhandled_hints, 'header-row')
 
+    # Note the limitation on Pandas export with BigQuery around
+    # datetimeformattz:
+    #
+    # https://github.com/bluelabsio/records-mover/issues/95
+
+    # The current datetimeformat/datetimeformattz hint support in
+    # to_csv_options.py is limited to ISO format driven by the
+    # dateformat hint.
+    #
+    # This could be generalized to a smarter function which translates
+    # from the Records Spec language into the
+    # Python/Pandas/strftime format, and rejects fewer hints as a
+    # result.
+    #
+    # https://github.com/bluelabsio/records-mover/issues/143
     if hints.dateformat is None:
         if hints.datetimeformattz == hints.datetimeformat:
-            # BigQuery requires that timezone offsets have a colon;
-            # Python (and thus Pandas) doesn't support adding the
-            # colon with strftime.  However, we can specify things
-            # without a timezone delimiter just fine.
-            #
-            # Unfortunately Python/Pandas will drop the timezone info
-            # instead of converting the timestamp to UTC.  This
-            # corrupts the time, as BigQuery assumes what it gets in
-            # is UTC format.  Boo.
-            #
-            # $ python3
-            # >>> import pytz
-            # >>> us_eastern = pytz.timezone('US/Eastern')
-            # >>> import datetime
-            # >>> us_eastern.localize(datetime.datetime(2000, 1, 2, 12, 34, 56, 789012))
-            #        .strftime('%Y-%m-%d %H:%M:%S.%f')
-            # '2000-01-02 12:34:56.789012'
-            # >>>
-            #
-            # https://github.com/bluelabsio/records-mover/issues/95
             pandas_options['date_format'] = '%Y-%m-%d %H:%M:%S.%f'
         else:
             pandas_options['date_format'] = '%Y-%m-%d %H:%M:%S.%f%z'
@@ -108,14 +103,22 @@ def pandas_to_csv_options(records_format: DelimitedRecordsFormat,
             pandas_options['date_format'] = '%m/%d/%y %H:%M:%S.%f'
         else:
             pandas_options['date_format'] = '%m/%d/%y %H:%M:%S.%f%z'
+    elif hints.dateformat == 'DD/MM/YY':
+        if hints.datetimeformattz == hints.datetimeformat:
+            pandas_options['date_format'] = '%d/%m/%y %H:%M:%S.%f'
+        else:
+            pandas_options['date_format'] = '%d/%m/%y %H:%M:%S.%f%z'
+    elif hints.dateformat == 'DD-MM-YY':
+        if hints.datetimeformattz == hints.datetimeformat:
+            pandas_options['date_format'] = '%d-%m-%y %H:%M:%S.%f'
+        else:
+            pandas_options['date_format'] = '%d-%m-%y %H:%M:%S.%f%z'
     else:
         cant_handle_hint(fail_if_cant_handle_hint, 'dateformat', hints)
     quiet_remove(unhandled_hints, 'dateformat')
 
-    # pandas can't seem to export a date and time together :(
-    #
-    # might be nice someday to only emit the errors if the actual data
-    # being moved is affected by whatever limitation...
+    # It might be nice someday to only emit the errors if the actual
+    # data being moved is affected by whatever limitation...
     if (hints.datetimeformattz not in (f"{hints.dateformat} HH24:MI:SSOF",
                                        f"{hints.dateformat} HH:MI:SSOF",
                                        f"{hints.dateformat} HH24:MI:SS",
@@ -137,7 +140,7 @@ def pandas_to_csv_options(records_format: DelimitedRecordsFormat,
         cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformat', hints)
     quiet_remove(unhandled_hints, 'datetimeformat')
 
-    if hints.timeonlyformat != 'HH24:MI:SS':
+    if hints.timeonlyformat not in ['HH24:MI:SS', 'HH:MI:SS']:
         cant_handle_hint(fail_if_cant_handle_hint, 'timeonlyformat', hints)
     quiet_remove(unhandled_hints, 'timeonlyformat')
 

diff --git a/tests/component/records/pandas/test_prep_for_csv.py b/tests/component/records/pandas/test_prep_for_csv.py
@@ -4,6 +4,10 @@
 from records_mover.records.pandas import prep_df_for_csv_output
 from records_mover.records.schema import RecordsSchema
 from records_mover.records import DelimitedRecordsFormat, ProcessingInstructions
+from ..datetime_cases import (
+    DATE_CASES, DATETIMETZ_CASES, DATETIME_CASES, TIMEONLY_CASES,
+    create_sample, SAMPLE_YEAR, SAMPLE_MONTH, SAMPLE_DAY, SAMPLE_HOUR, SAMPLE_MINUTE, SAMPLE_SECOND
+)
 
 
 class TestPrepForCsv(unittest.TestCase):
@@ -120,3 +124,164 @@ def test_prep_df_for_csv_output_include_index(self):
         self.assertEqual(new_df['time'][0], '12:33:53')
         # self.assertEqual(new_df['timetz'][0], '12:33:53-05')
         self.assertIsNotNone(new_df)
+
+    def test_dateformat(self):
+        """Date output should equal create_sample() for each DATE_CASES hint."""
+        schema_data = {
+            'schema': "bltypes/v1",
+            'fields': {
+                "date": {
+                    "type": "date",
+                    "index": 1,
+                },
+            }
+        }
+        records_schema = RecordsSchema.from_data(schema_data)
+        processing_instructions = ProcessingInstructions()
+        for dateformat in DATE_CASES:
+            records_format = DelimitedRecordsFormat(variant='bluelabs',
+                                                    hints={
+                                                        'dateformat': dateformat
+                                                    })
+            # Only the date parts are supplied, so no time-of-day is set.
+            data = {
+                'date': [
+                    pd.Timestamp(year=SAMPLE_YEAR, month=SAMPLE_MONTH, day=SAMPLE_DAY)
+                ],
+            }
+            df = pd.DataFrame(data, columns=['date'])
+
+            new_df = prep_df_for_csv_output(df=df,
+                                            include_index=False,
+                                            records_schema=records_schema,
+                                            records_format=records_format,
+                                            processing_instructions=processing_instructions)
+            self.assertEqual(new_df['date'][0],
+                             create_sample(dateformat))
+            self.assertIsNotNone(new_df)
+
+    def test_datetimeformattz(self):
+        """datetimetz values pass through unchanged for each DATETIMETZ_CASES hint."""
+        schema_data = {
+            'schema': "bltypes/v1",
+            'fields': {
+                "datetimetz": {
+                    "type": "datetimetz",
+                    "index": 1,
+                },
+            }
+        }
+        records_schema = RecordsSchema.from_data(schema_data)
+        processing_instructions = ProcessingInstructions()
+        for datetimeformattz in DATETIMETZ_CASES:
+            records_format = DelimitedRecordsFormat(variant='bluelabs',
+                                                    hints={
+                                                        'datetimeformattz': datetimeformattz
+                                                    })
+            # NOTE(review): sample carries no timezone despite the datetimetz type.
+            timestamp = pd.Timestamp(year=SAMPLE_YEAR, month=SAMPLE_MONTH, day=SAMPLE_DAY,
+                                     hour=SAMPLE_HOUR, minute=SAMPLE_MINUTE,
+                                     second=SAMPLE_SECOND)
+
+            data = {
+                'datetimetz': [
+                    timestamp
+                ],
+            }
+            df = pd.DataFrame(data, columns=['datetimetz'])
+
+            new_df = prep_df_for_csv_output(df=df,
+                                            include_index=False,
+                                            records_schema=records_schema,
+                                            records_format=records_format,
+                                            processing_instructions=processing_instructions)
+            # No conversion is done of datetimetz as pandas' CSV outputter
+            # handles it properly, so we expect the original value back; the
+            # create_sample() result is passed as the failure message.
+            self.assertEqual(new_df['datetimetz'][0],
+                             timestamp,
+                             create_sample(datetimeformattz))
+            self.assertIsNotNone(new_df)
+
+    def test_datetimeformat(self):
+        """datetime values pass through unchanged for each DATETIME_CASES hint."""
+        schema_data = {
+            'schema': "bltypes/v1",
+            'fields': {
+                "datetime": {
+                    "type": "datetime",
+                    "index": 1,
+                },
+            }
+        }
+        records_schema = RecordsSchema.from_data(schema_data)
+        processing_instructions = ProcessingInstructions()
+        for datetimeformat in DATETIME_CASES:
+            records_format = DelimitedRecordsFormat(variant='bluelabs',
+                                                    hints={
+                                                        'datetimeformat': datetimeformat
+                                                    })
+            # Sample is a plain Timestamp with no timezone attached.
+            timestamp = pd.Timestamp(year=SAMPLE_YEAR, month=SAMPLE_MONTH, day=SAMPLE_DAY,
+                                     hour=SAMPLE_HOUR, minute=SAMPLE_MINUTE,
+                                     second=SAMPLE_SECOND)
+
+            data = {
+                'datetime': [
+                    timestamp
+                ],
+            }
+            df = pd.DataFrame(data, columns=['datetime'])
+
+            new_df = prep_df_for_csv_output(df=df,
+                                            include_index=False,
+                                            records_schema=records_schema,
+                                            records_format=records_format,
+                                            processing_instructions=processing_instructions)
+            # No conversion is done of datetime as pandas' CSV outputter
+            # handles it properly, so we expect the original value back; the
+            # create_sample() result is passed as the failure message.
+            self.assertEqual(new_df['datetime'][0],
+                             timestamp,
+                             create_sample(datetimeformat))
+            self.assertIsNotNone(new_df)
+
+    def test_timeonlyformat(self):
+        """Time output should equal create_sample() for each TIMEONLY_CASES hint."""
+        schema_data = {
+            'schema': "bltypes/v1",
+            'fields': {
+                "time": {
+                    "type": "time",
+                    "index": 1,
+                },
+            }
+        }
+        records_schema = RecordsSchema.from_data(schema_data)
+        processing_instructions = ProcessingInstructions()
+        for timeonlyformat in TIMEONLY_CASES:
+            records_format = DelimitedRecordsFormat(variant='bluelabs',
+                                                    hints={
+                                                        'timeonlyformat': timeonlyformat
+                                                    })
+            # A full Timestamp is supplied even though the schema type is time.
+            timestamp = pd.Timestamp(year=SAMPLE_YEAR, month=SAMPLE_MONTH, day=SAMPLE_DAY,
+                                     hour=SAMPLE_HOUR, minute=SAMPLE_MINUTE,
+                                     second=SAMPLE_SECOND)
+
+            data = {
+                'time': [
+                    timestamp
+                ],
+            }
+            df = pd.DataFrame(data, columns=['time'])
+
+            new_df = prep_df_for_csv_output(df=df,
+                                            include_index=False,
+                                            records_schema=records_schema,
+                                            records_format=records_format,
+                                            processing_instructions=processing_instructions)
+            self.assertEqual(new_df['time'][0],
+                             create_sample(timeonlyformat),
+                             timeonlyformat)
+            self.assertIsNotNone(new_df)