From 5fd5e3c0add5a1f509b7540f9ff1507d987488e5 Mon Sep 17 00:00:00 2001 From: Tim Ryan Date: Mon, 15 May 2023 18:26:47 -0600 Subject: [PATCH] RM-86 allow multiple versions of lineterminator --- .../records/pandas/read_csv_options.py | 7 +- .../records/pandas/to_csv_options.py | 7 +- .../records/test_pandas_read_csv_options.py | 43 ++- .../test_pandas_to_csv_options_bluelabs.py | 40 ++- .../records/test_pandas_to_csv_options_csv.py | 37 +- .../test_pandas_to_csv_options_vertica.py | 34 +- tests/unit/records/targets/test_fileobj.py | 318 ++++++++++++------ 7 files changed, 338 insertions(+), 148 deletions(-) diff --git a/records_mover/records/pandas/read_csv_options.py b/records_mover/records/pandas/read_csv_options.py index 2ee8544ee..66d9ca5f2 100644 --- a/records_mover/records/pandas/read_csv_options.py +++ b/records_mover/records/pandas/read_csv_options.py @@ -6,6 +6,8 @@ from records_mover.records.schema import RecordsSchema import logging from typing import Set, Dict, Any +from packaging import version +import pandas as pd logger = logging.getLogger(__name__) @@ -499,7 +501,10 @@ def day_first(dateish_format: str) -> bool: # Character to break file into lines. Only valid with C parser. # if non_standard_record_terminator: - pandas_options['lineterminator'] = hints.record_terminator + if version.parse(pd.__version__) >= version.parse('1.5.0'): + pandas_options['lineterminator'] = hints.record_terminator + else: + pandas_options['line_terminator'] = hints.record_terminator quiet_remove(unhandled_hints, 'record-terminator') # diff --git a/records_mover/records/pandas/to_csv_options.py b/records_mover/records/pandas/to_csv_options.py index f43063a79..bee11a522 100644 --- a/records_mover/records/pandas/to_csv_options.py +++ b/records_mover/records/pandas/to_csv_options.py @@ -6,6 +6,8 @@ from records_mover.mover_types import _assert_never import logging from typing import Set, Dict +from packaging import version +import pandas as pd logger = logging.getLogger(__name__) @@ -111,7 +113,10 @@ def pandas_to_csv_options(records_format: DelimitedRecordsFormat, pandas_options['sep'] = hints.field_delimiter quiet_remove(unhandled_hints, 'field-delimiter') - pandas_options['lineterminator'] = hints.record_terminator + if version.parse(pd.__version__) >= version.parse('1.5.0'): + pandas_options['lineterminator'] = hints.record_terminator + else: + pandas_options['line_terminator'] = hints.record_terminator quiet_remove(unhandled_hints, 'record-terminator') return pandas_options diff --git a/tests/component/records/test_pandas_read_csv_options.py b/tests/component/records/test_pandas_read_csv_options.py index 47c5e629d..1f4020cd9 100644 --- a/tests/component/records/test_pandas_read_csv_options.py +++ b/tests/component/records/test_pandas_read_csv_options.py @@ -4,6 +4,8 @@ from records_mover.records.processing_instructions import ProcessingInstructions from records_mover.records.records_format import DelimitedRecordsFormat from records_mover.records.schema import RecordsSchema +from packaging import version +import pandas as pd class TestPandasReadCsvOptions(unittest.TestCase): @@ -131,19 +133,34 @@ def test_pandas_read_csv_options_csv(self): def test_pandas_read_csv_options_vertica(self): self.maxDiff = None - expected = { - 'dayfirst': False, - 'compression': None, - 'delimiter': '\x01', - 'doublequote': False, - 'engine': 'c', - 'on_bad_lines': 'error', - 'header': None, - 'lineterminator': '\x02', - 'quotechar': '"', - 'quoting': 3, - 'parse_dates': [0, 1, 2, 3], - } + if version.parse(pd.__version__) >= version.parse('1.5.0'): + expected = { + 'dayfirst': False, + 'compression': None, + 'delimiter': '\x01', + 'doublequote': False, + 'engine': 'c', + 'on_bad_lines': 'error', + 'header': None, + 'lineterminator': '\x02', + 'quotechar': '"', + 'quoting': 3, + 'parse_dates': [0, 1, 2, 3], + } + else: + expected = { + 'dayfirst': False, + 'compression': None, + 'delimiter': '\x01', + 'doublequote': False, + 'engine': 'c', + 'on_bad_lines': 'error', + 'header': None, + 'line_terminator': '\x02', + 'quotechar': '"', + 'quoting': 3, + 'parse_dates': [0, 1, 2, 3], + } processing_instructions = ProcessingInstructions() records_format = DelimitedRecordsFormat(hints=vertica_format_hints) unhandled_hints = set(records_format.hints) diff --git a/tests/component/records/test_pandas_to_csv_options_bluelabs.py b/tests/component/records/test_pandas_to_csv_options_bluelabs.py index cfbd9fb79..be450900d 100644 --- a/tests/component/records/test_pandas_to_csv_options_bluelabs.py +++ b/tests/component/records/test_pandas_to_csv_options_bluelabs.py @@ -3,22 +3,38 @@ from records_mover.records.pandas import pandas_to_csv_options from records_mover.records.processing_instructions import ProcessingInstructions from records_mover.records.records_format import DelimitedRecordsFormat +from packaging import version +import pandas as pd class TestPandasToCsvOptionsBlueLabs(unittest.TestCase): def test_pandas_to_csv_options_bluelabs(self): - expected = { - 'compression': 'gzip', - 'date_format': '%Y-%m-%d %H:%M:%S.%f%z', - 'doublequote': False, - 'encoding': 'UTF8', - 'escapechar': '\\', - 'header': False, - 'lineterminator': '\n', - 'quotechar': '"', - 'quoting': 3, - 'sep': ',', - } + if version.parse(pd.__version__) >= version.parse('1.5.0'): + expected = { + 'compression': 'gzip', + 'date_format': '%Y-%m-%d %H:%M:%S.%f%z', + 'doublequote': False, + 'encoding': 'UTF8', + 'escapechar': '\\', + 'header': False, + 'lineterminator': '\n', + 'quotechar': '"', + 'quoting': 3, + 'sep': ',', + } + else: + expected = { + 'compression': 'gzip', + 'date_format': '%Y-%m-%d %H:%M:%S.%f%z', + 'doublequote': False, + 'encoding': 'UTF8', + 'escapechar': '\\', + 'header': False, + 'line_terminator': '\n', + 'quotechar': '"', + 'quoting': 3, + 'sep': ',', + } processing_instructions = ProcessingInstructions() records_format = DelimitedRecordsFormat(hints=bluelabs_format_hints) unhandled_hints = set(records_format.hints) diff --git a/tests/component/records/test_pandas_to_csv_options_csv.py b/tests/component/records/test_pandas_to_csv_options_csv.py index 5218283f6..9dfc4a3fb 100644 --- a/tests/component/records/test_pandas_to_csv_options_csv.py +++ b/tests/component/records/test_pandas_to_csv_options_csv.py @@ -3,21 +3,36 @@ from records_mover.records.pandas import pandas_to_csv_options from records_mover.records.processing_instructions import ProcessingInstructions from records_mover.records.records_format import DelimitedRecordsFormat +from packaging import version +import pandas as pd class TestPandasToCsvOptionsCsv(unittest.TestCase): def test_pandas_to_csv_options_csv(self): - expected = { - 'compression': 'gzip', - 'date_format': '%m/%d/%y %H:%M', - 'doublequote': True, - 'encoding': 'UTF8', - 'header': True, - 'lineterminator': '\n', - 'quotechar': '"', - 'quoting': 0, - 'sep': ',' - } + if version.parse(pd.__version__) >= version.parse('1.5.0'): + expected = { + 'compression': 'gzip', + 'date_format': '%m/%d/%y %H:%M', + 'doublequote': True, + 'encoding': 'UTF8', + 'header': True, + 'lineterminator': '\n', + 'quotechar': '"', + 'quoting': 0, + 'sep': ',' + } + else: + expected = { + 'compression': 'gzip', + 'date_format': '%m/%d/%y %H:%M', + 'doublequote': True, + 'encoding': 'UTF8', + 'header': True, + 'line_terminator': '\n', + 'quotechar': '"', + 'quoting': 0, + 'sep': ',' + } processing_instructions =\ ProcessingInstructions(fail_if_cant_handle_hint=True) records_format = DelimitedRecordsFormat(hints=csv_format_hints) diff --git a/tests/component/records/test_pandas_to_csv_options_vertica.py b/tests/component/records/test_pandas_to_csv_options_vertica.py index f45cb341d..d18296b05 100644 --- a/tests/component/records/test_pandas_to_csv_options_vertica.py +++ b/tests/component/records/test_pandas_to_csv_options_vertica.py @@ -3,20 +3,34 @@ from records_mover.records.pandas import pandas_to_csv_options from records_mover.records.processing_instructions import ProcessingInstructions from records_mover.records.records_format import DelimitedRecordsFormat +from packaging import version +import pandas as pd class TestPandasToCsvOptionsVertica(unittest.TestCase): def test_pandas_to_csv_options_vertica(self): - expected = { - 'date_format': '%Y-%m-%d %H:%M:%S.%f%z', - 'doublequote': False, - 'encoding': 'UTF8', - 'header': False, - 'lineterminator': '\x02', - 'quotechar': '"', - 'quoting': 3, - 'sep': '\x01', - } + if version.parse(pd.__version__) >= version.parse('1.5.0'): + expected = { + 'date_format': '%Y-%m-%d %H:%M:%S.%f%z', + 'doublequote': False, + 'encoding': 'UTF8', + 'header': False, + 'lineterminator': '\x02', + 'quotechar': '"', + 'quoting': 3, + 'sep': '\x01', + } + else: + expected = { + 'date_format': '%Y-%m-%d %H:%M:%S.%f%z', + 'doublequote': False, + 'encoding': 'UTF8', + 'header': False, + 'line_terminator': '\x02', + 'quotechar': '"', + 'quoting': 3, + 'sep': '\x01', + } processing_instructions = ProcessingInstructions() records_format = DelimitedRecordsFormat(hints=vertica_format_hints) unhandled_hints = set(records_format.hints) diff --git a/tests/unit/records/targets/test_fileobj.py b/tests/unit/records/targets/test_fileobj.py index 4961dd9d8..57b6ad395 100644 --- a/tests/unit/records/targets/test_fileobj.py +++ b/tests/unit/records/targets/test_fileobj.py @@ -3,6 +3,8 @@ from records_mover.records.results import MoveResult from records_mover.records.records_format import DelimitedRecordsFormat from mock import patch, Mock, ANY +from packaging import version +import pandas as pd class TestFileobjTarget(unittest.TestCase): @@ -34,30 +36,58 @@ def test_move_from_dataframe_uncompressed_no_header_row(self, out = fileobj_target.move_from_dataframes_source(mock_dfs_source, mock_processing_instructions) mock_text_fileobj = mock_io.TextIOWrapper.return_value - mock_df_1.to_csv.assert_called_with(index=mock_dfs_source.include_index, - path_or_buf=mock_text_fileobj, - mode="a", - date_format='%Y-%m-%d %H:%M:%S.%f%z', - doublequote=False, - encoding='UTF8', - escapechar='\\', - header=False, - lineterminator='\n', - quotechar='"', - quoting=1, - sep=',') - mock_df_2.to_csv.assert_called_with(index=mock_dfs_source.include_index, - path_or_buf=mock_text_fileobj, - mode="a", - date_format='%Y-%m-%d %H:%M:%S.%f%z', - doublequote=False, - encoding='UTF8', - escapechar='\\', - header=False, - lineterminator='\n', - quotechar='"', - quoting=1, - sep=',') + if version.parse(pd.__version__) >= version.parse('1.5.0'): + mock_df_1.to_csv.assert_called_with(index=mock_dfs_source.include_index, + path_or_buf=mock_text_fileobj, + mode="a", + date_format='%Y-%m-%d %H:%M:%S.%f%z', + doublequote=False, + encoding='UTF8', + escapechar='\\', + header=False, + lineterminator='\n', + quotechar='"', + quoting=1, + sep=',') + else: + mock_df_1.to_csv.assert_called_with(index=mock_dfs_source.include_index, + path_or_buf=mock_text_fileobj, + mode="a", + date_format='%Y-%m-%d %H:%M:%S.%f%z', + doublequote=False, + encoding='UTF8', + escapechar='\\', + header=False, + line_terminator='\n', + quotechar='"', + quoting=1, + sep=',') + if version.parse(pd.__version__) >= version.parse('1.5.0'): + mock_df_2.to_csv.assert_called_with(index=mock_dfs_source.include_index, + path_or_buf=mock_text_fileobj, + mode="a", + date_format='%Y-%m-%d %H:%M:%S.%f%z', + doublequote=False, + encoding='UTF8', + escapechar='\\', + header=False, + lineterminator='\n', + quotechar='"', + quoting=1, + sep=',') + else: + mock_df_2.to_csv.assert_called_with(index=mock_dfs_source.include_index, + path_or_buf=mock_text_fileobj, + mode="a", + date_format='%Y-%m-%d %H:%M:%S.%f%z', + doublequote=False, + encoding='UTF8', + escapechar='\\', + header=False, + line_terminator='\n', + quotechar='"', + quoting=1, + sep=',') self.assertEqual(out, MoveResult(move_count=2, output_urls=None)) @patch('records_mover.records.pandas.prep_df_for_csv_output') @@ -88,30 +118,58 @@ def test_move_from_dataframe_uncompressed_with_header_row(self, out = fileobj_target.move_from_dataframes_source(mock_dfs_source, mock_processing_instructions) mock_text_fileobj = mock_io.TextIOWrapper.return_value - mock_df_1.to_csv.assert_called_with(index=mock_dfs_source.include_index, - path_or_buf=mock_text_fileobj, - mode="a", - date_format='%Y-%m-%d %H:%M:%S.%f%z', - doublequote=False, - encoding='UTF8', - escapechar='\\', - header=True, - lineterminator='\n', - quotechar='"', - quoting=1, - sep=',') - mock_df_2.to_csv.assert_called_with(index=mock_dfs_source.include_index, - path_or_buf=mock_text_fileobj, - mode="a", - date_format='%Y-%m-%d %H:%M:%S.%f%z', - doublequote=False, - encoding='UTF8', - escapechar='\\', - header=False, - lineterminator='\n', - quotechar='"', - quoting=1, - sep=',') + if version.parse(pd.__version__) >= version.parse('1.5.0'): + mock_df_1.to_csv.assert_called_with(index=mock_dfs_source.include_index, + path_or_buf=mock_text_fileobj, + mode="a", + date_format='%Y-%m-%d %H:%M:%S.%f%z', + doublequote=False, + encoding='UTF8', + escapechar='\\', + header=True, + lineterminator='\n', + quotechar='"', + quoting=1, + sep=',') + else: + mock_df_1.to_csv.assert_called_with(index=mock_dfs_source.include_index, + path_or_buf=mock_text_fileobj, + mode="a", + date_format='%Y-%m-%d %H:%M:%S.%f%z', + doublequote=False, + encoding='UTF8', + escapechar='\\', + header=True, + line_terminator='\n', + quotechar='"', + quoting=1, + sep=',') + if version.parse(pd.__version__) >= version.parse('1.5.0'): + mock_df_2.to_csv.assert_called_with(index=mock_dfs_source.include_index, + path_or_buf=mock_text_fileobj, + mode="a", + date_format='%Y-%m-%d %H:%M:%S.%f%z', + doublequote=False, + encoding='UTF8', + escapechar='\\', + header=False, + lineterminator='\n', + quotechar='"', + quoting=1, + sep=',') + else: + mock_df_2.to_csv.assert_called_with(index=mock_dfs_source.include_index, + path_or_buf=mock_text_fileobj, + mode="a", + date_format='%Y-%m-%d %H:%M:%S.%f%z', + doublequote=False, + encoding='UTF8', + escapechar='\\', + header=False, + line_terminator='\n', + quotechar='"', + quoting=1, + sep=',') self.assertEqual(out, MoveResult(move_count=2, output_urls=None)) @patch('records_mover.records.pandas.prep_df_for_csv_output') @@ -141,32 +199,62 @@ def test_move_from_dataframe_compressed_no_header_row(self, mock_prep_df_for_csv_output.side_effect = [mock_df_1, mock_df_2] out = fileobj_target.move_from_dataframes_source(mock_dfs_source, mock_processing_instructions) - mock_df_1.to_csv.assert_called_with(path_or_buf=ANY, - index=mock_dfs_source.include_index, - mode="a", - compression='gzip', - date_format='%Y-%m-%d %H:%M:%S.%f%z', - doublequote=False, - encoding='UTF8', - escapechar='\\', - header=False, - lineterminator='\n', - quotechar='"', - quoting=1, - sep=',') - mock_df_2.to_csv.assert_called_with(path_or_buf=ANY, - index=mock_dfs_source.include_index, - mode="a", - compression='gzip', - date_format='%Y-%m-%d %H:%M:%S.%f%z', - doublequote=False, - encoding='UTF8', - escapechar='\\', - header=False, - lineterminator='\n', - quotechar='"', - quoting=1, - sep=',') + if version.parse(pd.__version__) >= version.parse('1.5.0'): + mock_df_1.to_csv.assert_called_with(path_or_buf=ANY, + index=mock_dfs_source.include_index, + mode="a", + compression='gzip', + date_format='%Y-%m-%d %H:%M:%S.%f%z', + doublequote=False, + encoding='UTF8', + escapechar='\\', + header=False, + lineterminator='\n', + quotechar='"', + quoting=1, + sep=',') + else: + mock_df_1.to_csv.assert_called_with(path_or_buf=ANY, + index=mock_dfs_source.include_index, + mode="a", + compression='gzip', + date_format='%Y-%m-%d %H:%M:%S.%f%z', + doublequote=False, + encoding='UTF8', + escapechar='\\', + header=False, + line_terminator='\n', + quotechar='"', + quoting=1, + sep=',') + if version.parse(pd.__version__) >= version.parse('1.5.0'): + mock_df_2.to_csv.assert_called_with(path_or_buf=ANY, + index=mock_dfs_source.include_index, + mode="a", + compression='gzip', + date_format='%Y-%m-%d %H:%M:%S.%f%z', + doublequote=False, + encoding='UTF8', + escapechar='\\', + header=False, + lineterminator='\n', + quotechar='"', + quoting=1, + sep=',') + else: + mock_df_2.to_csv.assert_called_with(path_or_buf=ANY, + index=mock_dfs_source.include_index, + mode="a", + compression='gzip', + date_format='%Y-%m-%d %H:%M:%S.%f%z', + doublequote=False, + encoding='UTF8', + escapechar='\\', + header=False, + line_terminator='\n', + quotechar='"', + quoting=1, + sep=',') self.assertEqual(out, MoveResult(move_count=2, output_urls=None)) @patch('records_mover.records.pandas.prep_df_for_csv_output') @@ -196,30 +284,60 @@ def test_move_from_dataframe_compressed_with_header_row(self, mock_prep_df_for_csv_output.side_effect = [mock_df_1, mock_df_2] out = fileobj_target.move_from_dataframes_source(mock_dfs_source, mock_processing_instructions) - mock_df_1.to_csv.assert_called_with(path_or_buf=ANY, - index=mock_dfs_source.include_index, - mode="a", - compression='gzip', - date_format='%Y-%m-%d %H:%M:%S.%f%z', - doublequote=False, - encoding='UTF8', - escapechar='\\', - header=True, - lineterminator='\n', - quotechar='"', - quoting=1, - sep=',') - mock_df_2.to_csv.assert_called_with(path_or_buf=ANY, - index=mock_dfs_source.include_index, - mode="a", - compression='gzip', - date_format='%Y-%m-%d %H:%M:%S.%f%z', - doublequote=False, - encoding='UTF8', - escapechar='\\', - header=False, - lineterminator='\n', - quotechar='"', - quoting=1, - sep=',') + if version.parse(pd.__version__) >= version.parse('1.5.0'): + mock_df_1.to_csv.assert_called_with(path_or_buf=ANY, + index=mock_dfs_source.include_index, + mode="a", + compression='gzip', + date_format='%Y-%m-%d %H:%M:%S.%f%z', + doublequote=False, + encoding='UTF8', + escapechar='\\', + header=True, + lineterminator='\n', + quotechar='"', + quoting=1, + sep=',') + else: + mock_df_1.to_csv.assert_called_with(path_or_buf=ANY, + index=mock_dfs_source.include_index, + mode="a", + compression='gzip', + date_format='%Y-%m-%d %H:%M:%S.%f%z', + doublequote=False, + encoding='UTF8', + escapechar='\\', + header=True, + line_terminator='\n', + quotechar='"', + quoting=1, + sep=',') + if version.parse(pd.__version__) >= version.parse('1.5.0'): + mock_df_2.to_csv.assert_called_with(path_or_buf=ANY, + index=mock_dfs_source.include_index, + mode="a", + compression='gzip', + date_format='%Y-%m-%d %H:%M:%S.%f%z', + doublequote=False, + encoding='UTF8', + escapechar='\\', + header=False, + lineterminator='\n', + quotechar='"', + quoting=1, + sep=',') + else: + mock_df_2.to_csv.assert_called_with(path_or_buf=ANY, + index=mock_dfs_source.include_index, + mode="a", + compression='gzip', + date_format='%Y-%m-%d %H:%M:%S.%f%z', + doublequote=False, + encoding='UTF8', + escapechar='\\', + header=False, + line_terminator='\n', + quotechar='"', + quoting=1, + sep=',') self.assertEqual(out, MoveResult(move_count=2, output_urls=None))