diff --git a/metrics/coverage_high_water_mark b/metrics/coverage_high_water_mark
index 2be28763a..7a830d688 100644
--- a/metrics/coverage_high_water_mark
+++ b/metrics/coverage_high_water_mark
@@ -1 +1 @@
-93.9600
+93.9900
\ No newline at end of file
diff --git a/metrics/mypy_high_water_mark b/metrics/mypy_high_water_mark
index 917098e6a..ef267bff6 100644
--- a/metrics/mypy_high_water_mark
+++ b/metrics/mypy_high_water_mark
@@ -1 +1 @@
-91.8200
\ No newline at end of file
+92.1000
\ No newline at end of file
diff --git a/records_mover/records/delimited/conversions.py b/records_mover/records/delimited/conversions.py
index 0afc06246..0d9d7245b 100644
--- a/records_mover/records/delimited/conversions.py
+++ b/records_mover/records/delimited/conversions.py
@@ -71,12 +71,3 @@
     # even if the only data it saw was in ASCII, let's be ready to see more
     'ascii': 'UTF8',
 }
-
-hint_compression_from_pandas: Dict[Optional[str], HintCompression] = {
-    # https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
-    # https://github.com/bluelabsio/knowledge/
-    # blob/master/Engineering/Architecture/JobDataExchange/output-design.md#hints
-    'gzip': 'GZIP',
-    'bz2': 'BZIP',
-    None: None,
-}
diff --git a/records_mover/records/delimited/sniff.py b/records_mover/records/delimited/sniff.py
index 0b3c6a194..7f8dda3e8 100644
--- a/records_mover/records/delimited/sniff.py
+++ b/records_mover/records/delimited/sniff.py
@@ -1,45 +1,195 @@
 import chardet
-from .types import RecordsHints, BootstrappingRecordsHints, HintEncoding
-from .csv_streamer import stream_csv
-from .conversions import python_encoding_from_hint
+from contextlib import contextmanager
+from . import RecordsHints, BootstrappingRecordsHints
+from .csv_streamer import stream_csv, python_encoding_from_hint
 import io
+import csv
+import gzip
+import bz2
+from .types import HintEncoding, HintRecordTerminator, HintQuoting, HintCompression
+from .conversions import hint_encoding_from_chardet
+from typing import List, IO, Optional, Iterator, NoReturn, Dict
+from records_mover.utils.rewound_fileobj import rewound_fileobj
 import logging
-from .types import MutableRecordsHints
-from .conversions import (
-    hint_compression_from_pandas,
-    hint_encoding_from_pandas,
-    hint_encoding_from_chardet
-)
-from typing import List, IO, Optional, TYPE_CHECKING
-if TYPE_CHECKING:
-    from pandas.io.parsers import TextFileReader
 
 logger = logging.getLogger(__name__)
 
+HINT_INFERENCE_SAMPLING_SIZE_BYTES = 1024
 
-def csv_hints_from_reader(reader: 'TextFileReader') -> RecordsHints:
-    header = reader._engine.header
-    quotechar = reader._engine.data.dialect.quotechar
-    delimiter = reader._engine.data.dialect.delimiter
-    escape_char = reader._engine.data.dialect.escapechar
-    compression = reader._engine.compression
-    encoding = reader._engine.encoding
-    doublequote = reader._engine.doublequote
-
-    return {
-        'header-row': True if header is not None else False,
-        'field-delimiter': delimiter,
-        'compression': hint_compression_from_pandas[compression],
-        'quotechar': quotechar,
-        'doublequote': doublequote,
-        'escape': escape_char,
-        'encoding': hint_encoding_from_pandas.get(encoding, encoding),
-        'dateformat': 'YYYY-MM-DD',
-        'timeonlyformat': 'HH12:MI AM',
-        'datetimeformat': 'YYYY-MM-DD HH:MI:SS',
-        'datetimeformattz': 'YYYY-MM-DD HH:MI:SSOF',
-    }
+
+# mypy way of validating we're covering all cases of an enum
+#
+# https://github.com/python/mypy/issues/6366#issuecomment-560369716
+def _assert_never(x: NoReturn) -> NoReturn:
+    assert False, "Unhandled type: {}".format(type(x).__name__)
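The _assert_never trick gives the new compression handling compile-time exhaustiveness checking: once every member of a Literal union has been handled, mypy narrows the variable to NoReturn and the call typechecks; if a new member is added later, the call stops typechecking. A minimal sketch of the pattern, assuming HintCompression is the Literal union defined in types.py (describe() is a hypothetical caller, not part of this diff):

    from typing import Literal, NoReturn

    HintCompression = Literal['GZIP', 'BZIP', 'LZO', None]

    def _assert_never(x: NoReturn) -> NoReturn:
        assert False, "Unhandled type: {}".format(type(x).__name__)

    def describe(compression: HintCompression) -> str:
        if compression is None:
            return 'uncompressed'
        elif compression == 'GZIP':
            return 'gzip'
        elif compression == 'BZIP':
            return 'bzip2'
        elif compression == 'LZO':
            return 'lzo'
        else:
            # If a new literal is ever added to HintCompression, mypy
            # flags this call, since the argument can no longer be
            # narrowed to NoReturn.
            return _assert_never(compression)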
type: {}".format(type(x).__name__) + + +@contextmanager +def rewound_decompressed_fileobj(fileobj: IO[bytes], + compression: HintCompression) -> Iterator[IO[bytes]]: + with rewound_fileobj(fileobj) as fileobj_after_rewind: + if compression is None: + yield fileobj + elif compression == 'GZIP': + yield gzip.GzipFile(mode='rb', fileobj=fileobj_after_rewind) # type: ignore + elif compression == 'LZO': + # This might be useful to implement this: + # https://github.com/ir193/python-lzo/blob/master/lzo.py#L44 + raise NotImplementedError('Records mover does not currently know how ' + 'to decompress LZO files for inspection') + elif compression == 'BZIP': + yield bz2.BZ2File(mode='rb', filename=fileobj_after_rewind) + else: + _assert_never(compression) + + +def infer_newline_format(fileobj: IO[bytes], + encoding_hint: HintEncoding, + compression: HintCompression) ->\ + Optional[HintRecordTerminator]: + with rewound_decompressed_fileobj(fileobj, compression) as fileobj: + python_encoding = python_encoding_from_hint[encoding_hint] + text_fileobj = io.TextIOWrapper(fileobj, encoding=python_encoding) + try: + if text_fileobj.newlines is None: # ...and it almost certainly will be... + text_fileobj.readline() # read enough to know newline format + # https://www.python.org/dev/peps/pep-0278/ + if text_fileobj.newlines is not None: + logger.info(f"Inferred record terminator as {repr(text_fileobj.newlines)}") + return str(text_fileobj.newlines) + else: + logger.warning("Python could not determine newline format of file.") + return None + finally: + text_fileobj.detach() + + +def sniff_encoding_hint(fileobj: IO[bytes]) -> Optional[HintEncoding]: + with rewound_fileobj(fileobj) as fileobj: + detector = chardet.UniversalDetector() + while True: + chunk = fileobj.read(HINT_INFERENCE_SAMPLING_SIZE_BYTES) + detector.feed(chunk) + if detector.done or len(chunk) < HINT_INFERENCE_SAMPLING_SIZE_BYTES: + break + detector.close() + assert detector.result is not None + if 'encoding' in detector.result: + chardet_encoding = detector.result['encoding'] + if chardet_encoding in hint_encoding_from_chardet: + return hint_encoding_from_chardet[chardet_encoding] + else: + logger.warning("Got unrecognized encoding from chardet " + f"sniffing: {detector.result}") + return None + else: + logger.warning(f"Unable to sniff file encoding using chardet: {detector.result}") + return None + + +def csv_hints_from_python(fileobj: IO[bytes], + record_terminator_hint: Optional[HintRecordTerminator], + encoding_hint: HintEncoding, + compression: HintCompression) -> RecordsHints: + # https://docs.python.org/3/library/csv.html#csv.Sniffer + with rewound_decompressed_fileobj(fileobj, + compression) as fileobj: + # Sniffer tries to determine quotechar, doublequote, + # delimiter, skipinitialspace. does not try to determine + # lineterminator. 
+
+
+def sniff_encoding_hint(fileobj: IO[bytes]) -> Optional[HintEncoding]:
+    with rewound_fileobj(fileobj) as fileobj:
+        detector = chardet.UniversalDetector()
+        while True:
+            chunk = fileobj.read(HINT_INFERENCE_SAMPLING_SIZE_BYTES)
+            detector.feed(chunk)
+            if detector.done or len(chunk) < HINT_INFERENCE_SAMPLING_SIZE_BYTES:
+                break
+        detector.close()
+        assert detector.result is not None
+        if 'encoding' in detector.result:
+            chardet_encoding = detector.result['encoding']
+            if chardet_encoding in hint_encoding_from_chardet:
+                return hint_encoding_from_chardet[chardet_encoding]
+            else:
+                logger.warning("Got unrecognized encoding from chardet "
+                               f"sniffing: {detector.result}")
+                return None
+        else:
+            logger.warning(f"Unable to sniff file encoding using chardet: {detector.result}")
+            return None
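chardet's UniversalDetector is the incremental API used above: you feed it chunks until it reports done (or you run out of sample), then close it and read result. A small sketch of the same loop against an in-memory file (illustrative only; the confidence value will vary):

    import io
    import chardet

    data = 'déjà vu\n'.encode('utf-8') * 200
    fileobj = io.BytesIO(data)

    detector = chardet.UniversalDetector()
    while True:
        chunk = fileobj.read(1024)
        detector.feed(chunk)
        if detector.done or len(chunk) < 1024:
            break
    detector.close()
    print(detector.result)  # e.g. {'encoding': 'utf-8', 'confidence': 0.99, ...}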
+
+
+def csv_hints_from_python(fileobj: IO[bytes],
+                          record_terminator_hint: Optional[HintRecordTerminator],
+                          encoding_hint: HintEncoding,
+                          compression: HintCompression) -> RecordsHints:
+    # https://docs.python.org/3/library/csv.html#csv.Sniffer
+    with rewound_decompressed_fileobj(fileobj,
+                                      compression) as fileobj:
+        # Sniffer tries to determine quotechar, doublequote,
+        # delimiter, skipinitialspace.  It does not try to determine
+        # lineterminator.
+        # https://github.com/python/cpython/blob/master/Lib/csv.py#L165
+        python_encoding = python_encoding_from_hint[encoding_hint]
+        #
+        # TextIOWrapper can only handle standard newline types:
+        #
+        # https://docs.python.org/3/library/io.html#io.TextIOWrapper
+        #
+        if record_terminator_hint not in [None, '\n', '\r', '\r\n']:
+            logger.info("Unable to sniff file with non-standard newline "
+                        f"{repr(record_terminator_hint)} using Python csv.Sniffer.")
+            return {}
+        text_fileobj = io.TextIOWrapper(fileobj,
+                                        encoding=python_encoding,
+                                        newline=record_terminator_hint)
+        try:
+            sniffer = csv.Sniffer()
+            sample = text_fileobj.read(HINT_INFERENCE_SAMPLING_SIZE_BYTES)
+            #
+            # The CSV sniffer's has_header() method seems to only
+            # cope with DOS and UNIX newlines, not Mac.  So let's give it
+            # UNIX newlines if we know enough to translate, since
+            # we're not using it to sniff newline format anyway.
+            #
+            if record_terminator_hint is not None and record_terminator_hint != '\n':
+                sample_with_unix_newlines = sample.replace(record_terminator_hint, '\n')
+            else:
+                sample_with_unix_newlines = sample
+            dialect = sniffer.sniff(sample_with_unix_newlines)
+            header_row = sniffer.has_header(sample_with_unix_newlines)
+            out: RecordsHints = {
+                'doublequote': dialect.doublequote,
+                'field-delimiter': dialect.delimiter,
+                'quotechar': dialect.quotechar,
+                'header-row': header_row,
+            }
+            logger.info(f"Python csv.Dialect sniffed: {out}")
+            return out
+        except csv.Error as e:
+            if str(e) == 'Could not determine delimiter':
+                logger.info(f"Error from csv.Sniffer--potential single-field file: {str(e)}")
+                return {}
+            else:
+                logger.warning(f"Unexpected error from csv.Sniffer: {str(e)}")
+                raise
+        finally:
+            text_fileobj.detach()
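csv.Sniffer, as used above, infers a Dialect from a text sample and can separately guess whether the first row is a header. A standalone example of the two calls (the sample data is made up for illustration):

    import csv

    sample = 'name,age\nalice,34\nbob,29\n'
    sniffer = csv.Sniffer()
    dialect = sniffer.sniff(sample)
    print(dialect.delimiter)           # ','
    print(dialect.quotechar)           # '"'
    print(sniffer.has_header(sample))  # typically True here: labels over numbers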
+
+
+def csv_hints_from_pandas(fileobj: IO[bytes],
+                          streaming_hints: BootstrappingRecordsHints) -> RecordsHints:
+    import pandas
+
+    def attempt_parse(quoting: HintQuoting) -> RecordsHints:
+        with rewound_fileobj(fileobj) as fresh_fileobj:
+            current_hints = streaming_hints.copy()
+            current_hints['quoting'] = quoting
+            logger.info(f"Attempting to parse with quoting: {quoting}")
+            with stream_csv(fresh_fileobj, current_hints):
+                return {
+                    'quoting': quoting
+                }
+
+    if 'quoting' in streaming_hints:
+        return attempt_parse(streaming_hints['quoting'])
+    else:
+        # Pandas seems to parse quoting=minimal files just fine when
+        # you pass in quoting=all, making this technique useless to
+        # distinguish between minimal/nonnumeric/all, so we'll only
+        # try None and minimal here.
+        try:
+            return attempt_parse(quoting='minimal')
+        except (pandas.errors.ParserError, pandas.errors.EmptyDataError):
+            return attempt_parse(quoting=None)
+
+
+def sniff_compression_hint(fileobj: IO[bytes]) -> HintCompression:
+    logger.debug("Sniffing compression hint")
+    with rewound_fileobj(fileobj) as fileobj_rewound:
+        # https://stackoverflow.com/a/13044946/9795956
+        magic_dict: Dict[bytes, HintCompression] = {
+            b"\x1f\x8b\x08": "GZIP",
+            b"\x42\x5a\x68": "BZIP",
+            # b"\x50\x4b\x03\x04": "zip"
+        }
+
+        max_len = max(len(x) for x in magic_dict)
+
+        file_start = fileobj_rewound.read(max_len)
+        for magic, filetype in magic_dict.items():
+            if file_start.startswith(magic):
+                return filetype
+        return None
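The magic numbers in sniff_compression_hint() are the standard file signatures: gzip streams begin with the bytes 1f 8b 08 and bzip2 streams with the ASCII "BZh". This is easy to confirm from the standard library (a quick illustration, not part of the diff):

    import bz2
    import gzip

    assert gzip.compress(b'hello')[:3] == b'\x1f\x8b\x08'
    assert bz2.compress(b'hello')[:3] == b'\x42\x5a\x68'  # b'BZh'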
 
 
 def sniff_hints_from_fileobjs(fileobjs: List[IO[bytes]],
@@ -49,95 +199,89 @@ def sniff_hints_from_fileobjs(fileobjs: List[IO[bytes]],
         raise NotImplementedError('Cannot currently sniff hints from multiple '
                                   'files--please provide hints')
     fileobj = fileobjs[0]
-    if not fileobj.seekable():
-        raise NotImplementedError('Cannot currently sniff hints from a pure stream--'
-                                  'please save file to disk and load from there or '
-                                  'provide explicit records format information')
     hints = sniff_hints(fileobj, initial_hints=initial_hints)
-    fileobj.seek(0)
     return hints
 
 
-def infer_newline_format(fileobj: IO[bytes],
-                         inferred_hints: MutableRecordsHints,
-                         encoding_hint: str) -> None:
-    closed = False
-    if getattr(fileobj, 'closed', None) is not None:
-        closed = fileobj.closed
-    if closed or not fileobj.seekable():
-        logger.warning("Assuming UNIX newline format, as stream is not rewindable")
-        return
-    python_encoding = python_encoding_from_hint[encoding_hint]  # type: ignore
-    original_position = fileobj.tell()
-    fileobj.seek(0)
-    text_fileobj = io.TextIOWrapper(fileobj, encoding=python_encoding)
-    if text_fileobj.newlines is None:  # ...and it almost certainly will be...
-        text_fileobj.readline()  # read enough to know newline format
-    # https://www.python.org/dev/peps/pep-0278/
-    if text_fileobj.newlines is not None:
-        inferred_hints['record-terminator'] = str(text_fileobj.newlines)
-        logger.info(f"Inferred record terminator as {repr(text_fileobj.newlines)}")
-    else:
-        logger.warning("Python could not determine newline format of file.")
-    text_fileobj.detach()
-    fileobj.seek(original_position)
-
-
-def other_inferred_csv_hints(fileobj: IO[bytes],
-                             encoding_hint: str) -> RecordsHints:
-    inferred_hints: MutableRecordsHints = {}
-    infer_newline_format(fileobj, inferred_hints, encoding_hint)
-    return inferred_hints
-
+def sniff_hints(fileobj: IO[bytes],
+                initial_hints: BootstrappingRecordsHints) -> RecordsHints:
+    # Major limitations:
+    #
+    # * If fileobj isn't rewindable, we can't sniff, as we'd leave the
+    #   caller unable to use the stream afterwards.
+    # * We can't sniff from LZO files yet.
+    # * No detection of 'escape' or date/time format hints.
+    # * Only limited detection of the 'quoting' hint.
+    try:
+        #
+        # We'll need to determine compression and encoding to be able
+        # to convert from bytes to characters and figure out the rest:
+        #
+        if 'compression' not in initial_hints:
+            compression_hint = sniff_compression_hint(fileobj)
+        else:
+            compression_hint = initial_hints['compression']
+        if 'encoding' not in initial_hints:
+            encoding_hint = sniff_encoding_hint(fileobj)
+        else:
+            encoding_hint = initial_hints['encoding']
+        # If guessing was inconclusive, default to UTF8
+        final_encoding_hint: HintEncoding = (encoding_hint or 'UTF8')
 
-def sniff_encoding_hint(fileobj: IO[bytes]) -> Optional[HintEncoding]:
-    if getattr(fileobj, 'closed', None) is not None:
-        closed = fileobj.closed
-    if closed or not fileobj.seekable():
-        logger.warning("Could not use chardet to detect encoding, as stream is not rewindable")
-        return None
-    original_position = fileobj.tell()
-    fileobj.seek(0)
-    detector = chardet.UniversalDetector()
-    while True:
-        chunksize = 512
-        chunk = fileobj.read(chunksize)
-        detector.feed(chunk)
-        if detector.done or len(chunk) < chunksize:
-            break
-    detector.close()
-    fileobj.seek(original_position)
-    assert detector.result is not None
-    if 'encoding' in detector.result:
-        chardet_encoding = detector.result['encoding']
-        if chardet_encoding in hint_encoding_from_chardet:
-            return hint_encoding_from_chardet[chardet_encoding]
+        #
+        # Now we can figure out what type of newlines are used in the
+        # file...
+        #
+        record_terminator_hint: Optional[HintRecordTerminator] = None
+        if 'record-terminator' in initial_hints:
+            record_terminator_hint = initial_hints['record-terminator']
         else:
-            logger.warning(f"Got unrecognized encoding from chardet sniffing: {detector.result}")
-            return None
-    else:
-        logger.warning(f"Unable to sniff file encoding using chardet: {detector.result}")
-        return None
+            record_terminator_hint = infer_newline_format(fileobj,
+                                                          final_encoding_hint,
+                                                          compression_hint)
+        #
+        # Now we know enough to study each line of the file; Python's
+        # csv.Sniffer() can teach us some things about how each field
+        # is encoded and whether there's a header:
+        #
+        other_inferred_csv_hints = {}
+        if record_terminator_hint is not None:
+            other_inferred_csv_hints['record-terminator'] = record_terminator_hint
+        python_inferred_hints = csv_hints_from_python(fileobj,
+                                                      record_terminator_hint,
+                                                      final_encoding_hint,
+                                                      compression_hint)
 
-def sniff_hints(fileobj: IO[bytes],
-                initial_hints: BootstrappingRecordsHints) -> RecordsHints:
-    if 'encoding' not in initial_hints:
-        encoding_hint = sniff_encoding_hint(fileobj)
-    else:
-        encoding_hint = initial_hints['encoding']
-
-    streaming_hints = initial_hints.copy()
-    if encoding_hint is not None:
-        streaming_hints['encoding'] = encoding_hint
-    with stream_csv(fileobj, streaming_hints) as reader:
-        # overwrite hints from reader with user-specified values, as
-        # the reader isn't smart enough to remember things like which
-        # quoting setting it was told to use...
-        pandas_inferred_hints = csv_hints_from_reader(reader)
-        final_encoding_hint: str = (encoding_hint or  # type: ignore
-                                    pandas_inferred_hints['encoding'])
-        return {**pandas_inferred_hints,
-                'encoding': final_encoding_hint,
-                **other_inferred_csv_hints(fileobj, final_encoding_hint),
-                **initial_hints}  # type: ignore
+        #
+        # Pandas can both validate that we chose correctly by parsing
+        # the file using what we have so far, and give us a crude shot
+        # at finding a working 'quoting' hint:
+        #
+        streaming_hints = initial_hints.copy()
+        streaming_hints['compression'] = compression_hint
+        if encoding_hint is not None:
+            streaming_hints['encoding'] = encoding_hint
+        streaming_hints.update(python_inferred_hints)  # type: ignore
+        pandas_inferred_hints = csv_hints_from_pandas(fileobj, streaming_hints)
+
+        #
+        # Let's combine these together and present back a refined
+        # version of the initial hints:
+        #
+        out = {
+            'compression': compression_hint,
+            **pandas_inferred_hints,  # type: ignore
+            **python_inferred_hints,  # type: ignore
+            'encoding': final_encoding_hint,
+            **other_inferred_csv_hints,  # type: ignore
+            **initial_hints  # type: ignore
+        }
+        logger.info(f"Inferred hints from combined sources: {out}")
+        return out
+    except OSError:
+        logger.warning("Could not sniff hints, as stream is not rewindable")
+        return {}
+    except NotImplementedError as e:
+        logger.warning(f"Could not sniff hints due to current limitations in records mover: {e}")
+        return {}
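Taken together, the new entry point can be exercised end to end on an in-memory file. A rough usage sketch, assuming the module path shown in this diff (the exact hints returned depend on the sniffers):

    import io

    from records_mover.records.delimited.sniff import sniff_hints

    fileobj = io.BytesIO(b'num,name\r\n1,alice\r\n2,bob\r\n')
    hints = sniff_hints(fileobj, initial_hints={})
    # Expected, roughly: compression None, encoding 'UTF8',
    # record-terminator '\r\n', field-delimiter ',', header-row True
    print(hints)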
diff --git a/records_mover/records/delimited/types.py b/records_mover/records/delimited/types.py
index 7aa549d43..d3da60ec4 100644
--- a/records_mover/records/delimited/types.py
+++ b/records_mover/records/delimited/types.py
@@ -53,6 +53,7 @@
         'encoding': HintEncoding,
         'escape': HintEscape,
         'compression': HintCompression,
+        'record-terminator': HintRecordTerminator,
     },
     total=False)
diff --git a/records_mover/utils/rewound_fileobj.py b/records_mover/utils/rewound_fileobj.py
new file mode 100644
index 000000000..451a0f385
--- /dev/null
+++ b/records_mover/utils/rewound_fileobj.py
@@ -0,0 +1,25 @@
+from contextlib import contextmanager
+from typing import IO, Iterator
+import logging
+
+
+logger = logging.getLogger(__name__)
+
+
+@contextmanager
+def rewound_fileobj(fileobj: IO[bytes]) -> Iterator[IO[bytes]]:
+    if getattr(fileobj, 'closed', None) is not None:
+        closed = fileobj.closed
+        if closed:
+            logger.warning("Stream already closed")
+            raise OSError('Stream is already closed')
+    if not fileobj.seekable():
+        # OSError is what is thrown when you call .seek() on a
+        # non-rewindable stream.
+        raise OSError('Stream is not rewindable')
+    original_position = fileobj.tell()
+    fileobj.seek(0)
+    try:
+        yield fileobj
+    finally:
+        fileobj.seek(original_position)
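The context manager's contract is easiest to see in isolation: it rewinds the stream for the duration of the block and restores the caller's position afterwards. A small demonstration (not part of the diff):

    import io

    from records_mover.utils.rewound_fileobj import rewound_fileobj

    fileobj = io.BytesIO(b'abcdef')
    fileobj.seek(4)                   # caller is mid-stream
    with rewound_fileobj(fileobj) as f:
        assert f.read(3) == b'abc'    # the block sees the file from the start
    assert fileobj.tell() == 4        # caller's position is restored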
diff --git a/tests/unit/records/delimited/__init__.py b/tests/unit/records/delimited/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/unit/records/delimited/test_sniff.py b/tests/unit/records/delimited/test_sniff.py
new file mode 100644
index 000000000..56b7fc667
--- /dev/null
+++ b/tests/unit/records/delimited/test_sniff.py
@@ -0,0 +1,38 @@
+from records_mover.records.delimited.sniff import (
+    rewound_fileobj, infer_newline_format, sniff_encoding_hint
+)
+from mock import Mock, patch
+import unittest
+
+
+class TestSniff(unittest.TestCase):
+    def test_rewound_fileobj_already_closed(self):
+        mock_fileobj = Mock(name='fileobj')
+        mock_fileobj.closed = True
+        with self.assertRaises(OSError):
+            with rewound_fileobj(mock_fileobj):
+                pass
+
+    @patch('records_mover.records.delimited.sniff.io')
+    def test_infer_newline_format_cant_infer(self,
+                                             mock_io):
+        mock_fileobj = Mock(name='fileobj')
+        mock_fileobj.closed = False
+        mock_encoding_hint = 'UTF8'
+        mock_compression = None
+        mock_text_fileobj = mock_io.TextIOWrapper.return_value
+        mock_text_fileobj.newlines = None
+        out = infer_newline_format(mock_fileobj,
+                                   mock_encoding_hint,
+                                   mock_compression)
+        mock_text_fileobj.readline.assert_called()
+        self.assertIsNone(out)
+
+    @patch('records_mover.records.delimited.sniff.chardet')
+    def test_sniff_encoding_hint_no_result(self,
+                                           mock_chardet):
+        mock_fileobj = Mock(name='fileobj')
+        mock_fileobj.closed = False
+        mock_chardet.UniversalDetector.return_value.result = {}
+        out = sniff_encoding_hint(mock_fileobj)
+        self.assertIsNone(out)
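A note on the @patch targets in these tests: a mock must replace the name where the module under test looks it up, which is why they patch records_mover.records.delimited.sniff.io rather than the io module itself. The minimal pattern, using the stdlib equivalent of the mock package:

    from unittest.mock import patch

    # Patch 'io' as seen from the sniff module, not the stdlib import path:
    with patch('records_mover.records.delimited.sniff.io') as mock_io:
        mock_io.TextIOWrapper.return_value.newlines = '\n'
        # ...exercise code in sniff.py that calls io.TextIOWrapper...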
diff --git a/tests/unit/records/test_hints.py b/tests/unit/records/test_hints.py
index cba54d06a..728c61250 100644
--- a/tests/unit/records/test_hints.py
+++ b/tests/unit/records/test_hints.py
@@ -3,6 +3,8 @@
 )
 from mock import MagicMock, patch
 import io
+import gzip
+import bz2
 import unittest
 import json
 import os
@@ -11,19 +13,22 @@
 class TestHints(unittest.TestCase):
     maxDiff = None
 
-    def test_sniff_hints(self):
-        resources_dir = os.path.dirname(os.path.abspath(__file__)) + '/../resources'
-        hint_sniffing_dir = f"{resources_dir}/hint_sniffing"
+    def setUp(self):
+        self.resources_dir = os.path.dirname(os.path.abspath(__file__)) + '/../resources'
+        self.hint_sniffing_dir = f"{self.resources_dir}/hint_sniffing"
 
-        test_cases = [
+    def sample_file_basenames(self):
+        return [
             os.path.splitext(os.path.basename(f))[0]
-            for f in os.listdir(hint_sniffing_dir)
-            if (os.path.isfile(os.path.join(hint_sniffing_dir, f)) and
+            for f in os.listdir(self.hint_sniffing_dir)
+            if (os.path.isfile(os.path.join(self.hint_sniffing_dir, f)) and
                 os.path.splitext(f)[1] == '.csv')
         ]
-        for basename in test_cases:
-            csv_filename = f'{hint_sniffing_dir}/{basename}.csv'
-            config_filename = f'{hint_sniffing_dir}/{basename}.json'
+
+    def test_sniff_hints(self):
+        for basename in self.sample_file_basenames():
+            csv_filename = f'{self.hint_sniffing_dir}/{basename}.csv'
+            config_filename = f'{self.hint_sniffing_dir}/{basename}.json'
             with open(config_filename, 'r') as config_fileobj:
                 config = json.load(config_fileobj)
             required_hints = config['required']
@@ -35,11 +40,87 @@ def test_sniff_hints(self):
                             f"Expected at least these hints while reading {basename}: "
                             f"{required_hints}, found these hints: {hints}")
 
+    def test_sniff_hints_gzipped_preinformed(self):
+        for basename in self.sample_file_basenames():
+            csv_filename = f'{self.hint_sniffing_dir}/{basename}.csv'
+            config_filename = f'{self.hint_sniffing_dir}/{basename}.json'
+            with open(config_filename, 'r') as config_fileobj:
+                config = json.load(config_fileobj)
+            required_hints = config['required']
+            initial_hints = config['initial_hints']
+            required_hints['compression'] = 'GZIP'
+            initial_hints['compression'] = 'GZIP'
+
+            with open(csv_filename, 'rb') as uncompressed_fileobj:
+                gzipped_data = gzip.compress(uncompressed_fileobj.read())
+            fileobj = io.BytesIO(gzipped_data)
+            hints = sniff_hints(fileobj, initial_hints=initial_hints)
+            self.assertTrue(set(required_hints.items()).issubset(set(hints.items())),
+                            f"Expected at least these hints while reading {basename}: "
+                            f"{required_hints}, found these hints: {hints}")
+
+    def test_sniff_hints_gzipped_sniffed(self):
+        for basename in self.sample_file_basenames():
+            csv_filename = f'{self.hint_sniffing_dir}/{basename}.csv'
+            config_filename = f'{self.hint_sniffing_dir}/{basename}.json'
+            with open(config_filename, 'r') as config_fileobj:
+                config = json.load(config_fileobj)
+            required_hints = config['required']
+            initial_hints = config['initial_hints']
+            required_hints['compression'] = 'GZIP'
+
+            with open(csv_filename, 'rb') as uncompressed_fileobj:
+                gzipped_data = gzip.compress(uncompressed_fileobj.read())
+            fileobj = io.BytesIO(gzipped_data)
+            hints = sniff_hints(fileobj, initial_hints=initial_hints)
+            self.assertTrue(set(required_hints.items()).issubset(set(hints.items())),
+                            f"Expected at least these hints while reading {basename}: "
+                            f"{required_hints}, found these hints: {hints}")
+
+    def test_sniff_hints_bzipped_preinformed(self):
+        for basename in self.sample_file_basenames():
+            csv_filename = f'{self.hint_sniffing_dir}/{basename}.csv'
+            config_filename = f'{self.hint_sniffing_dir}/{basename}.json'
+            with open(config_filename, 'r') as config_fileobj:
+                config = json.load(config_fileobj)
+            required_hints = config['required']
+            initial_hints = config['initial_hints']
+            required_hints['compression'] = 'BZIP'
+            initial_hints['compression'] = 'BZIP'
+
+            with open(csv_filename, 'rb') as uncompressed_fileobj:
+                bzipped_data = bz2.compress(uncompressed_fileobj.read())
+            fileobj = io.BytesIO(bzipped_data)
+            hints = sniff_hints(fileobj, initial_hints=initial_hints)
+            self.assertTrue(set(required_hints.items()).issubset(set(hints.items())),
+                            f"Expected at least these hints while reading {basename}: "
+                            f"{required_hints}, found these hints: {hints}")
+
+    def test_sniff_hints_bzipped_sniffed(self):
+        for basename in self.sample_file_basenames():
+            csv_filename = f'{self.hint_sniffing_dir}/{basename}.csv'
+            config_filename = f'{self.hint_sniffing_dir}/{basename}.json'
+            with open(config_filename, 'r') as config_fileobj:
+                config = json.load(config_fileobj)
+            required_hints = config['required']
+            initial_hints = config['initial_hints']
+            required_hints['compression'] = 'BZIP'
+
+            with open(csv_filename, 'rb') as uncompressed_fileobj:
+                bzipped_data = bz2.compress(uncompressed_fileobj.read())
+            fileobj = io.BytesIO(bzipped_data)
+            hints = sniff_hints(fileobj, initial_hints=initial_hints)
+            self.assertTrue(set(required_hints.items()).issubset(set(hints.items())),
+                            f"Expected at least these hints while reading {basename}: "
+                            f"{required_hints}, found these hints: {hints}")
+
+    @patch('records_mover.records.delimited.sniff.csv')
     @patch('records_mover.records.delimited.sniff.stream_csv')
     @patch('records_mover.records.delimited.sniff.io')
     def test_sniff_hints_from_fileobjs(self,
                                        mock_io,
-                                       mock_stream_csv) -> None:
+                                       mock_stream_csv,
+                                       mock_csv) -> None:
         mock_fileobj = MagicMock(name='fileobj')
         mock_fileobj.closed = False
         mock_fileobjs = [mock_fileobj]
@@ -47,23 +128,20 @@ def test_sniff_hints_from_fileobjs(self,
             'field-delimiter': ','
         }
         mock_streaming_engine = mock_stream_csv.return_value.__enter__.return_value._engine
+        mock_io.TextIOWrapper.return_value.newlines = '\n'
         mock_streaming_engine.compression = 'gzip'
         mock_streaming_engine.encoding = 'utf-8'
         out = sniff_hints_from_fileobjs(fileobjs=mock_fileobjs,
                                         initial_hints=mock_initial_hints)
         self.assertEqual(out, {
             'compression': 'GZIP',
-            'dateformat': 'YYYY-MM-DD',
-            'datetimeformat': 'YYYY-MM-DD HH:MI:SS',
-            'datetimeformattz': 'YYYY-MM-DD HH:MI:SSOF',
-            'doublequote': mock_streaming_engine.doublequote,
+            'doublequote': mock_csv.Sniffer().sniff().doublequote,
             'encoding': 'UTF8',
-            'escape': mock_streaming_engine.data.dialect.escapechar,
+            'quotechar': mock_csv.Sniffer().sniff().quotechar,
+            'quoting': 'minimal',
             'field-delimiter': ',',
-            'header-row': True,
-            'quotechar': mock_streaming_engine.data.dialect.quotechar,
+            'header-row': mock_csv.Sniffer().has_header(),
             'record-terminator': str(mock_io.TextIOWrapper.return_value.newlines),
-            'timeonlyformat': 'HH12:MI AM'
         })
 
     def test_sniff_hints_from_fileobjs_nonseekable(self):
@@ -71,8 +149,8 @@ def test_sniff_hints_from_fileobjs_nonseekable(self):
         csv_bytes = csv.encode('utf-8', errors='replace')
         with io.BytesIO(csv_bytes) as fileobj:
             fileobj.seekable = lambda: False
-            out = sniff_encoding_hint(fileobj=fileobj)
-            self.assertIsNone(out)
+            with self.assertRaises(OSError):
+                sniff_encoding_hint(fileobj=fileobj)
 
     def test_sniff_hints_from_fileobjs_encodings(self):
         expected_hint = {
@@ -107,4 +185,5 @@ def test_sniff_hints_from_fileobjs_encodings(self):
             'header-row': True,
             'record-terminator': '\n'
         }
-        self.assertTrue(set(needed_settings.items()).issubset(set(out.items())))
+        self.assertTrue(set(needed_settings.items()).issubset(set(out.items())),
+                        f"Needed at least {needed_settings}, got {out}")
diff --git a/tests/unit/resources/hint_sniffing/delimited-bluelabs-no-header-pandas-utc.json b/tests/unit/resources/hint_sniffing/delimited-bluelabs-no-header-pandas-utc.json
index 7bb05c446..6f8cceb2a 100644
--- a/tests/unit/resources/hint_sniffing/delimited-bluelabs-no-header-pandas-utc.json
+++ b/tests/unit/resources/hint_sniffing/delimited-bluelabs-no-header-pandas-utc.json
@@ -5,9 +5,7 @@
         "quoting": null
     },
     "initial_hints": {
-        "header-row": false,
-        "escape": "\\",
-        "quoting": null
+        "escape": "\\"
     },
-    "notes": "Nothing here is correctly guessed by our current system. Boo."
+    "notes": "Escaping is not currently sniffed."
 }
diff --git a/tests/unit/resources/hint_sniffing/delimited-csv-with-header-dos-newlines.csv b/tests/unit/resources/hint_sniffing/delimited-csv-with-header-dos-newlines.csv
new file mode 100644
index 000000000..3cdda30dc
--- /dev/null
+++ b/tests/unit/resources/hint_sniffing/delimited-csv-with-header-dos-newlines.csv
@@ -0,0 +1,3 @@
+num,numstr,str,comma,doublequote,quotecommaquote,newlinestr,date,time,timestamp,timestamptz,
+123,123,foo,",","""",""",""","* SQL unload would generate multiple files (one for each slice/part)
+* Filecat would produce a single data file",1/1/00,12:00 AM,1/2/00 12:34,1/2/00 12:34
diff --git a/tests/unit/resources/hint_sniffing/delimited-csv-with-header-dos-newlines.json b/tests/unit/resources/hint_sniffing/delimited-csv-with-header-dos-newlines.json
new file mode 100644
index 000000000..843438659
--- /dev/null
+++ b/tests/unit/resources/hint_sniffing/delimited-csv-with-header-dos-newlines.json
@@ -0,0 +1,12 @@
+{
+    "required": {
+        "header-row": true,
+        "escape": null,
+        "quoting": "minimal",
+        "record-terminator": "\r\n"
+    },
+    "initial_hints": {
+        "escape": null
+    },
+    "notes": "Escaping is not currently sniffed."
+}
diff --git a/tests/unit/resources/hint_sniffing/delimited-csv-with-header-mac-newlines.csv b/tests/unit/resources/hint_sniffing/delimited-csv-with-header-mac-newlines.csv
new file mode 100644
index 000000000..f07aad6a1
--- /dev/null
+++ b/tests/unit/resources/hint_sniffing/delimited-csv-with-header-mac-newlines.csv
@@ -0,0 +1 @@
+num,numstr,str,comma,doublequote,quotecommaquote,newlinestr,date,time,timestamp,timestamptz, 123,123,foo,",","""",""",""","* SQL unload would generate multiple files (one for each slice/part) * Filecat would produce a single data file",1/1/00,12:00 AM,1/2/00 12:34,1/2/00 12:34
\ No newline at end of file
diff --git a/tests/unit/resources/hint_sniffing/delimited-csv-with-header-mac-newlines.json b/tests/unit/resources/hint_sniffing/delimited-csv-with-header-mac-newlines.json
new file mode 100644
index 000000000..1225ae485
--- /dev/null
+++ b/tests/unit/resources/hint_sniffing/delimited-csv-with-header-mac-newlines.json
@@ -0,0 +1,12 @@
+{
+    "required": {
+        "header-row": true,
+        "escape": null,
+        "quoting": "minimal",
+        "record-terminator": "\r"
+    },
+    "initial_hints": {
+        "escape": null
+    },
+    "notes": "Escaping is not currently sniffed."
+}
diff --git a/tests/unit/resources/hint_sniffing/delimited-csv-with-header.json b/tests/unit/resources/hint_sniffing/delimited-csv-with-header.json
index a2dcc1c10..6cc8299ca 100644
--- a/tests/unit/resources/hint_sniffing/delimited-csv-with-header.json
+++ b/tests/unit/resources/hint_sniffing/delimited-csv-with-header.json
@@ -2,10 +2,11 @@
     "required": {
         "header-row": true,
         "escape": null,
-        "quoting": "minimal"
+        "quoting": "minimal",
+        "record-terminator": "\n"
    },
     "initial_hints": {
-        "quoting": "minimal"
+        "escape": null
     },
-    "notes": "Doesn't look like current code tries to sniff quoting"
+    "notes": "Escaping is not currently sniffed."
 }
diff --git a/tests/unit/resources/hint_sniffing/delimited-one-column.csv b/tests/unit/resources/hint_sniffing/delimited-one-column.csv
new file mode 100644
index 000000000..38c94da49
--- /dev/null
+++ b/tests/unit/resources/hint_sniffing/delimited-one-column.csv
@@ -0,0 +1,2 @@
+a
+1
diff --git a/tests/unit/resources/hint_sniffing/delimited-one-column.json b/tests/unit/resources/hint_sniffing/delimited-one-column.json
new file mode 100644
index 000000000..7d62513ad
--- /dev/null
+++ b/tests/unit/resources/hint_sniffing/delimited-one-column.json
@@ -0,0 +1,10 @@
+{
+    "required": {
+        "record-terminator": "\n",
+        "header-row": true
+    },
+    "initial_hints": {
+        "header-row": true
+    },
+    "notes": "Python's sniffer doesn't do well on single-column files, so we don't get header-row info from it even though the field delimiter is largely irrelevant for them"
+}
diff --git a/tests/unit/resources/hint_sniffing/delimited-tsv-with-header.csv b/tests/unit/resources/hint_sniffing/delimited-tsv-with-header.csv
new file mode 100644
index 000000000..14fdca1e2
--- /dev/null
+++ b/tests/unit/resources/hint_sniffing/delimited-tsv-with-header.csv
@@ -0,0 +1,3 @@
+num	numstr	str	comma	doublequote	quotecommaquote	newlinestr	date	time	timestamp	timestamptz
+123	123	foo	,	""""	""","""	"* SQL unload would generate multiple files (one for each slice/part)
+* Filecat would produce a single data file"	1/1/00	12:00 AM	1/2/00 12:34	1/2/00 12:34
diff --git a/tests/unit/resources/hint_sniffing/delimited-tsv-with-header.json b/tests/unit/resources/hint_sniffing/delimited-tsv-with-header.json
new file mode 100644
index 000000000..bde1c18b8
--- /dev/null
+++ b/tests/unit/resources/hint_sniffing/delimited-tsv-with-header.json
@@ -0,0 +1,12 @@
+{
+    "required": {
+        "header-row": true,
+        "escape": null,
+        "quoting": "minimal",
+        "field-delimiter": "\t"
+    },
+    "initial_hints": {
+        "escape": null
+    },
+    "notes": "Escaping is not currently sniffed."
+}
diff --git a/tests/unit/resources/hint_sniffing/delimited-vertica-no-header.json b/tests/unit/resources/hint_sniffing/delimited-vertica-no-header.json
index 191fac9b0..96e5403e2 100644
--- a/tests/unit/resources/hint_sniffing/delimited-vertica-no-header.json
+++ b/tests/unit/resources/hint_sniffing/delimited-vertica-no-header.json
@@ -5,15 +5,14 @@
         "quoting": null,
         "doublequote": false,
         "escape": null,
-        "compression": null,
         "header-row": false
     },
     "initial_hints": {
+        "escape": null,
         "field-delimiter": "\u0001",
         "record-terminator": "\u0002",
-        "header-row": false,
-        "quoting": null,
-        "doublequote": false
+        "doublequote": false,
+        "header-row": false
     },
-    "notes": "This one is a disaster in terms of what is able to be determined"
+    "notes": "Escaping is not currently sniffed. We only sniff common newline types. The Python csv.Sniffer module doesn't work with non-standard newline types."
 }
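Each fixture pairs a sample .csv with a .json config of the same basename: "initial_hints" is handed to sniff_hints() as the caller-provided bootstrap, and "required" lists the hints the sniffer must emit for the test to pass; "notes" documents known sniffing gaps. A sketch of how a test (or a curious reviewer) consumes one, assuming it is run from the repository root:

    import json

    # Any fixture pair under tests/unit/resources/hint_sniffing/ follows
    # the shape {"required": {...}, "initial_hints": {...}, "notes": "..."}
    with open('tests/unit/resources/hint_sniffing/delimited-one-column.json') as f:
        config = json.load(f)
    print(config['required'])       # hints sniff_hints() must produce
    print(config['initial_hints'])  # hints handed to sniff_hints() up front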