Better hint sniffing #57

Merged
merged 38 commits into master from better_hint_sniffing on May 16, 2020
Changes from 10 commits

Commits (38)
27693cb
Add TSV test
vinceatbluelabs May 7, 2020
65e9a00
Test newline type sniffing
vinceatbluelabs May 7, 2020
a44ead8
Start refactoring towards using Python csv dialect sniffing
vinceatbluelabs May 7, 2020
3eb6dd8
Fix types
vinceatbluelabs May 7, 2020
c59aca9
Convince tsv files to infer
vinceatbluelabs May 7, 2020
79ec603
Mark more that's now inferred
vinceatbluelabs May 7, 2020
e5cee99
Fix types
vinceatbluelabs May 7, 2020
2a2808a
Fix TSV format
vinceatbluelabs May 7, 2020
40e490e
Fix TSV format
vinceatbluelabs May 7, 2020
eeb3351
Better automatic quoting detection
vinceatbluelabs May 7, 2020
bbc7755
Make sure delimiter gets passed into streaming hints
vinceatbluelabs May 7, 2020
6659c10
Make sure delimiter gets passed into streaming hints
vinceatbluelabs May 7, 2020
cf9bbfb
Fix issue with unrewound streams
vinceatbluelabs May 7, 2020
be11d37
Fix quoting expectations
vinceatbluelabs May 7, 2020
3f89d05
Clean up unused code, ratchet expectations
vinceatbluelabs May 8, 2020
0d816af
Expect that escape is not sniffed
vinceatbluelabs May 12, 2020
2044ae4
Fix test
vinceatbluelabs May 12, 2020
f450c4a
Fix indentation issue
vinceatbluelabs May 12, 2020
712de5c
Update notes in hint_sniffing JSON files
vinceatbluelabs May 12, 2020
19f0de3
Be able to sniff headers
vinceatbluelabs May 12, 2020
8aca240
Clear out quite a bit of pandas inference which is not used
vinceatbluelabs May 12, 2020
7e08111
Fix expectations
vinceatbluelabs May 12, 2020
c86775b
Merge remote-tracking branch 'origin/master' into better_hint_sniffing
vinceatbluelabs May 12, 2020
ee1410f
Start testing compressed files
vinceatbluelabs May 13, 2020
be932f7
Add sniffing-within-BZIP file support
vinceatbluelabs May 13, 2020
6b0df23
Add sniffing-within-BZIP file support
vinceatbluelabs May 13, 2020
ccd964b
Sniff gzipped files
vinceatbluelabs May 13, 2020
14ff5c9
Add test_sniff_hints_bzipped_sniffed
vinceatbluelabs May 13, 2020
4ce6cd7
Fix tests
vinceatbluelabs May 13, 2020
f793959
Document, clean up code
vinceatbluelabs May 13, 2020
30e7f49
More code cleanup
vinceatbluelabs May 13, 2020
faec7ff
Fix coverage
vinceatbluelabs May 13, 2020
7bfde9b
Document future LZO support
vinceatbluelabs May 13, 2020
7d57423
Add single-column CSV test
vinceatbluelabs May 13, 2020
aa3783f
Fix coverage
vinceatbluelabs May 13, 2020
377b0f8
Fix bigfiles issue
vinceatbluelabs May 13, 2020
d18ec02
Protect import
vinceatbluelabs May 13, 2020
34ba8c5
Merge remote-tracking branch 'origin/master' into better_hint_sniffing
vinceatbluelabs May 16, 2020
207 changes: 149 additions & 58 deletions records_mover/records/hints.py
@@ -1,10 +1,13 @@
import chardet
from contextlib import contextmanager
from . import RecordsHints, BootstrappingRecordsHints
from .csv_streamer import stream_csv, python_encoding_from_hint
import io
import logging
from .types import HintEncoding
from typing import Iterable, List, IO, Optional, Dict, TYPE_CHECKING
import csv
from .types import HintEncoding, HintRecordTerminator, HintQuoting
import pandas
from typing import Iterable, List, IO, Optional, Dict, Iterator, TYPE_CHECKING
if TYPE_CHECKING:
from pandas.io.parsers import TextFileReader

@@ -79,6 +82,11 @@ def cant_handle_hint(fail_if_cant_handle_hint: bool, hint_name: str, hints: Reco


def csv_hints_from_reader(reader: 'TextFileReader') -> RecordsHints:
# https://github.com/pandas-dev/pandas/blob/master/pandas/io/parsers.py#L783
# C parser:
# https://github.com/pandas-dev/pandas/blob/e9b019b653d37146f9095bb0522525b3a8d9e386/pandas/io/parsers.py#L1903
# Python parser:
# https://github.com/pandas-dev/pandas/blob/e9b019b653d37146f9095bb0522525b3a8d9e386/pandas/io/parsers.py#L2253
header = reader._engine.header
quotechar = reader._engine.data.dialect.quotechar
delimiter = reader._engine.data.dialect.delimiter
@@ -119,67 +127,142 @@ def sniff_hints_from_fileobjs(fileobjs: List[IO[bytes]],


def infer_newline_format(fileobj: IO[bytes],
inferred_hints: RecordsHints,
encoding_hint: str) -> None:
closed = False
if getattr(fileobj, 'closed', None) is not None:
closed = fileobj.closed
if closed or not fileobj.seekable():
encoding_hint: HintEncoding) ->\
Optional[HintRecordTerminator]:
try:
with rewound_fileobj(fileobj) as fileobj:
python_encoding = python_encoding_from_hint[encoding_hint]
text_fileobj = io.TextIOWrapper(fileobj, encoding=python_encoding)
try:
if text_fileobj.newlines is None: # ...and it almost certainly will be...
text_fileobj.readline() # read enough to know newline format
# https://www.python.org/dev/peps/pep-0278/
if text_fileobj.newlines is not None:
logger.info(f"Inferred record terminator as {repr(text_fileobj.newlines)}")
return str(text_fileobj.newlines)
else:
logger.warning("Python could not determine newline format of file.")
return None
finally:
text_fileobj.detach()
except OSError:
logger.warning("Assuming UNIX newline format, as stream is not rewindable")
return
python_encoding = python_encoding_from_hint[encoding_hint]
original_position = fileobj.tell()
fileobj.seek(0)
text_fileobj = io.TextIOWrapper(fileobj, encoding=python_encoding)
if text_fileobj.newlines is None: # ...and it almost certainly will be...
text_fileobj.readline() # read enough to know newline format
# https://www.python.org/dev/peps/pep-0278/
if text_fileobj.newlines is not None:
inferred_hints['record-terminator'] = str(text_fileobj.newlines)
logger.info(f"Inferred record terminator as {repr(text_fileobj.newlines)}")
else:
logger.warning("Python could not determine newline format of file.")
text_fileobj.detach()
fileobj.seek(original_position)
return None


def other_inferred_csv_hints(fileobj: IO[bytes],
encoding_hint: str) -> RecordsHints:
inferred_hints: RecordsHints = {}
infer_newline_format(fileobj, inferred_hints, encoding_hint)

return inferred_hints


def sniff_encoding_hint(fileobj: IO[bytes]) -> Optional[HintEncoding]:
@contextmanager
def rewound_fileobj(fileobj: IO[bytes]) -> Iterator[IO[bytes]]:
if getattr(fileobj, 'closed', None) is not None:
closed = fileobj.closed
if closed or not fileobj.seekable():
logger.warning("Could not use chardet to detect encoding, as stream is not rewindable")
return None
if closed:
logger.warning("Stream already closed")
raise OSError('Stream is already closed')
if not fileobj.seekable():
logger.warning("Stream not rewindable")
raise OSError('Stream is not rewindable')
original_position = fileobj.tell()
fileobj.seek(0)
detector = chardet.UniversalDetector()
while True:
chunksize = 512
chunk = fileobj.read(chunksize)
detector.feed(chunk)
if detector.done or len(chunk) < chunksize:
break
detector.close()
fileobj.seek(original_position)
assert detector.result is not None
if 'encoding' in detector.result:
chardet_encoding = detector.result['encoding']
if chardet_encoding in hint_encoding_from_chardet:
return hint_encoding_from_chardet[chardet_encoding]
else:
logger.warning(f"Got unrecognized encoding from chardet sniffing: {detector.result}")
return None
else:
logger.warning(f"Unable to sniff file encoding using chardet: {detector.result}")
try:
yield fileobj
finally:
fileobj.seek(original_position)


def sniff_encoding_hint(fileobj: IO[bytes]) -> Optional[HintEncoding]:
try:
with rewound_fileobj(fileobj) as fileobj:
detector = chardet.UniversalDetector()
while True:
chunksize = 512
chunk = fileobj.read(chunksize)
detector.feed(chunk)
if detector.done or len(chunk) < chunksize:
break
detector.close()
assert detector.result is not None
if 'encoding' in detector.result:
chardet_encoding = detector.result['encoding']
if chardet_encoding in hint_encoding_from_chardet:
return hint_encoding_from_chardet[chardet_encoding]
else:
logger.warning("Got unrecognized encoding from chardet "
f"sniffing: {detector.result}")
return None
else:
logger.warning(f"Unable to sniff file encoding using chardet: {detector.result}")
return None
except OSError:
logger.warning("Could not use chardet to detect encoding, as stream is not rewindable")
return None


def csv_hints_from_python(fileobj: IO[bytes],
record_terminator_hint: Optional[HintRecordTerminator],
encoding_hint: HintEncoding) -> RecordsHints:
# https://docs.python.org/3/library/csv.html#csv.Sniffer
try:
with rewound_fileobj(fileobj) as fileobj:
# Sniffer tries to determine quotechar, doublequote,
# delimiter, skipinitialspace. does not try to determine
# lineterminator.
# https://github.com/python/cpython/blob/master/Lib/csv.py#L165
python_encoding = python_encoding_from_hint[encoding_hint]
try:
text_fileobj = io.TextIOWrapper(fileobj,
encoding=python_encoding,
newline=record_terminator_hint)
# TODO: How to get 1024? processing instructions?
dialect = csv.Sniffer().sniff(text_fileobj.read(1024))
out: RecordsHints = {
'doublequote': dialect.doublequote,
'field-delimiter': dialect.delimiter,
'quotechar': dialect.quotechar
}
logger.info(f"Python csv.Dialect sniffed: {out}")
return out
finally:
text_fileobj.detach()
except OSError:
logger.warning("Could not use Python's csv library to detect hints, "
"as stream is not rewindable")
return {}


def csv_hints_from_pandas(fileobj: IO[bytes],
streaming_hints: BootstrappingRecordsHints) -> RecordsHints:
def attempt_parse(quoting: HintQuoting) -> RecordsHints:
current_hints = streaming_hints.copy()
current_hints['quoting'] = quoting
with stream_csv(fileobj, current_hints) as reader:
return {
**csv_hints_from_reader(reader),
'quoting': quoting
}

if 'quoting' in streaming_hints:
return attempt_parse(streaming_hints['quoting'])
else:
try:
return attempt_parse(quoting=None)
except pandas.errors.ParserError:
try:
return attempt_parse(quoting='all')
except pandas.errors.ParserError:
try:
return attempt_parse(quoting='nonnumeric')
except pandas.errors.ParserError:
try:
return attempt_parse(quoting='minimal')
except pandas.errors.ParserError:
return attempt_parse(quoting=None)

def sniff_hints(fileobj: IO[bytes],
initial_hints: BootstrappingRecordsHints) -> RecordsHints:
if 'encoding' not in initial_hints:
@@ -190,14 +273,22 @@ def sniff_hints(fileobj: IO[bytes],
streaming_hints = initial_hints.copy()
if encoding_hint is not None:
streaming_hints['encoding'] = encoding_hint
with stream_csv(fileobj, streaming_hints) as reader:
# overwrite hints from reader with user-specified values, as
# the reader isn't smart enough to remember things like which
# quoting setting it was told to use...
pandas_inferred_hints = csv_hints_from_reader(reader)
final_encoding_hint: str = (encoding_hint or # type: ignore
pandas_inferred_hints['encoding'])
return {**pandas_inferred_hints,
'encoding': final_encoding_hint,
**other_inferred_csv_hints(fileobj, final_encoding_hint),
**initial_hints} # type: ignore
pandas_inferred_hints = csv_hints_from_pandas(fileobj, streaming_hints)
final_encoding_hint: HintEncoding = (encoding_hint or # type: ignore
pandas_inferred_hints['encoding'])
other_inferred_csv_hints = {}
record_terminator_hint = infer_newline_format(fileobj, final_encoding_hint)
if record_terminator_hint is not None:
other_inferred_csv_hints['record-terminator'] = record_terminator_hint
python_inferred_hints = csv_hints_from_python(fileobj,
record_terminator_hint,
final_encoding_hint)
out = {
**pandas_inferred_hints, # type: ignore
**python_inferred_hints, # type: ignore
'encoding': final_encoding_hint,
**other_inferred_csv_hints, # type: ignore
**initial_hints # type: ignore
}
logger.info(f"Inferred hints from combined sources: {out}")
return out # type: ignore
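
A minimal usage sketch of the sniff_hints entry point this file now exposes (not part of the PR's diff): the file path and the seeded hint below are illustrative assumptions, not values taken from this change.

# Minimal sketch, assuming the sniff_hints signature shown in the diff above.
# The path and the seeded 'quoting' hint are hypothetical.
from records_mover.records.hints import sniff_hints

with open('path/to/some_delimited_file.csv', 'rb') as fileobj:
    # Callers pass whatever they already know as initial hints; the sniffer
    # fills in encoding, record-terminator, field-delimiter, quoting,
    # header-row, etc. from the stream, and user-specified hints always win
    # over inferred ones.
    hints = sniff_hints(fileobj, initial_hints={'quoting': 'minimal'})
    print(hints)
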
@@ -0,0 +1,3 @@
num,numstr,str,comma,doublequote,quotecommaquote,newlinestr,date,time,timestamp,timestamptz,
123,123,foo,",","""",""",""","* SQL unload would generate multiple files (one for each slice/part)
* Filecat would produce a single data file",1/1/00,12:00 AM,1/2/00 12:34,1/2/00 12:34
@@ -0,0 +1,12 @@
{
"required": {
"header-row": true,
"escape": null,
"quoting": "minimal",
"record-terminator": "\r\n"
},
"initial_hints": {
"quoting": "minimal"
},
"notes": "Doesn't look like current code tries to sniff quoting"
}
@@ -0,0 +1 @@
num,numstr,str,comma,doublequote,quotecommaquote,newlinestr,date,time,timestamp,timestamptz,123,123,foo,",","""",""",""","* SQL unload would generate multiple files (one for each slice/part)* Filecat would produce a single data file",1/1/00,12:00 AM,1/2/00 12:34,1/2/00 12:34
@@ -0,0 +1,12 @@
{
"required": {
"header-row": true,
"escape": null,
"quoting": "minimal",
"record-terminator": "\r"
},
"initial_hints": {
"quoting": "minimal"
},
"notes": "Doesn't look like current code tries to sniff quoting"
}
@@ -2,7 +2,8 @@
"required": {
"header-row": true,
"escape": null,
"quoting": "minimal"
"quoting": "minimal",
"record-terminator": "\n"
},
"initial_hints": {
"quoting": "minimal"
@@ -0,0 +1,3 @@
num numstr str comma doublequote quotecommaquote newlinestr date time timestamp timestamptz
123 123 foo , """" """,""" "* SQL unload would generate multiple files (one for each slice/part)
* Filecat would produce a single data file" 1/1/00 12:00 AM 1/2/00 12:34 1/2/00 12:34
13 changes: 13 additions & 0 deletions tests/unit/resources/hint_sniffing/delimited-tsv-with-header.json
@@ -0,0 +1,13 @@
{
"required": {
"header-row": true,
"escape": null,
"quoting": "minimal",
"field-delimiter": "\t"
},
"initial_hints": {
"quoting": "minimal",
"field-delimiter": "\t"
},
"notes": "TODO Lack of delimiter detection is painful, as tabs are hard to specify on the command line"
}
Expand Up @@ -9,11 +9,8 @@
"header-row": false
},
"initial_hints": {
"field-delimiter": "\u0001",
"record-terminator": "\u0002",
"header-row": false,
"quoting": null,
"doublequote": false
"header-row": false
},
"notes": "This one is a disaster in terms of what is able to be determined"
}
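
Each of these fixture JSONs pairs a sample delimited file with the hints the sniffer is required to recover from it, plus optional initial_hints to seed the sniff. A rough sketch of how such fixtures could be consumed by a test loop follows; the directory layout, file-naming convention, and harness are assumptions for illustration, not code from this PR.

# Hypothetical fixture-driven check; the real test harness may differ.
# Assumes each .json fixture sits next to a sample data file sharing its base name.
import json
from pathlib import Path

from records_mover.records.hints import sniff_hints

FIXTURE_DIR = Path('tests/unit/resources/hint_sniffing')

def check_fixture(json_path: Path) -> None:
    spec = json.loads(json_path.read_text())
    data_path = json_path.with_suffix('.csv')  # assumed naming convention
    with data_path.open('rb') as fileobj:
        actual = sniff_hints(fileobj, initial_hints=spec.get('initial_hints', {}))
    # Every hint listed under "required" must come back exactly as specified.
    for hint_name, expected in spec['required'].items():
        assert actual[hint_name] == expected, (hint_name, expected, actual)

for json_path in sorted(FIXTURE_DIR.glob('*.json')):
    check_fixture(json_path)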