-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
The Pandas hint inference is actually not doing that much for us. The following things aren't inferred at all by it: 'escape', 'doublequote', 'quotechar', 'encoding', 'header-row'. This is confusing, as our current code nominally pulls that info from a pandas object - but in practice, those things are either pulled from what we originally passed in to read_csv() or set to default values - https://github.com/pandas-dev/pandas/blob/e9b019b653d37146f9095bb0522525b3a8d9e386/pandas/io/parsers.py#L2253 It turns out that Python itself ships with a csv.Sniffer class that can do a lot of this. There are some limitations - it only works with some common record terminators and it only operates on decompressed files, but those are things we can work around. The net effect is a lot better hint sniffing overall, which shows in the component test results here (sample CSVs need fewer initial hints to be able to understand the files).
- Loading branch information
1 parent
6ec62d5
commit ff786a8
Showing
20 changed files
with
493 additions
and
162 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
93.9600 | ||
93.9900 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
91.8200 | ||
92.1000 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
from contextlib import contextmanager | ||
from typing import IO, Iterator | ||
import logging | ||
|
||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
@contextmanager
def rewound_fileobj(fileobj: IO[bytes]) -> Iterator[IO[bytes]]:
    """Yield ``fileobj`` positioned at offset 0, then restore its position.

    The stream's current offset is recorded before seeking to the start,
    and is restored when the ``with`` body finishes, whether it exits
    normally or by raising.

    Raises:
        OSError: if ``fileobj`` reports itself as closed, or if it is
            not seekable.
    """
    # Objects lacking a ``closed`` attribute (or exposing it as None) are
    # treated as open and go straight on to the seekable check.
    if getattr(fileobj, 'closed', None) is not None and fileobj.closed:
        logger.warning("Stream already closed")
        raise OSError('Stream is already closed')

    if not fileobj.seekable():
        # OSError mirrors what .seek() itself raises on a stream that
        # cannot be rewound.
        raise OSError('Stream is not rewindable')

    saved_offset = fileobj.tell()
    fileobj.seek(0)
    try:
        yield fileobj
    finally:
        # Put the caller's position back no matter how the body exited.
        fileobj.seek(saved_offset)
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
from records_mover.records.delimited.sniff import ( | ||
rewound_fileobj, infer_newline_format, sniff_encoding_hint | ||
) | ||
from mock import Mock, patch | ||
import unittest | ||
|
||
|
||
class TestSniff(unittest.TestCase):
    """Unit tests for edge cases of the delimited-file sniffing helpers."""

    def test_rewound_fileobj_already_closed(self):
        """A stream reporting closed=True makes rewound_fileobj raise OSError."""
        mock_fileobj = Mock(name='fileobj')
        mock_fileobj.closed = True
        with self.assertRaises(OSError):
            with rewound_fileobj(mock_fileobj):
                pass

    @patch('records_mover.records.delimited.sniff.io')
    def test_infer_newline_format_cant_infer(self,
                                             mock_io):
        """When the text wrapper reports no newlines, the result is None."""
        mock_fileobj = Mock(name='fileobj')
        mock_fileobj.closed = False
        mock_encoding_hint = 'UTF8'
        mock_compression = None
        mock_text_fileobj = mock_io.TextIOWrapper.return_value
        mock_text_fileobj.newlines = None
        out = infer_newline_format(mock_fileobj,
                                   mock_encoding_hint,
                                   mock_compression)
        # assert_called must actually be invoked; a bare attribute
        # access on a Mock verifies nothing.
        mock_text_fileobj.readline.assert_called()
        self.assertIsNone(out)

    @patch('records_mover.records.delimited.sniff.chardet')
    def test_sniff_encoding_hint_no_result(self,
                                           mock_chardet):
        """With chardet patched out, sniff_encoding_hint returns None."""
        mock_fileobj = Mock(name='fileobj')
        mock_fileobj.closed = False
        # NOTE(review): this sets a `result` attribute on the patched
        # module itself; confirm that is where sniff_encoding_hint reads
        # its detection result from.
        mock_chardet.result = {}
        out = sniff_encoding_hint(mock_fileobj)
        self.assertIsNone(out)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
3 changes: 3 additions & 0 deletions
3
tests/unit/resources/hint_sniffing/delimited-csv-with-header-dos-newlines.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
num,numstr,str,comma,doublequote,quotecommaquote,newlinestr,date,time,timestamp,timestamptz, | ||
123,123,foo,",","""",""",""","* SQL unload would generate multiple files (one for each slice/part) | ||
* Filecat would produce a single data file",1/1/00,12:00 AM,1/2/00 12:34,1/2/00 12:34 |
12 changes: 12 additions & 0 deletions
12
tests/unit/resources/hint_sniffing/delimited-csv-with-header-dos-newlines.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
{ | ||
"required": { | ||
"header-row": true, | ||
"escape": null, | ||
"quoting": "minimal", | ||
"record-terminator": "\r\n" | ||
}, | ||
"initial_hints": { | ||
"escape": null | ||
}, | ||
"notes": "Escaping is not currently sniffed." | ||
} |
1 change: 1 addition & 0 deletions
1
tests/unit/resources/hint_sniffing/delimited-csv-with-header-mac-newlines.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
12 changes: 12 additions & 0 deletions
12
tests/unit/resources/hint_sniffing/delimited-csv-with-header-mac-newlines.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
{ | ||
"required": { | ||
"header-row": true, | ||
"escape": null, | ||
"quoting": "minimal", | ||
"record-terminator": "\r" | ||
}, | ||
"initial_hints": { | ||
"escape": null | ||
}, | ||
"notes": "Escaping is not currently sniffed." | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
a | ||
1 |
10 changes: 10 additions & 0 deletions
10
tests/unit/resources/hint_sniffing/delimited-one-column.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
{ | ||
"required": { | ||
"record-terminator": "\n", | ||
"header-row": true | ||
}, | ||
"initial_hints": { | ||
"header-row": true | ||
}, | ||
"notes": "Python's sniffer doesn't do well on single-column files, meaning we don't get header-row info even if field delimiters are somewhat irrelevant" | ||
} |
3 changes: 3 additions & 0 deletions
3
tests/unit/resources/hint_sniffing/delimited-tsv-with-header.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
num numstr str comma doublequote quotecommaquote newlinestr date time timestamp timestamptz | ||
123 123 foo , """" """,""" "* SQL unload would generate multiple files (one for each slice/part) | ||
* Filecat would produce a single data file" 1/1/00 12:00 AM 1/2/00 12:34 1/2/00 12:34 |
12 changes: 12 additions & 0 deletions
12
tests/unit/resources/hint_sniffing/delimited-tsv-with-header.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
{ | ||
"required": { | ||
"header-row": true, | ||
"escape": null, | ||
"quoting": "minimal", | ||
"field-delimiter": "\t" | ||
}, | ||
"initial_hints": { | ||
"escape": null | ||
}, | ||
"notes": "Escaping is not currently sniffed." | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters