Skip to content

Commit

Permalink
ENH: Support malformed row handling in Python engine (#15925)
Browse files Browse the repository at this point in the history
  • Loading branch information
gfyoung authored and jreback committed Apr 7, 2017
1 parent c25fbde commit 5d17a94
Show file tree
Hide file tree
Showing 5 changed files with 155 additions and 71 deletions.
4 changes: 2 additions & 2 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -342,11 +342,11 @@ error_bad_lines : boolean, default ``True``
Lines with too many fields (e.g. a csv line with too many commas) will by
default cause an exception to be raised, and no DataFrame will be returned. If
``False``, then these "bad lines" will be dropped from the DataFrame that is
returned (only valid with C parser). See :ref:`bad lines <io.bad_lines>`
returned. See :ref:`bad lines <io.bad_lines>`
below.
warn_bad_lines : boolean, default ``True``
If error_bad_lines is ``False``, and warn_bad_lines is ``True``, a warning for
each "bad line" will be output (only valid with C parser).
each "bad line" will be output.

.. _io.dtypes:

Expand Down
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -368,9 +368,10 @@ Other Enhancements
- ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`)
- ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`)
- ``pandas.io.json.json_normalize()`` has gained a ``sep`` option that accepts ``str`` to separate joined fields; the default is ".", which is backward compatible. (:issue:`14883`)
- ``pd.read_csv()`` will now raise a ``csv.Error`` error whenever an end-of-file character is encountered in the middle of a data row (:issue:`15913`)
- A new function has been added to a ``MultiIndex`` to facilitate :ref:`Removing Unused Levels <advanced.shown_levels>`. (:issue:`15694`)
- :func:`MultiIndex.remove_unused_levels` has been added to facilitate :ref:`removing unused levels <advanced.shown_levels>`. (:issue:`15694`)
- ``pd.read_csv()`` will now raise a ``ParserError`` whenever any parsing error occurs (:issue:`15913`, :issue:`15925`)
- ``pd.read_csv()`` now supports the ``error_bad_lines`` and ``warn_bad_lines`` arguments for the Python parser (:issue:`15925`)


.. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
Expand Down
168 changes: 106 additions & 62 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,10 +263,10 @@
Lines with too many fields (e.g. a csv line with too many commas) will by
default cause an exception to be raised, and no DataFrame will be returned.
If False, then these "bad lines" will be dropped from the DataFrame that is
returned. (Only valid with C parser)
returned.
warn_bad_lines : boolean, default True
If error_bad_lines is False, and warn_bad_lines is True, a warning for each
"bad line" will be output. (Only valid with C parser).
"bad line" will be output.
low_memory : boolean, default True
Internally process the file in chunks, resulting in lower memory use
while parsing, but possibly mixed type inference. To ensure no mixed
Expand Down Expand Up @@ -485,8 +485,6 @@ def _read(filepath_or_buffer, kwds):
_python_unsupported = set([
'low_memory',
'buffer_lines',
'error_bad_lines',
'warn_bad_lines',
'float_precision',
])
_deprecated_args = set([
Expand Down Expand Up @@ -1897,6 +1895,9 @@ def __init__(self, f, **kwds):
self.usecols, _ = _validate_usecols_arg(kwds['usecols'])
self.skip_blank_lines = kwds['skip_blank_lines']

self.warn_bad_lines = kwds['warn_bad_lines']
self.error_bad_lines = kwds['error_bad_lines']

self.names_passed = kwds['names'] or None

self.na_filter = kwds['na_filter']
Expand Down Expand Up @@ -2469,16 +2470,19 @@ def _next_line(self):
next(self.data)

while True:
orig_line = self._next_iter_line()
line = self._check_comments([orig_line])[0]
orig_line = self._next_iter_line(row_num=self.pos + 1)
self.pos += 1
if (not self.skip_blank_lines and
(self._empty(orig_line) or line)):
break
elif self.skip_blank_lines:
ret = self._check_empty([line])
if ret:
line = ret[0]

if orig_line is not None:
line = self._check_comments([orig_line])[0]

if self.skip_blank_lines:
ret = self._check_empty([line])

if ret:
line = ret[0]
break
elif self._empty(orig_line) or line:
break

# This was the first line of the file,
Expand All @@ -2491,7 +2495,28 @@ def _next_line(self):
self.buf.append(line)
return line

def _next_iter_line(self, **kwargs):
def _alert_malformed(self, msg, row_num):
"""
Alert a user about a malformed row.
If `self.error_bad_lines` is True, the alert will be `ParserError`.
If `self.warn_bad_lines` is True, the alert will be printed out.
Parameters
----------
msg : The error message to display.
row_num : The row number where the parsing error occurred.
Because this row number is displayed, we 1-index,
even though we 0-index internally.
"""

if self.error_bad_lines:
raise ParserError(msg)
elif self.warn_bad_lines:
base = 'Skipping line {row_num}: '.format(row_num=row_num)
sys.stderr.write(base + msg + '\n')

def _next_iter_line(self, row_num):
"""
Wrapper around iterating through `self.data` (CSV source).
Expand All @@ -2501,32 +2526,34 @@ def _next_iter_line(self, **kwargs):
Parameters
----------
kwargs : Keyword arguments used to customize the error message.
row_num : The row number of the line being parsed.
"""

try:
return next(self.data)
except csv.Error as e:
msg = str(e)

if 'NULL byte' in msg:
msg = ('NULL byte detected. This byte '
'cannot be processed in Python\'s '
'native csv library at the moment, '
'so please pass in engine=\'c\' instead')
elif 'newline inside string' in msg:
msg = ('EOF inside string starting with '
'line ' + str(kwargs['row_num']))

if self.skipfooter > 0:
reason = ('Error could possibly be due to '
'parsing errors in the skipped footer rows '
'(the skipfooter keyword is only applied '
'after Python\'s csv library has parsed '
'all rows).')
msg += '. ' + reason

raise csv.Error(msg)
if self.warn_bad_lines or self.error_bad_lines:
msg = str(e)

if 'NULL byte' in msg:
msg = ('NULL byte detected. This byte '
'cannot be processed in Python\'s '
'native csv library at the moment, '
'so please pass in engine=\'c\' instead')
elif 'newline inside string' in msg:
msg = ('EOF inside string starting with '
'line ' + str(row_num))

if self.skipfooter > 0:
reason = ('Error could possibly be due to '
'parsing errors in the skipped footer rows '
'(the skipfooter keyword is only applied '
'after Python\'s csv library has parsed '
'all rows).')
msg += '. ' + reason

self._alert_malformed(msg, row_num)
return None

def _check_comments(self, lines):
if self.comment is None:
Expand Down Expand Up @@ -2657,42 +2684,57 @@ def _get_index_name(self, columns):
return index_name, orig_names, columns

def _rows_to_cols(self, content):
if self.skipfooter < 0:
raise ValueError('skip footer cannot be negative')

col_len = self.num_original_columns

if self._implicit_index:
col_len += len(self.index_col)

# see gh-13320
zipped_content = list(lib.to_object_array(
content, min_width=col_len).T)
zip_len = len(zipped_content)

if self.skipfooter < 0:
raise ValueError('skip footer cannot be negative')
max_len = max([len(row) for row in content])

# Loop through rows to verify lengths are correct.
if (col_len != zip_len and
# Check that there are no rows with too many
# elements in their row (rows with too few
# elements are padded with NaN).
if (max_len > col_len and
self.index_col is not False and
self.usecols is None):
i = 0
for (i, l) in enumerate(content):
if len(l) != col_len:
break

footers = 0
if self.skipfooter:
footers = self.skipfooter
footers = self.skipfooter if self.skipfooter else 0
bad_lines = []

row_num = self.pos - (len(content) - i + footers)
iter_content = enumerate(content)
content_len = len(content)
content = []

msg = ('Expected %d fields in line %d, saw %d' %
(col_len, row_num + 1, zip_len))
if len(self.delimiter) > 1 and self.quoting != csv.QUOTE_NONE:
# see gh-13374
reason = ('Error could possibly be due to quotes being '
'ignored when a multi-char delimiter is used.')
msg += '. ' + reason
raise ValueError(msg)
for (i, l) in iter_content:
actual_len = len(l)

if actual_len > col_len:
if self.error_bad_lines or self.warn_bad_lines:
row_num = self.pos - (content_len - i + footers)
bad_lines.append((row_num, actual_len))

if self.error_bad_lines:
break
else:
content.append(l)

for row_num, actual_len in bad_lines:
msg = ('Expected %d fields in line %d, saw %d' %
(col_len, row_num + 1, actual_len))
if len(self.delimiter) > 1 and self.quoting != csv.QUOTE_NONE:
# see gh-13374
reason = ('Error could possibly be due to quotes being '
'ignored when a multi-char delimiter is used.')
msg += '. ' + reason

self._alert_malformed(msg, row_num + 1)

# see gh-13320
zipped_content = list(lib.to_object_array(
content, min_width=col_len).T)

if self.usecols:
if self._implicit_index:
Expand Down Expand Up @@ -2750,10 +2792,12 @@ def _get_lines(self, rows=None):

while True:
new_row = self._next_iter_line(
row_num=self.pos + rows)
new_rows.append(new_row)
row_num=self.pos + rows + 1)
rows += 1

if new_row is not None:
new_rows.append(new_row)

except StopIteration:
if self.skiprows:
new_rows = [row for i, row in enumerate(new_rows)
Expand Down
42 changes: 40 additions & 2 deletions pandas/tests/io/parser/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from pandas import compat
from pandas.compat import (StringIO, BytesIO, PY3,
range, lrange, u)
from pandas.errors import DtypeWarning, EmptyDataError
from pandas.errors import DtypeWarning, EmptyDataError, ParserError
from pandas.io.common import URLError
from pandas.io.parsers import TextFileReader, TextParser

Expand Down Expand Up @@ -1569,7 +1569,7 @@ def test_null_byte_char(self):
tm.assert_frame_equal(out, expected)
else:
msg = "NULL byte detected"
with tm.assertRaisesRegexp(csv.Error, msg):
with tm.assertRaisesRegexp(ParserError, msg):
self.read_csv(StringIO(data), names=cols)

def test_utf8_bom(self):
Expand Down Expand Up @@ -1695,3 +1695,41 @@ class InvalidBuffer(object):

with tm.assertRaisesRegexp(ValueError, msg):
self.read_csv(mock.Mock())

def test_skip_bad_lines(self):
    # see gh-15925
    #
    # Rows 3 and 5 have too many fields for a single-column frame;
    # by default (and with error_bad_lines=True) parsing must raise,
    # while error_bad_lines=False drops those rows, warning only when
    # warn_bad_lines=True.
    data = 'a\n1\n1,2,3\n4\n5,6,7'

    with tm.assertRaises(ParserError):
        self.read_csv(StringIO(data))

    with tm.assertRaises(ParserError):
        self.read_csv(StringIO(data), error_bad_lines=True)

    expected = DataFrame({'a': [1, 4]})
    saved_stderr = sys.stderr

    # Bad lines dropped silently: no warning should reach stderr.
    sys.stderr = StringIO()
    try:
        result = self.read_csv(StringIO(data),
                               error_bad_lines=False,
                               warn_bad_lines=False)
        tm.assert_frame_equal(result, expected)

        captured = sys.stderr.getvalue()
        self.assertEqual(captured, '')
    finally:
        sys.stderr = saved_stderr

    # Bad lines dropped with warnings: one message per skipped row.
    sys.stderr = StringIO()
    try:
        result = self.read_csv(StringIO(data),
                               error_bad_lines=False,
                               warn_bad_lines=True)
        tm.assert_frame_equal(result, expected)

        captured = sys.stderr.getvalue()
        self.assertTrue('Skipping line 3' in captured)
        self.assertTrue('Skipping line 5' in captured)
    finally:
        sys.stderr = saved_stderr
9 changes: 5 additions & 4 deletions pandas/tests/io/parser/python_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import pandas.util.testing as tm
from pandas import DataFrame, Index
from pandas import compat
from pandas.errors import ParserError
from pandas.compat import StringIO, BytesIO, u


Expand Down Expand Up @@ -213,13 +214,13 @@ def test_multi_char_sep_quotes(self):
data = 'a,,b\n1,,a\n2,,"2,,b"'
msg = 'ignored when a multi-char delimiter is used'

with tm.assertRaisesRegexp(ValueError, msg):
with tm.assertRaisesRegexp(ParserError, msg):
self.read_csv(StringIO(data), sep=',,')

# We expect no match, so there should be an assertion
# error out of the inner context manager.
with tm.assertRaises(AssertionError):
with tm.assertRaisesRegexp(ValueError, msg):
with tm.assertRaisesRegexp(ParserError, msg):
self.read_csv(StringIO(data), sep=',,',
quoting=csv.QUOTE_NONE)

Expand All @@ -231,11 +232,11 @@ def test_skipfooter_bad_row(self):

for data in ('a\n1\n"b"a',
'a,b,c\ncat,foo,bar\ndog,foo,"baz'):
with tm.assertRaisesRegexp(csv.Error, msg):
with tm.assertRaisesRegexp(ParserError, msg):
self.read_csv(StringIO(data), skipfooter=1)

# We expect no match, so there should be an assertion
# error out of the inner context manager.
with tm.assertRaises(AssertionError):
with tm.assertRaisesRegexp(csv.Error, msg):
with tm.assertRaisesRegexp(ParserError, msg):
self.read_csv(StringIO(data))

0 comments on commit 5d17a94

Please sign in to comment.