From 5d17a94506c9234abb6578e232a5bb4a921c851c Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Fri, 7 Apr 2017 15:47:29 -0400
Subject: [PATCH] ENH: Support malformed row handling in Python engine (#15925)

---
 doc/source/io.rst                            |   4 +-
 doc/source/whatsnew/v0.20.0.txt              |   3 +-
 pandas/io/parsers.py                         | 168 ++++++++++++-------
 pandas/tests/io/parser/common.py             |  42 ++++-
 pandas/tests/io/parser/python_parser_only.py |   9 +-
 5 files changed, 155 insertions(+), 71 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 5cec27c329a7f..f4676f3ad964e 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -342,11 +342,11 @@ error_bad_lines : boolean, default ``True``
     Lines with too many fields (e.g. a csv line with too many commas) will by
     default cause an exception to be raised, and no DataFrame will be returned.
     If ``False``, then these "bad lines" will be dropped from the DataFrame that is
-    returned (only valid with C parser). See :ref:`bad lines `
+    returned. See :ref:`bad lines `
     below.
 warn_bad_lines : boolean, default ``True``
     If error_bad_lines is ``False``, and warn_bad_lines is ``True``, a warning for
-    each "bad line" will be output (only valid with C parser).
+    each "bad line" will be output.
 
 .. _io.dtypes:

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 132f20cb73142..d3207ffa86c6a 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -368,9 +368,10 @@ Other Enhancements
 - ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`)
 - ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`)
 - ``pandas.io.json.json_normalize()`` has gained a ``sep`` option that accepts ``str`` to separate joined fields; the default is ".", which is backward compatible. (:issue:`14883`)
-- ``pd.read_csv()`` will now raise a ``csv.Error`` whenever an end-of-file character is encountered in the middle of a data row (:issue:`15913`)
 - A new function has been added to a ``MultiIndex`` to facilitate :ref:`Removing Unused Levels `. (:issue:`15694`)
 - :func:`MultiIndex.remove_unused_levels` has been added to facilitate :ref:`removing unused levels `. (:issue:`15694`)
+- ``pd.read_csv()`` will now raise a ``ParserError`` whenever any parsing error occurs (:issue:`15913`, :issue:`15925`)
+- ``pd.read_csv()`` now supports the ``error_bad_lines`` and ``warn_bad_lines`` arguments for the Python parser (:issue:`15925`)
 
 .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
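The whatsnew entries above describe the user-facing contract. A minimal sketch of how it behaves with this patch applied (the data string is the one reused by the new test further down; `io.StringIO` stands in for a real file):

```python
from io import StringIO
import pandas as pd
from pandas.errors import ParserError

data = 'a\n1\n1,2,3\n4\n5,6,7'  # lines 3 and 5 have too many fields

# Default (error_bad_lines=True): the Python engine now raises ParserError.
try:
    pd.read_csv(StringIO(data), engine='python')
except ParserError as e:
    print(e)  # e.g. "Expected 1 fields in line 3, saw 3"

# Opt out: bad rows are dropped; warn_bad_lines=True reports them on stderr.
df = pd.read_csv(StringIO(data), engine='python',
                 error_bad_lines=False, warn_bad_lines=True)
print(df['a'].tolist())  # [1, 4]
```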
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index a85f9cda50879..10f8c53987471 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -263,10 +263,10 @@
     Lines with too many fields (e.g. a csv line with too many commas) will by
     default cause an exception to be raised, and no DataFrame will be returned.
     If False, then these "bad lines" will be dropped from the DataFrame that is
-    returned. (Only valid with C parser)
+    returned.
 warn_bad_lines : boolean, default True
     If error_bad_lines is False, and warn_bad_lines is True, a warning for each
-    "bad line" will be output. (Only valid with C parser).
+    "bad line" will be output.
 low_memory : boolean, default True
     Internally process the file in chunks, resulting in lower memory use
     while parsing, but possibly mixed type inference.  To ensure no mixed
@@ -485,8 +485,6 @@ def _read(filepath_or_buffer, kwds):
 _python_unsupported = set([
     'low_memory',
     'buffer_lines',
-    'error_bad_lines',
-    'warn_bad_lines',
     'float_precision',
 ])
 _deprecated_args = set([
@@ -1897,6 +1895,9 @@ def __init__(self, f, **kwds):
         self.usecols, _ = _validate_usecols_arg(kwds['usecols'])
         self.skip_blank_lines = kwds['skip_blank_lines']
 
+        self.warn_bad_lines = kwds['warn_bad_lines']
+        self.error_bad_lines = kwds['error_bad_lines']
+
         self.names_passed = kwds['names'] or None
 
         self.na_filter = kwds['na_filter']
@@ -2469,16 +2470,19 @@ def _next_line(self):
                     next(self.data)
 
             while True:
-                orig_line = self._next_iter_line()
-                line = self._check_comments([orig_line])[0]
+                orig_line = self._next_iter_line(row_num=self.pos + 1)
                 self.pos += 1
-                if (not self.skip_blank_lines and
-                        (self._empty(orig_line) or line)):
-                    break
-                elif self.skip_blank_lines:
-                    ret = self._check_empty([line])
-                    if ret:
-                        line = ret[0]
+
+                if orig_line is not None:
+                    line = self._check_comments([orig_line])[0]
+
+                    if self.skip_blank_lines:
+                        ret = self._check_empty([line])
+
+                        if ret:
+                            line = ret[0]
+                            break
+                    elif self._empty(orig_line) or line:
                         break
 
         # This was the first line of the file,
@@ -2491,7 +2495,28 @@ def _next_line(self):
         self.buf.append(line)
         return line
 
-    def _next_iter_line(self, **kwargs):
+    def _alert_malformed(self, msg, row_num):
+        """
+        Alert a user about a malformed row.
+
+        If `self.error_bad_lines` is True, the alert will be a `ParserError`.
+        If `self.warn_bad_lines` is True, the alert will be printed out.
+
+        Parameters
+        ----------
+        msg : The error message to display.
+        row_num : The row number where the parsing error occurred.
+                  Because this row number is displayed, we 1-index,
+                  even though we 0-index internally.
+        """
+
+        if self.error_bad_lines:
+            raise ParserError(msg)
+        elif self.warn_bad_lines:
+            base = 'Skipping line {row_num}: '.format(row_num=row_num)
+            sys.stderr.write(base + msg + '\n')
+
+    def _next_iter_line(self, row_num):
         """
         Wrapper around iterating through `self.data` (CSV source).
 
@@ -2501,32 +2526,34 @@ def _next_iter_line(self, row_num):
 
         Parameters
         ----------
-        kwargs : Keyword arguments used to customize the error message.
+        row_num : The row number of the line being parsed.
         """
 
         try:
             return next(self.data)
         except csv.Error as e:
-            msg = str(e)
-
-            if 'NULL byte' in msg:
-                msg = ('NULL byte detected. This byte '
-                       'cannot be processed in Python\'s '
-                       'native csv library at the moment, '
-                       'so please pass in engine=\'c\' instead')
-            elif 'newline inside string' in msg:
-                msg = ('EOF inside string starting with '
-                       'line ' + str(kwargs['row_num']))
-
-            if self.skipfooter > 0:
-                reason = ('Error could possibly be due to '
-                          'parsing errors in the skipped footer rows '
-                          '(the skipfooter keyword is only applied '
-                          'after Python\'s csv library has parsed '
-                          'all rows).')
-                msg += '. ' + reason
-
-            raise csv.Error(msg)
+            if self.warn_bad_lines or self.error_bad_lines:
+                msg = str(e)
+
+                if 'NULL byte' in msg:
+                    msg = ('NULL byte detected. This byte '
+                           'cannot be processed in Python\'s '
+                           'native csv library at the moment, '
+                           'so please pass in engine=\'c\' instead')
+                elif 'newline inside string' in msg:
+                    msg = ('EOF inside string starting with '
+                           'line ' + str(row_num))
+
+                if self.skipfooter > 0:
+                    reason = ('Error could possibly be due to '
+                              'parsing errors in the skipped footer rows '
+                              '(the skipfooter keyword is only applied '
+                              'after Python\'s csv library has parsed '
+                              'all rows).')
+                    msg += '. ' + reason
+
+                self._alert_malformed(msg, row_num)
+            return None
 
     def _check_comments(self, lines):
         if self.comment is None:
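`_alert_malformed` centralizes the raise-or-warn decision used throughout the hunks above and below. A standalone sketch of the same dispatch (hypothetical free function, for illustration only; the patch raises `pandas.errors.ParserError` and takes the flags from `self`):

```python
import sys

def alert_malformed(msg, row_num, error_bad_lines=True, warn_bad_lines=True):
    """Raise on a bad line, warn on stderr, or stay silent, with the same
    precedence as the method above: erroring wins over warning."""
    if error_bad_lines:
        raise ValueError(msg)  # ParserError in the actual patch
    elif warn_bad_lines:
        sys.stderr.write('Skipping line {0}: {1}\n'.format(row_num, msg))

# Warn-only configuration: the message goes to stderr, parsing continues.
alert_malformed('Expected 1 fields in line 3, saw 3', 3,
                error_bad_lines=False, warn_bad_lines=True)
```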
@@ -2657,42 +2684,57 @@ def _get_index_name(self, columns):
         return index_name, orig_names, columns
 
     def _rows_to_cols(self, content):
+        if self.skipfooter < 0:
+            raise ValueError('skip footer cannot be negative')
+
         col_len = self.num_original_columns
 
         if self._implicit_index:
             col_len += len(self.index_col)
 
-        # see gh-13320
-        zipped_content = list(lib.to_object_array(
-            content, min_width=col_len).T)
-        zip_len = len(zipped_content)
-
-        if self.skipfooter < 0:
-            raise ValueError('skip footer cannot be negative')
+        max_len = max([len(row) for row in content])
 
-        # Loop through rows to verify lengths are correct.
-        if (col_len != zip_len and
+        # Check that there are no rows with too many
+        # elements in their row (rows with too few
+        # elements are padded with NaN).
+        if (max_len > col_len and
                 self.index_col is not False and
                 self.usecols is None):
-            i = 0
-            for (i, l) in enumerate(content):
-                if len(l) != col_len:
-                    break
-
-            footers = 0
-            if self.skipfooter:
-                footers = self.skipfooter
+            footers = self.skipfooter if self.skipfooter else 0
+            bad_lines = []
 
-            row_num = self.pos - (len(content) - i + footers)
+            iter_content = enumerate(content)
+            content_len = len(content)
+            content = []
 
-            msg = ('Expected %d fields in line %d, saw %d' %
-                   (col_len, row_num + 1, zip_len))
-            if len(self.delimiter) > 1 and self.quoting != csv.QUOTE_NONE:
-                # see gh-13374
-                reason = ('Error could possibly be due to quotes being '
-                          'ignored when a multi-char delimiter is used.')
-                msg += '. ' + reason
-            raise ValueError(msg)
+            for (i, l) in iter_content:
+                actual_len = len(l)
+
+                if actual_len > col_len:
+                    if self.error_bad_lines or self.warn_bad_lines:
+                        row_num = self.pos - (content_len - i + footers)
+                        bad_lines.append((row_num, actual_len))
+
+                        if self.error_bad_lines:
+                            break
+                else:
+                    content.append(l)
+
+            for row_num, actual_len in bad_lines:
+                msg = ('Expected %d fields in line %d, saw %d' %
+                       (col_len, row_num + 1, actual_len))
+                if len(self.delimiter) > 1 and self.quoting != csv.QUOTE_NONE:
+                    # see gh-13374
+                    reason = ('Error could possibly be due to quotes being '
+                              'ignored when a multi-char delimiter is used.')
+                    msg += '. ' + reason
+
+                self._alert_malformed(msg, row_num + 1)
+
+        # see gh-13320
+        zipped_content = list(lib.to_object_array(
+            content, min_width=col_len).T)
 
         if self.usecols:
             if self._implicit_index:
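The rewritten `_rows_to_cols` keeps short rows (padded to the header width with NaN later, by `lib.to_object_array`) and only flags rows that are too wide. A toy model of that filtering policy (hypothetical helper, not pandas code):

```python
def split_rows(rows, col_len, error_bad_lines=False):
    """Partition parsed rows into kept rows and (row_number, width) pairs
    for rows wider than the header, mirroring the loop above."""
    kept, bad = [], []
    for i, row in enumerate(rows):
        if len(row) > col_len:
            bad.append((i + 1, len(row)))  # 1-indexed for display
            if error_bad_lines:
                break  # the first offender is enough when erroring out
        else:
            kept.append(row)  # short rows are padded to col_len later
    return kept, bad

kept, bad = split_rows([['1'], ['1', '2', '3'], ['4'], ['5', '6', '7']], 1)
print(kept)  # [['1'], ['4']]
print(bad)   # [(2, 3), (4, 3)]
```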
@@ -2750,10 +2792,12 @@ def _get_lines(self, rows=None):
 
                     while True:
                         new_row = self._next_iter_line(
-                            row_num=self.pos + rows)
-                        new_rows.append(new_row)
+                            row_num=self.pos + rows + 1)
                         rows += 1
 
+                        if new_row is not None:
+                            new_rows.append(new_row)
+
                 except StopIteration:
                     if self.skiprows:
                         new_rows = [row for i, row in enumerate(new_rows)

diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py
index 36d5f2dd5274b..ee0f00506cef3 100644
--- a/pandas/tests/io/parser/common.py
+++ b/pandas/tests/io/parser/common.py
@@ -19,7 +19,7 @@
 from pandas import compat
 from pandas.compat import (StringIO, BytesIO, PY3,
                            range, lrange, u)
-from pandas.errors import DtypeWarning, EmptyDataError
+from pandas.errors import DtypeWarning, EmptyDataError, ParserError
 from pandas.io.common import URLError
 from pandas.io.parsers import TextFileReader, TextParser
@@ -1569,7 +1569,7 @@ def test_null_byte_char(self):
             tm.assert_frame_equal(out, expected)
         else:
             msg = "NULL byte detected"
-            with tm.assertRaisesRegexp(csv.Error, msg):
+            with tm.assertRaisesRegexp(ParserError, msg):
                 self.read_csv(StringIO(data), names=cols)
 
     def test_utf8_bom(self):
@@ -1695,3 +1695,41 @@ class InvalidBuffer(object):
 
         with tm.assertRaisesRegexp(ValueError, msg):
             self.read_csv(mock.Mock())
+
+    def test_skip_bad_lines(self):
+        # see gh-15925
+        data = 'a\n1\n1,2,3\n4\n5,6,7'
+
+        with tm.assertRaises(ParserError):
+            self.read_csv(StringIO(data))
+
+        with tm.assertRaises(ParserError):
+            self.read_csv(StringIO(data), error_bad_lines=True)
+
+        stderr = sys.stderr
+        expected = DataFrame({'a': [1, 4]})
+
+        sys.stderr = StringIO()
+        try:
+            out = self.read_csv(StringIO(data),
+                                error_bad_lines=False,
+                                warn_bad_lines=False)
+            tm.assert_frame_equal(out, expected)
+
+            val = sys.stderr.getvalue()
+            self.assertEqual(val, '')
+        finally:
+            sys.stderr = stderr
+
+        sys.stderr = StringIO()
+        try:
+            out = self.read_csv(StringIO(data),
+                                error_bad_lines=False,
+                                warn_bad_lines=True)
+            tm.assert_frame_equal(out, expected)
+
+            val = sys.stderr.getvalue()
+            self.assertTrue('Skipping line 3' in val)
+            self.assertTrue('Skipping line 5' in val)
+        finally:
+            sys.stderr = stderr
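The new test redirects `sys.stderr` to assert on the warning text. The same capture works outside the test suite; a sketch assuming this patch, where the 'Skipping line N' prefix comes from `_alert_malformed` above:

```python
import sys
from io import StringIO
import pandas as pd

data = 'a\n1\n1,2,3\n4\n5,6,7'
old_stderr, sys.stderr = sys.stderr, StringIO()
try:
    df = pd.read_csv(StringIO(data), engine='python',
                     error_bad_lines=False, warn_bad_lines=True)
finally:
    captured, sys.stderr = sys.stderr.getvalue(), old_stderr

print('Skipping line 3' in captured, 'Skipping line 5' in captured)  # True True
```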
diff --git a/pandas/tests/io/parser/python_parser_only.py b/pandas/tests/io/parser/python_parser_only.py
index 36356315419c4..9a1eb94270e28 100644
--- a/pandas/tests/io/parser/python_parser_only.py
+++ b/pandas/tests/io/parser/python_parser_only.py
@@ -14,6 +14,7 @@
 import pandas.util.testing as tm
 from pandas import DataFrame, Index
 from pandas import compat
+from pandas.errors import ParserError
 from pandas.compat import StringIO, BytesIO, u
 
@@ -213,13 +214,13 @@ def test_multi_char_sep_quotes(self):
         data = 'a,,b\n1,,a\n2,,"2,,b"'
 
         msg = 'ignored when a multi-char delimiter is used'
-        with tm.assertRaisesRegexp(ValueError, msg):
+        with tm.assertRaisesRegexp(ParserError, msg):
             self.read_csv(StringIO(data), sep=',,')
 
         # We expect no match, so there should be an assertion
         # error out of the inner context manager.
         with tm.assertRaises(AssertionError):
-            with tm.assertRaisesRegexp(ValueError, msg):
+            with tm.assertRaisesRegexp(ParserError, msg):
                 self.read_csv(StringIO(data), sep=',,',
                               quoting=csv.QUOTE_NONE)
@@ -231,11 +232,11 @@ def test_skipfooter_bad_row(self):
 
         for data in ('a\n1\n"b"a',
                      'a,b,c\ncat,foo,bar\ndog,foo,"baz'):
-            with tm.assertRaisesRegexp(csv.Error, msg):
+            with tm.assertRaisesRegexp(ParserError, msg):
                 self.read_csv(StringIO(data), skipfooter=1)
 
             # We expect no match, so there should be an assertion
             # error out of the inner context manager.
             with tm.assertRaises(AssertionError):
-                with tm.assertRaisesRegexp(csv.Error, msg):
+                with tm.assertRaisesRegexp(ParserError, msg):
                     self.read_csv(StringIO(data))
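These test updates reflect the exception unification: failures that previously surfaced as `csv.Error` or `ValueError` from the Python engine now surface as `ParserError`. A short sketch assuming this patch:

```python
from io import StringIO
import pandas as pd
from pandas.errors import ParserError

# An unterminated quote in the last row; with skipfooter the Python engine
# must run Python's csv library over every row first, so the csv-level
# failure is re-raised as ParserError rather than csv.Error.
try:
    pd.read_csv(StringIO('a,b,c\ncat,foo,bar\ndog,foo,"baz'),
                engine='python', skipfooter=1)
except ParserError as e:
    print('ParserError:', e)
```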