Merge pull request #6889 from mcwitt/fix-gh6607

BUG/ENH: Add fallback warnings and correctly handle leading whitespace in C parser
pandas-dev · Apr 23, 2014 · 7168d98 · 7168d98
2 parents 759a907 + f45b714
commit 7168d98
Show file tree

Hide file tree

Showing 8 changed files with 677 additions and 137 deletions.
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -92,7 +92,8 @@ They can take a number of arguments:
   - ``dialect``: string or :class:`python:csv.Dialect` instance to expose more
     ways to specify the file format
   - ``dtype``: A data type name or a dict of column name to data type. If not
-    specified, data types will be inferred.
+    specified, data types will be inferred. (Unsupported with
+    ``engine='python'``)
   - ``header``: row number(s) to use as the column names, and the start of the
     data.  Defaults to 0 if no ``names`` passed, otherwise ``None``. Explicitly
     pass ``header=0`` to be able to replace existing names. The header can be
@@ -154,6 +155,7 @@ They can take a number of arguments:
     pieces. Will cause a ``TextFileReader`` object to be returned. More on this
     below in the section on :ref:`iterating and chunking <io.chunking>`
   - ``skip_footer``: number of lines to skip at bottom of file (default 0)
+    (Unsupported with ``engine='c'``)
   - ``converters``: a dictionary of functions for converting values in certain
     columns, where keys are either integers or column labels
   - ``encoding``: a string representing the encoding to use for decoding
@@ -275,6 +277,11 @@ individual columns:
     df = pd.read_csv(StringIO(data), dtype={'b': object, 'c': np.float64})
     df.dtypes
 
+.. note::
+    The ``dtype`` option is currently only supported by the C engine.
+    Specifying ``dtype`` with ``engine`` other than 'c' raises a
+    ``ValueError``.
+
 .. _io.headers:
 
 Handling column names
@@ -1029,6 +1036,22 @@ Specifying ``iterator=True`` will also return the ``TextFileReader`` object:
    os.remove('tmp.sv')
    os.remove('tmp2.sv')
 
+Specifying the parser engine
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Under the hood pandas uses a fast and efficient parser implemented in C as well
+as a python implementation which is currently more feature-complete. Where
+possible pandas uses the C parser (specified as ``engine='c'``), but may fall
+back to python if C-unsupported options are specified. Currently, C-unsupported
+options include:
+
+- ``sep`` other than a single character (e.g. regex separators)
+- ``skip_footer``
+- ``sep=None`` with ``delim_whitespace=False``
+
+Specifying any of the above options will produce a ``ParserWarning`` unless the
+python engine is selected explicitly using ``engine='python'``.
+
 .. _io.store_in_csv:
 
 Writing to CSV format

diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -176,6 +176,8 @@ API Changes
 - ``.quantile`` on a ``datetime[ns]`` series now returns ``Timestamp`` instead
   of ``np.datetime64`` objects (:issue:`6810`)
 - change ``AssertionError`` to ``TypeError`` for invalid types passed to ``concat`` (:issue:`6583`)
+- Add :class:`~pandas.io.parsers.ParserWarning` class for fallback and option
+  validation warnings in :func:`read_csv`/:func:`read_table` (:issue:`6607`)
 
 Deprecations
 ~~~~~~~~~~~~
@@ -280,6 +282,9 @@ Improvements to existing features
 - Added ``how`` option to rolling-moment functions to dictate how to handle resampling; :func:`rolling_max` defaults to max,
   :func:`rolling_min` defaults to min, and all others default to mean (:issue:`6297`)
 - ``pd.stats.moments.rolling_var`` now uses Welford's method for increased numerical stability (:issue:`6817`)
+- Translate ``sep='\s+'`` to ``delim_whitespace=True`` in
+  :func:`read_csv`/:func:`read_table` if no other C-unsupported options
+  specified (:issue:`6607`)
 
 .. _release.bug_fixes-0.14.0:
 
@@ -402,6 +407,17 @@ Bug Fixes
 - Bug in `DataFrame.plot` and `Series.plot` where the legend behaves inconsistently when plotting to the same axes repeatedly (:issue:`6678`)
 - Internal tests for patching ``__finalize__`` / bug in merge not finalizing (:issue:`6923`, :issue:`6927`)
 - accept ``TextFileReader`` in ``concat``, which was affecting a common user idiom (:issue:`6583`)
+- Raise :class:`ValueError` when ``sep`` specified with
+  ``delim_whitespace=True`` in :func:`read_csv`/:func:`read_table`
+  (:issue:`6607`)
+- Raise :class:`ValueError` when ``engine='c'`` specified with unsupported
+  options (:issue:`6607`)
+- Raise :class:`ValueError` when fallback to python parser causes options to be
+  ignored (:issue:`6607`)
+- Produce :class:`~pandas.io.parsers.ParserWarning` on fallback to python
+  parser when no options are ignored (:issue:`6607`)
+- Bug in C parser with leading whitespace (:issue:`3374`)
+- Bug in C parser with ``delim_whitespace=True`` and ``\r``-delimited lines
 
 pandas 0.13.1
 -------------

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -6,6 +6,7 @@
 from pandas import compat
 import re
 import csv
+import warnings
 
 import numpy as np
 
@@ -24,6 +25,8 @@
 import pandas.tslib as tslib
 import pandas.parser as _parser
 
+class ParserWarning(Warning):
+    pass
 
 _parser_params = """Also supports optionally iterating or breaking of the file
 into chunks.
@@ -50,6 +53,7 @@
     One-character string used to escape delimiter when quoting is QUOTE_NONE.
 dtype : Type name or dict of column -> type
     Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
+    (Unsupported with engine='python')
 compression : {'gzip', 'bz2', None}, default None
     For on-the-fly decompression of on-disk data
 dialect : string or csv.Dialect instance, default None
@@ -113,7 +117,7 @@
 chunksize : int, default None
     Return TextFileReader object for iteration
 skipfooter : int, default 0
-    Number of line at bottom of file to skip
+    Number of lines at bottom of file to skip (Unsupported with engine='c')
 converters : dict. optional
     Dict of functions for converting values in certain columns. Keys can either
     be integers or column labels
@@ -125,24 +129,24 @@
     Encoding to use for UTF when reading/writing (ex. 'utf-8')
 squeeze : boolean, default False
     If the parsed data only contains one column then return a Series
-na_filter: boolean, default True
+na_filter : boolean, default True
     Detect missing value markers (empty strings and the value of na_values). In
     data without any NAs, passing na_filter=False can improve the performance
     of reading a large file
 usecols : array-like
     Return a subset of the columns.
     Results in much faster parsing time and lower memory usage.
-mangle_dupe_cols: boolean, default True
+mangle_dupe_cols : boolean, default True
     Duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X'
-tupleize_cols: boolean, default False
+tupleize_cols : boolean, default False
     Leave a list of tuples on columns as is (default is to convert to
     a Multi Index on the columns)
-error_bad_lines: boolean, default True
+error_bad_lines : boolean, default True
     Lines with too many fields (e.g. a csv line with too many commas) will by
     default cause an exception to be raised, and no DataFrame will be returned.
     If False, then these "bad lines" will be dropped from the DataFrame that is
-    returned. (Only valid with C parser).
-warn_bad_lines: boolean, default True
+    returned. (Only valid with C parser)
+warn_bad_lines : boolean, default True
     If error_bad_lines is False, and warn_bad_lines is True, a warning for each
     "bad line" will be output. (Only valid with C parser).
 infer_datetime_format : boolean, default False
@@ -154,25 +158,30 @@
 result : DataFrame or TextParser
 """
 
-_csv_sep = """sep : string, default ','
+_csv_params = """sep : string, default ','
     Delimiter to use. If sep is None, will try to automatically determine
     this. Regular expressions are accepted.
-"""
+engine : {'c', 'python'}
+    Parser engine to use. The C engine is faster while the python engine is
+    currently more feature-complete."""
 
-_table_sep = """sep : string, default \\t (tab-stop)
-    Delimiter to use. Regular expressions are accepted."""
+_table_params = """sep : string, default \\t (tab-stop)
+    Delimiter to use. Regular expressions are accepted.
+engine : {'c', 'python'}
+    Parser engine to use. The C engine is faster while the python engine is
+    currently more feature-complete."""
 
 _read_csv_doc = """
 Read CSV (comma-separated) file into DataFrame
 
 %s
-""" % (_parser_params % _csv_sep)
+""" % (_parser_params % _csv_params)
 
 _read_table_doc = """
 Read general delimited file into DataFrame
 
 %s
-""" % (_parser_params % _table_sep)
+""" % (_parser_params % _table_params)
 
 _fwf_widths = """\
 colspecs : list of pairs (int, int) or 'infer'. optional
@@ -297,6 +306,8 @@ def _read(filepath_or_buffer, kwds):
 
 def _make_parser_function(name, sep=','):
 
+    default_sep = sep
+
     def parser_f(filepath_or_buffer,
                  sep=sep,
                  dialect=None,
@@ -325,7 +336,7 @@ def parser_f(filepath_or_buffer,
                  dtype=None,
                  usecols=None,
 
-                 engine='c',
+                 engine=None,
                  delim_whitespace=False,
                  as_recarray=False,
                  na_filter=True,
@@ -362,10 +373,21 @@ def parser_f(filepath_or_buffer,
         if delimiter is None:
             delimiter = sep
 
+        if delim_whitespace and delimiter is not default_sep:
+            raise ValueError("Specified a delimiter with both sep and"\
+                    " delim_whitespace=True; you can only specify one.")
+
+        if engine is not None:
+            engine_specified = True
+        else:
+            engine = 'c'
+            engine_specified = False
+
         kwds = dict(delimiter=delimiter,
                     engine=engine,
                     dialect=dialect,
                     compression=compression,
+                    engine_specified=engine_specified,
 
                     doublequote=doublequote,
                     escapechar=escapechar,
@@ -468,10 +490,18 @@ class TextFileReader(object):
 
     """
 
-    def __init__(self, f, engine='python', **kwds):
+    def __init__(self, f, engine=None, **kwds):
 
         self.f = f
 
+        if engine is not None:
+            engine_specified = True
+        else:
+            engine = 'python'
+            engine_specified = False
+
+        self._engine_specified = kwds.get('engine_specified', engine_specified)
+
         if kwds.get('dialect') is not None:
             dialect = kwds['dialect']
             kwds['delimiter'] = dialect.delimiter
@@ -530,30 +560,60 @@ def _get_options_with_defaults(self, engine):
     def _clean_options(self, options, engine):
         result = options.copy()
 
+        engine_specified = self._engine_specified
+        fallback_reason = None
+
         sep = options['delimiter']
         delim_whitespace = options['delim_whitespace']
 
+        # C engine not supported yet
+        if engine == 'c':
+            if options['skip_footer'] > 0:
+                fallback_reason = "the 'c' engine does not support"\
+                                  " skip_footer"
+                engine = 'python'
+
         if sep is None and not delim_whitespace:
             if engine == 'c':
+                fallback_reason = "the 'c' engine does not support"\
+                                  " sep=None with delim_whitespace=False"
                 engine = 'python'
         elif sep is not None and len(sep) > 1:
-            # wait until regex engine integrated
-            if engine not in ('python', 'python-fwf'):
+            if engine == 'c' and sep == '\s+':
+                result['delim_whitespace'] = True
+                del result['delimiter']
+            elif engine not in ('python', 'python-fwf'):
+                # wait until regex engine integrated
+                fallback_reason = "the 'c' engine does not support"\
+                                  " regex separators"
                 engine = 'python'
 
-        # C engine not supported yet
-        if engine == 'c':
-            if options['skip_footer'] > 0:
-                engine = 'python'
+        if fallback_reason and engine_specified:
+            raise ValueError(fallback_reason)
 
         if engine == 'c':
             for arg in _c_unsupported:
                 del result[arg]
 
         if 'python' in engine:
             for arg in _python_unsupported:
+                if fallback_reason and result[arg] != _c_parser_defaults[arg]:
+                    msg = ("Falling back to the 'python' engine because"
+                           " {reason}, but this causes {option!r} to be"
+                           " ignored as it is not supported by the 'python'"
+                           " engine.").format(reason=fallback_reason, option=arg)
+                    if arg == 'dtype':
+                        msg += " (Note the 'converters' option provides"\
+                               " similar functionality.)"
+                    raise ValueError(msg)
                 del result[arg]
 
+        if fallback_reason:
+            warnings.warn(("Falling back to the 'python' engine because"
+                           " {0}; you can avoid this warning by specifying"
+                           " engine='python'.").format(fallback_reason),
+                          ParserWarning)
+
         index_col = options['index_col']
         names = options['names']
         converters = options['converters']

diff --git a/pandas/io/tests/test_cparser.py b/pandas/io/tests/test_cparser.py
@@ -323,6 +323,9 @@ def _test(text, **kwargs):
         data = 'A  B  C\r  2  3\r4  5  6'
         _test(data, delim_whitespace=True)
 
+        data = 'A B C\r2 3\r4 5 6'
+        _test(data, delim_whitespace=True)
+
     def test_empty_field_eof(self):
         data = 'a,b,c\n1,2,3\n4,,'