BUG: pandas-dev#7757 Fix CSV parsing of singleton list header

Fix header list manipulation resulting in an all-NaN DataFrame. Write new test for the bug. Update what's new.
threecgreen · Aug 3, 2017 · 01c07bc · 01c07bc
1 parent f9a552d
commit 01c07bc
Show file tree

Hide file tree

Showing 4 changed files with 24 additions and 12 deletions.
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -283,6 +283,7 @@ I/O
 - Bug in :func:`read_csv` in which non integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`)
 - Bug in :func:`read_csv` in which memory management issues in exception handling, under certain conditions, would cause the interpreter to segfault (:issue:`14696, :issue:`16798`).
 - Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`).
+- Bug in :func:`read_csv` when called with a single-element list ``header`` would return a ``DataFrame`` of all NaN values (:issue:`7757`)
 - Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`)
 - Bug in :func:`read_html` where import check fails when run in multiple threads (:issue:`16928`)
 

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -535,23 +535,26 @@ cdef class TextReader:
             self.parser_start = 0
             self.header = []
         else:
-            if isinstance(header, list) and len(header):
-                # need to artifically skip the final line
-                # which is still a header line
-                header = list(header)
-                header.append(header[-1] + 1)
+            if isinstance(header, list):
+                if len(header) > 1:
+                    # need to artificially skip the final line
+                    # which is still a header line
+                    header = list(header)
+                    header.append(header[-1] + 1)
+                    self.parser.header_end = header[-1]
+                    self.has_mi_columns = 1
+                else:
+                    self.parser.header_end = header[0]
 
+                self.parser_start = header[-1] + 1
                 self.parser.header_start = header[0]
-                self.parser.header_end = header[-1]
                 self.parser.header = header[0]
-                self.parser_start = header[-1] + 1
-                self.has_mi_columns = 1
                 self.header = header
             else:
                 self.parser.header_start = header
                 self.parser.header_end = header
-                self.parser.header = header
                 self.parser_start = header + 1
+                self.parser.header = header
                 self.header = [ header ]
 
         self.names = names

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -2279,10 +2279,11 @@ def _infer_columns(self):
         if self.header is not None:
             header = self.header
 
-            # we have a mi columns, so read an extra line
             if isinstance(header, (list, tuple, np.ndarray)):
-                have_mi_columns = True
-                header = list(header) + [header[-1] + 1]
+                have_mi_columns = len(header) > 1
+                # we have MultiIndex columns, so read an extra line
+                if have_mi_columns:
+                    header = list(header) + [header[-1] + 1]
             else:
                 have_mi_columns = False
                 header = [header]

diff --git a/pandas/tests/io/parser/header.py b/pandas/tests/io/parser/header.py
@@ -286,3 +286,10 @@ def test_non_int_header(self):
             self.read_csv(StringIO(data), sep=',', header=['a', 'b'])
         with tm.assert_raises_regex(ValueError, msg):
             self.read_csv(StringIO(data), sep=',', header='string_header')
+
+    def test_singleton_header(self):
+        # See GH #7757
+        data = """a,b,c\n0,1,2\n1,2,3"""
+        df = self.read_csv(StringIO(data), header=[0])
+        expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
+        tm.assert_frame_equal(df, expected)