API: Warn about dups in names for read_csv

xref gh-17095.
pandas-dev · Aug 29, 2017 · 1497183 · 1497183
1 parent e8a1765
commit 1497183
Show file tree

Hide file tree

Showing 5 changed files with 77 additions and 29 deletions.
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -282,6 +282,7 @@ Other API Changes
 - The Categorical constructor no longer accepts a scalar for the ``categories`` keyword. (:issue:`16022`)
 - Accessing a non-existent attribute on a closed :class:`~pandas.HDFStore` will now
   raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`)
+- :func:`read_csv` now issues a ``FutureWarning`` if the ``names`` parameter contains duplicates (:issue:`17095`)
 - :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`)
 - :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`)
 - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`).

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -384,6 +384,38 @@ def _validate_integer(name, val, min_val=0):
     return val
 
 
+def _check_dup_names(names):
+    """
+    Check if the `names` parameter contains duplicates.
+
+    Currently, this function issues a warning if that is the case. In the
+    future, we will raise an error.
+
+    Parameters
+    ----------
+    names : array-like or None
+        An array containing a list of the names used for the output DataFrame.
+    """
+
+    if names is None:
+        return
+
+    counts = {}
+    warn_dups = False
+
+    for name in names:
+        if name in counts:
+            warn_dups = True
+            break
+
+        counts[name] = True
+
+    if warn_dups:
+        msg = ("Duplicate names specified. This "
+               "will raise an error in the future.")
+        warnings.warn(msg, FutureWarning, stacklevel=3)
+
+
 def _read(filepath_or_buffer, kwds):
     """Generic reader of line files."""
     encoding = kwds.get('encoding', None)
@@ -406,6 +438,10 @@ def _read(filepath_or_buffer, kwds):
     chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1)
     nrows = _validate_integer('nrows', kwds.get('nrows', None))
 
+    # Check for duplicates in names.
+    names = kwds.get("names", None)
+    _check_dup_names(names)
+
     # Create the parser.
     parser = TextFileReader(filepath_or_buffer, **kwds)
 

diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py
@@ -1357,20 +1357,6 @@ def test_euro_decimal_format(self):
         assert df2['Number2'].dtype == float
         assert df2['Number3'].dtype == float
 
-    def test_read_duplicate_names(self):
-        # See gh-7160
-        data = "a,b,a\n0,1,2\n3,4,5"
-        df = self.read_csv(StringIO(data))
-        expected = DataFrame([[0, 1, 2], [3, 4, 5]],
-                             columns=['a', 'b', 'a.1'])
-        tm.assert_frame_equal(df, expected)
-
-        data = "0,1,2\n3,4,5"
-        df = self.read_csv(StringIO(data), names=["a", "b", "a"])
-        expected = DataFrame([[0, 1, 2], [3, 4, 5]],
-                             columns=['a', 'b', 'a.1'])
-        tm.assert_frame_equal(df, expected)
-
     def test_inf_parsing(self):
         data = """\
 ,A

diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py
@@ -204,10 +204,11 @@ def test_empty_with_dup_column_pass_dtype_by_indexes(self):
         result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'})
         tm.assert_frame_equal(result, expected, check_index_type=False)
 
-        data = ''
-        result = self.read_csv(StringIO(data), names=['one', 'one'],
-                               dtype={0: 'u1', 1: 'f'})
-        tm.assert_frame_equal(result, expected, check_index_type=False)
+        with tm.assert_produces_warning(FutureWarning):
+            data = ''
+            result = self.read_csv(StringIO(data), names=['one', 'one'],
+                                   dtype={0: 'u1', 1: 'f'})
+            tm.assert_frame_equal(result, expected, check_index_type=False)
 
     def test_raise_on_passed_int_dtype_with_nas(self):
         # see gh-2631

diff --git a/pandas/tests/io/parser/mangle_dupes.py b/pandas/tests/io/parser/mangle_dupes.py
@@ -7,6 +7,9 @@
 """
 
 from pandas.compat import StringIO
+from pandas import DataFrame
+
+import pandas.util.testing as tm
 
 
 class DupeColumnTests(object):
@@ -25,6 +28,21 @@ def test_basic(self):
                                        mangle_dupe_cols=True)
             assert list(df.columns) == expected
 
+    def test_basic_names(self):
+        # See gh-7160
+        data = "a,b,a\n0,1,2\n3,4,5"
+        expected = DataFrame([[0, 1, 2], [3, 4, 5]],
+                             columns=["a", "b", "a.1"])
+
+        df = self.read_csv(StringIO(data))
+        tm.assert_frame_equal(df, expected)
+
+        with tm.assert_produces_warning(FutureWarning):
+            data = "0,1,2\n3,4,5"
+            df = self.read_csv(StringIO(data),
+                               names=["a", "b", "a"])
+            tm.assert_frame_equal(df, expected)
+
     def test_thorough_mangle_columns(self):
         # see gh-17060
         data = "a,a,a.1\n1,2,3"
@@ -45,20 +63,26 @@ def test_thorough_mangle_names(self):
         # see gh-17095
         data = "a,b,b\n1,2,3"
         names = ["a.1", "a.1", "a.1.1"]
-        df = self.read_csv(StringIO(data), sep=",", names=names,
-                           mangle_dupe_cols=True)
-        assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"]
+
+        with tm.assert_produces_warning(FutureWarning):
+            df = self.read_csv(StringIO(data), sep=",", names=names,
+                               mangle_dupe_cols=True)
+            assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"]
 
         data = "a,b,c,d,e,f\n1,2,3,4,5,6"
         names = ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"]
-        df = self.read_csv(StringIO(data), sep=",", names=names,
-                           mangle_dupe_cols=True)
-        assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1",
-                                    "a.1.1.1.1", "a.1.1.1.1.1"]
+
+        with tm.assert_produces_warning(FutureWarning):
+            df = self.read_csv(StringIO(data), sep=",", names=names,
+                               mangle_dupe_cols=True)
+            assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1",
+                                        "a.1.1.1.1", "a.1.1.1.1.1"]
 
         data = "a,b,c,d,e,f,g\n1,2,3,4,5,6,7"
         names = ["a", "a", "a.3", "a.1", "a.2", "a", "a"]
-        df = self.read_csv(StringIO(data), sep=",", names=names,
-                           mangle_dupe_cols=True)
-        assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
-                                    "a.2", "a.2.1", "a.3.1"]
+
+        with tm.assert_produces_warning(FutureWarning):
+            df = self.read_csv(StringIO(data), sep=",", names=names,
+                               mangle_dupe_cols=True)
+            assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
+                                        "a.2", "a.2.1", "a.3.1"]