From 9fcffa7932f3f780714e6c9bc07f78a434172cf8 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 25 Aug 2017 23:02:28 -0700 Subject: [PATCH] API: Warn about dups in names for read_csv xref gh-17095. --- doc/source/io.rst | 4 +-- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/io/parsers.py | 33 ++++++++++++++++-- pandas/tests/io/parser/common.py | 14 -------- pandas/tests/io/parser/dtypes.py | 9 ++--- pandas/tests/io/parser/mangle_dupes.py | 46 ++++++++++++++++++++------ 6 files changed, 74 insertions(+), 33 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index ab1ad74ee8516..d6abed6e9d1ad 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -113,8 +113,8 @@ header : int or list of ints, default ``'infer'`` rather than the first line of the file. names : array-like, default ``None`` List of column names to use. If file contains no header row, then you should - explicitly pass ``header=None``. Duplicates in this list are not allowed unless - ``mangle_dupe_cols=True``, which is the default. + explicitly pass ``header=None``. Duplicates in this list will cause + a ``UserWarning`` to be issued. index_col : int or sequence or ``False``, default ``None`` Column to use as the row labels of the DataFrame. If a sequence is given, a MultiIndex is used. If you have a malformed file with delimiters at the end of diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 5003aa0d97c1c..41d38eea94e33 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -422,6 +422,7 @@ Other API Changes - The Categorical constructor no longer accepts a scalar for the ``categories`` keyword. (:issue:`16022`) - Accessing a non-existent attribute on a closed :class:`~pandas.HDFStore` will now raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`) +- :func:`read_csv` now issues a ``UserWarning`` if the ``names`` parameter contains duplicates (:issue:`17095`) - :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`) - :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`) - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`). diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index d9e83176d0d6e..ed15d4295d688 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -84,8 +84,8 @@ rather than the first line of the file. names : array-like, default None List of column names to use. If file contains no header row, then you - should explicitly pass header=None. Duplicates in this list are not - allowed unless mangle_dupe_cols=True, which is the default. + should explicitly pass header=None. Duplicates in this list will cause + a ``UserWarning`` to be issued. index_col : int or sequence or False, default None Column to use as the row labels of the DataFrame. If a sequence is given, a MultiIndex is used. If you have a malformed file with delimiters at the end @@ -385,6 +385,32 @@ def _validate_integer(name, val, min_val=0): return val +def _validate_names(names): + """ + Check if the `names` parameter contains duplicates. + + If duplicates are found, we issue a warning before returning. + + Parameters + ---------- + names : array-like or None + An array containing a list of the names used for the output DataFrame. + + Returns + ------- + names : array-like or None + The original `names` parameter. + """ + + if names is not None: + if len(names) != len(set(names)): + msg = ("Duplicate names specified. This " + "will raise an error in the future.") + warnings.warn(msg, UserWarning, stacklevel=3) + + return names + + def _read(filepath_or_buffer, kwds): """Generic reader of line files.""" encoding = kwds.get('encoding', None) @@ -407,6 +433,9 @@ def _read(filepath_or_buffer, kwds): chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1) nrows = _validate_integer('nrows', kwds.get('nrows', None)) + # Check for duplicates in names. + _validate_names(kwds.get("names", None)) + # Create the parser. parser = TextFileReader(filepath_or_buffer, **kwds) diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index cfc4a1d7c55eb..e85d3ad294655 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -1357,20 +1357,6 @@ def test_euro_decimal_format(self): assert df2['Number2'].dtype == float assert df2['Number3'].dtype == float - def test_read_duplicate_names(self): - # See gh-7160 - data = "a,b,a\n0,1,2\n3,4,5" - df = self.read_csv(StringIO(data)) - expected = DataFrame([[0, 1, 2], [3, 4, 5]], - columns=['a', 'b', 'a.1']) - tm.assert_frame_equal(df, expected) - - data = "0,1,2\n3,4,5" - df = self.read_csv(StringIO(data), names=["a", "b", "a"]) - expected = DataFrame([[0, 1, 2], [3, 4, 5]], - columns=['a', 'b', 'a.1']) - tm.assert_frame_equal(df, expected) - def test_inf_parsing(self): data = """\ ,A diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py index 7311c9200f269..402fa0817595c 100644 --- a/pandas/tests/io/parser/dtypes.py +++ b/pandas/tests/io/parser/dtypes.py @@ -204,10 +204,11 @@ def test_empty_with_dup_column_pass_dtype_by_indexes(self): result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) tm.assert_frame_equal(result, expected, check_index_type=False) - data = '' - result = self.read_csv(StringIO(data), names=['one', 'one'], - dtype={0: 'u1', 1: 'f'}) - tm.assert_frame_equal(result, expected, check_index_type=False) + with tm.assert_produces_warning(UserWarning, check_stacklevel=False): + data = '' + result = self.read_csv(StringIO(data), names=['one', 'one'], + dtype={0: 'u1', 1: 'f'}) + tm.assert_frame_equal(result, expected, check_index_type=False) def test_raise_on_passed_int_dtype_with_nas(self): # see gh-2631 diff --git a/pandas/tests/io/parser/mangle_dupes.py b/pandas/tests/io/parser/mangle_dupes.py index e2efb1377f8b0..6df69eb475bf7 100644 --- a/pandas/tests/io/parser/mangle_dupes.py +++ b/pandas/tests/io/parser/mangle_dupes.py @@ -7,6 +7,9 @@ """ from pandas.compat import StringIO +from pandas import DataFrame + +import pandas.util.testing as tm class DupeColumnTests(object): @@ -25,6 +28,21 @@ def test_basic(self): mangle_dupe_cols=True) assert list(df.columns) == expected + def test_basic_names(self): + # See gh-7160 + data = "a,b,a\n0,1,2\n3,4,5" + expected = DataFrame([[0, 1, 2], [3, 4, 5]], + columns=["a", "b", "a.1"]) + + df = self.read_csv(StringIO(data)) + tm.assert_frame_equal(df, expected) + + with tm.assert_produces_warning(UserWarning, check_stacklevel=False): + data = "0,1,2\n3,4,5" + df = self.read_csv(StringIO(data), + names=["a", "b", "a"]) + tm.assert_frame_equal(df, expected) + def test_thorough_mangle_columns(self): # see gh-17060 data = "a,a,a.1\n1,2,3" @@ -45,20 +63,26 @@ def test_thorough_mangle_names(self): # see gh-17095 data = "a,b,b\n1,2,3" names = ["a.1", "a.1", "a.1.1"] - df = self.read_csv(StringIO(data), sep=",", names=names, - mangle_dupe_cols=True) - assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"] + + with tm.assert_produces_warning(UserWarning, check_stacklevel=False): + df = self.read_csv(StringIO(data), sep=",", names=names, + mangle_dupe_cols=True) + assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"] data = "a,b,c,d,e,f\n1,2,3,4,5,6" names = ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"] - df = self.read_csv(StringIO(data), sep=",", names=names, - mangle_dupe_cols=True) - assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1", - "a.1.1.1.1", "a.1.1.1.1.1"] + + with tm.assert_produces_warning(UserWarning, check_stacklevel=False): + df = self.read_csv(StringIO(data), sep=",", names=names, + mangle_dupe_cols=True) + assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1", + "a.1.1.1.1", "a.1.1.1.1.1"] data = "a,b,c,d,e,f,g\n1,2,3,4,5,6,7" names = ["a", "a", "a.3", "a.1", "a.2", "a", "a"] - df = self.read_csv(StringIO(data), sep=",", names=names, - mangle_dupe_cols=True) - assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1", - "a.2", "a.2.1", "a.3.1"] + + with tm.assert_produces_warning(UserWarning, check_stacklevel=False): + df = self.read_csv(StringIO(data), sep=",", names=names, + mangle_dupe_cols=True) + assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1", + "a.2", "a.2.1", "a.3.1"]