From b1a7a4a68b536e423f90c5a2bc80319702ffea51 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 25 Aug 2017 23:02:28 -0700 Subject: [PATCH] API: Warn about dups in names for read_csv xref gh-17095. --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/io/parsers.py | 36 ++++++++++++++++++++ pandas/tests/io/parser/common.py | 14 -------- pandas/tests/io/parser/dtypes.py | 9 ++--- pandas/tests/io/parser/mangle_dupes.py | 46 ++++++++++++++++++++------ 5 files changed, 77 insertions(+), 29 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 942e37a29f8d57..61a2bfe9ec3a31 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -282,6 +282,7 @@ Other API Changes - The Categorical constructor no longer accepts a scalar for the ``categories`` keyword. (:issue:`16022`) - Accessing a non-existent attribute on a closed :class:`~pandas.HDFStore` will now raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`) +- :func:`read_csv` now issues a ``FutureWarning`` if the ``names`` parameter contains duplicates (:issue:`17095`) - :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`) - :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`) - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`). diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a9821be3fa5e2d..bec66134eae57d 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -384,6 +384,38 @@ def _validate_integer(name, val, min_val=0): return val +def _check_dup_names(names): + """ + Check if the `names` parameter contains duplicates. + + Currently, this function issues a warning if that is the case. In the + future, we will raise an error. 
+ + Parameters + ---------- + names : array-like or None + An array containing a list of the names used for the output DataFrame. + """ + + if names is None: + return + + counts = {} + warn_dups = False + + for name in names: + if name in counts: + warn_dups = True + break + + counts[name] = True + + if warn_dups: + msg = ("Duplicate names specified. This " + "will raise an error in the future.") + warnings.warn(msg, FutureWarning, stacklevel=3) + + def _read(filepath_or_buffer, kwds): """Generic reader of line files.""" encoding = kwds.get('encoding', None) @@ -406,6 +438,10 @@ def _read(filepath_or_buffer, kwds): chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1) nrows = _validate_integer('nrows', kwds.get('nrows', None)) + # Check for duplicates in names. + names = kwds.get("names", None) + _check_dup_names(names) + # Create the parser. parser = TextFileReader(filepath_or_buffer, **kwds) diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index cfc4a1d7c55eb0..e85d3ad294655c 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -1357,20 +1357,6 @@ def test_euro_decimal_format(self): assert df2['Number2'].dtype == float assert df2['Number3'].dtype == float - def test_read_duplicate_names(self): - # See gh-7160 - data = "a,b,a\n0,1,2\n3,4,5" - df = self.read_csv(StringIO(data)) - expected = DataFrame([[0, 1, 2], [3, 4, 5]], - columns=['a', 'b', 'a.1']) - tm.assert_frame_equal(df, expected) - - data = "0,1,2\n3,4,5" - df = self.read_csv(StringIO(data), names=["a", "b", "a"]) - expected = DataFrame([[0, 1, 2], [3, 4, 5]], - columns=['a', 'b', 'a.1']) - tm.assert_frame_equal(df, expected) - def test_inf_parsing(self): data = """\ ,A diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py index 7311c9200f269a..0cb8db39763a88 100644 --- a/pandas/tests/io/parser/dtypes.py +++ b/pandas/tests/io/parser/dtypes.py @@ -204,10 +204,11 @@ def 
test_empty_with_dup_column_pass_dtype_by_indexes(self): result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) tm.assert_frame_equal(result, expected, check_index_type=False) - data = '' - result = self.read_csv(StringIO(data), names=['one', 'one'], - dtype={0: 'u1', 1: 'f'}) - tm.assert_frame_equal(result, expected, check_index_type=False) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + data = '' + result = self.read_csv(StringIO(data), names=['one', 'one'], + dtype={0: 'u1', 1: 'f'}) + tm.assert_frame_equal(result, expected, check_index_type=False) def test_raise_on_passed_int_dtype_with_nas(self): # see gh-2631 diff --git a/pandas/tests/io/parser/mangle_dupes.py b/pandas/tests/io/parser/mangle_dupes.py index e2efb1377f8b0a..35ccf2df724ace 100644 --- a/pandas/tests/io/parser/mangle_dupes.py +++ b/pandas/tests/io/parser/mangle_dupes.py @@ -7,6 +7,9 @@ """ from pandas.compat import StringIO +from pandas import DataFrame + +import pandas.util.testing as tm class DupeColumnTests(object): @@ -25,6 +28,21 @@ def test_basic(self): mangle_dupe_cols=True) assert list(df.columns) == expected + def test_basic_names(self): + # See gh-7160 + data = "a,b,a\n0,1,2\n3,4,5" + expected = DataFrame([[0, 1, 2], [3, 4, 5]], + columns=["a", "b", "a.1"]) + + df = self.read_csv(StringIO(data)) + tm.assert_frame_equal(df, expected) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + data = "0,1,2\n3,4,5" + df = self.read_csv(StringIO(data), + names=["a", "b", "a"]) + tm.assert_frame_equal(df, expected) + def test_thorough_mangle_columns(self): # see gh-17060 data = "a,a,a.1\n1,2,3" @@ -45,20 +63,26 @@ def test_thorough_mangle_names(self): # see gh-17095 data = "a,b,b\n1,2,3" names = ["a.1", "a.1", "a.1.1"] - df = self.read_csv(StringIO(data), sep=",", names=names, - mangle_dupe_cols=True) - assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"] + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + df = 
self.read_csv(StringIO(data), sep=",", names=names, + mangle_dupe_cols=True) + assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"] data = "a,b,c,d,e,f\n1,2,3,4,5,6" names = ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"] - df = self.read_csv(StringIO(data), sep=",", names=names, - mangle_dupe_cols=True) - assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1", - "a.1.1.1.1", "a.1.1.1.1.1"] + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + df = self.read_csv(StringIO(data), sep=",", names=names, + mangle_dupe_cols=True) + assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1", + "a.1.1.1.1", "a.1.1.1.1.1"] data = "a,b,c,d,e,f,g\n1,2,3,4,5,6,7" names = ["a", "a", "a.3", "a.1", "a.2", "a", "a"] - df = self.read_csv(StringIO(data), sep=",", names=names, - mangle_dupe_cols=True) - assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1", - "a.2", "a.2.1", "a.3.1"] + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + df = self.read_csv(StringIO(data), sep=",", names=names, + mangle_dupe_cols=True) + assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1", + "a.2", "a.2.1", "a.3.1"]