Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API: Warn about dups in names for read_csv #17346

Merged
merged 1 commit into from
Sep 24, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,8 @@ header : int or list of ints, default ``'infer'``
rather than the first line of the file.
names : array-like, default ``None``
List of column names to use. If file contains no header row, then you should
explicitly pass ``header=None``. Duplicates in this list are not allowed unless
``mangle_dupe_cols=True``, which is the default.
explicitly pass ``header=None``. Duplicates in this list will cause
a ``UserWarning`` to be issued.
index_col : int or sequence or ``False``, default ``None``
Column to use as the row labels of the DataFrame. If a sequence is given, a
MultiIndex is used. If you have a malformed file with delimiters at the end of
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,7 @@ Other API Changes
- The Categorical constructor no longer accepts a scalar for the ``categories`` keyword. (:issue:`16022`)
- Accessing a non-existent attribute on a closed :class:`~pandas.HDFStore` will now
raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`)
- :func:`read_csv` now issues a ``UserWarning`` if the ``names`` parameter contains duplicates (:issue:`17095`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should be FutureWarning

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doh! My bad for not catching that. Fixed.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changing back to UserWarning in light of later discussion.

- :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`)
- :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`)
- :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`).
Expand Down
33 changes: 31 additions & 2 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,8 @@
rather than the first line of the file.
names : array-like, default None
List of column names to use. If file contains no header row, then you
should explicitly pass header=None. Duplicates in this list are not
allowed unless mangle_dupe_cols=True, which is the default.
should explicitly pass header=None. Duplicates in this list will cause
a ``UserWarning`` to be issued.
index_col : int or sequence or False, default None
Column to use as the row labels of the DataFrame. If a sequence is given, a
MultiIndex is used. If you have a malformed file with delimiters at the end
Expand Down Expand Up @@ -385,6 +385,32 @@ def _validate_integer(name, val, min_val=0):
return val


def _validate_names(names):
    """
    Warn when the `names` parameter holds duplicate entries.

    The input is never modified: the very same object is handed back,
    whether or not a warning was emitted along the way.

    Parameters
    ----------
    names : array-like or None
        The column names requested for the output DataFrame.

    Returns
    -------
    names : array-like or None
        The `names` object exactly as it was passed in.
    """

    # Nothing to inspect when no names were supplied.
    if names is None:
        return names

    # Collapsing to a set drops repeats, so a size mismatch means
    # at least one name occurs more than once.
    has_duplicates = len(set(names)) != len(names)

    if has_duplicates:
        # stacklevel=3 attributes the warning to the caller's caller
        # rather than to this helper or the function invoking it.
        warnings.warn("Duplicate names specified. This "
                      "will raise an error in the future.",
                      UserWarning, stacklevel=3)

    return names


def _read(filepath_or_buffer, kwds):
"""Generic reader of line files."""
encoding = kwds.get('encoding', None)
Expand All @@ -407,6 +433,9 @@ def _read(filepath_or_buffer, kwds):
chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1)
nrows = _validate_integer('nrows', kwds.get('nrows', None))

# Check for duplicates in names.
_validate_names(kwds.get("names", None))

# Create the parser.
parser = TextFileReader(filepath_or_buffer, **kwds)

Expand Down
14 changes: 0 additions & 14 deletions pandas/tests/io/parser/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1357,20 +1357,6 @@ def test_euro_decimal_format(self):
assert df2['Number2'].dtype == float
assert df2['Number3'].dtype == float

def test_read_duplicate_names(self):
    # See gh-7160
    # Both scenarios below are compared against this one frame, in which
    # the repeated "a" label appears as "a.1".
    expected = DataFrame([[0, 1, 2], [3, 4, 5]],
                         columns=['a', 'b', 'a.1'])

    # Scenario 1: the duplicate label sits in the header row of the data.
    with_header = "a,b,a\n0,1,2\n3,4,5"
    result = self.read_csv(StringIO(with_header))
    tm.assert_frame_equal(result, expected)

    # Scenario 2: the duplicate label is supplied through `names`.
    without_header = "0,1,2\n3,4,5"
    result = self.read_csv(StringIO(without_header), names=["a", "b", "a"])
    tm.assert_frame_equal(result, expected)

def test_inf_parsing(self):
data = """\
,A
Expand Down
9 changes: 5 additions & 4 deletions pandas/tests/io/parser/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,10 +204,11 @@ def test_empty_with_dup_column_pass_dtype_by_indexes(self):
result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'})
tm.assert_frame_equal(result, expected, check_index_type=False)

data = ''
result = self.read_csv(StringIO(data), names=['one', 'one'],
dtype={0: 'u1', 1: 'f'})
tm.assert_frame_equal(result, expected, check_index_type=False)
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
data = ''
result = self.read_csv(StringIO(data), names=['one', 'one'],
dtype={0: 'u1', 1: 'f'})
tm.assert_frame_equal(result, expected, check_index_type=False)

def test_raise_on_passed_int_dtype_with_nas(self):
# see gh-2631
Expand Down
46 changes: 35 additions & 11 deletions pandas/tests/io/parser/mangle_dupes.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
"""

from pandas.compat import StringIO
from pandas import DataFrame

import pandas.util.testing as tm


class DupeColumnTests(object):
Expand All @@ -25,6 +28,21 @@ def test_basic(self):
mangle_dupe_cols=True)
assert list(df.columns) == expected

def test_basic_names(self):
    # See gh-7160
    expected = DataFrame([[0, 1, 2], [3, 4, 5]],
                         columns=["a", "b", "a.1"])

    # Duplicate labels in the header row of the data itself.
    header_data = "a,b,a\n0,1,2\n3,4,5"
    result = self.read_csv(StringIO(header_data))
    tm.assert_frame_equal(result, expected)

    # Duplicate labels passed via `names`: a UserWarning must be produced
    # and the parsed frame must still match `expected`.
    headerless_data = "0,1,2\n3,4,5"
    dup_names = ["a", "b", "a"]
    with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
        result = self.read_csv(StringIO(headerless_data), names=dup_names)
        tm.assert_frame_equal(result, expected)

def test_thorough_mangle_columns(self):
# see gh-17060
data = "a,a,a.1\n1,2,3"
Expand All @@ -45,20 +63,26 @@ def test_thorough_mangle_names(self):
# see gh-17095
data = "a,b,b\n1,2,3"
names = ["a.1", "a.1", "a.1.1"]
df = self.read_csv(StringIO(data), sep=",", names=names,
mangle_dupe_cols=True)
assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"]

with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
df = self.read_csv(StringIO(data), sep=",", names=names,
mangle_dupe_cols=True)
assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"]

data = "a,b,c,d,e,f\n1,2,3,4,5,6"
names = ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"]
df = self.read_csv(StringIO(data), sep=",", names=names,
mangle_dupe_cols=True)
assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1",
"a.1.1.1.1", "a.1.1.1.1.1"]

with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
df = self.read_csv(StringIO(data), sep=",", names=names,
mangle_dupe_cols=True)
assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1",
"a.1.1.1.1", "a.1.1.1.1.1"]

data = "a,b,c,d,e,f,g\n1,2,3,4,5,6,7"
names = ["a", "a", "a.3", "a.1", "a.2", "a", "a"]
df = self.read_csv(StringIO(data), sep=",", names=names,
mangle_dupe_cols=True)
assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
"a.2", "a.2.1", "a.3.1"]

with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
df = self.read_csv(StringIO(data), sep=",", names=names,
mangle_dupe_cols=True)
assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
"a.2", "a.2.1", "a.3.1"]