Skip to content

Commit

Permalink
BUG: Thoroughly dedup columns in read_csv
Browse files Browse the repository at this point in the history
  • Loading branch information
gfyoung committed Jul 24, 2017
1 parent e7c10bb commit 97636ec
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 28 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -263,11 +263,11 @@ Indexing
I/O
^^^

- Bug in :func:`read_csv` in which columns were not being thoroughly de-duplicated (:issue:`17060`)
- Bug in :func:`read_csv` in which non integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`)
- Bug in :func:`read_csv` in which memory management issues in exception handling, under certain conditions, would cause the interpreter to segfault (:issue:`14696`, :issue:`16798`).
- Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`).
- Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`)

- Bug in :func:`read_html` where import check fails when run in multiple threads (:issue:`16928`)

Plotting
Expand Down
13 changes: 8 additions & 5 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -788,11 +788,14 @@ cdef class TextReader:
unnamed_count += 1

count = counts.get(name, 0)
if (count > 0 and self.mangle_dupe_cols
and not self.has_mi_columns):
this_header.append('%s.%d' % (name, count))
else:
this_header.append(name)

if not self.has_mi_columns and self.mangle_dupe_cols:
while count > 0:
counts[name] = count + 1
name = '%s.%d' % (name, count)
count = counts.get(name, 0)

this_header.append(name)
counts[name] = count + 1

if self.has_mi_columns:
Expand Down
10 changes: 8 additions & 2 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2331,10 +2331,16 @@ def _infer_columns(self):

if not have_mi_columns and self.mangle_dupe_cols:
counts = {}

for i, col in enumerate(this_columns):
cur_count = counts.get(col, 0)
if cur_count > 0:
this_columns[i] = '%s.%d' % (col, cur_count)

while cur_count > 0:
counts[col] = cur_count + 1
col = "%s.%d" % (col, cur_count)
cur_count = counts.get(col, 0)

this_columns[i] = col
counts[col] = cur_count + 1
elif have_mi_columns:

Expand Down
19 changes: 0 additions & 19 deletions pandas/tests/io/parser/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,25 +224,6 @@ def test_unnamed_columns(self):
Index(['A', 'B', 'C', 'Unnamed: 3',
'Unnamed: 4']))

def test_duplicate_columns(self):
    """Repeated header names get ``.N`` suffixes, by default and on request."""
    # TODO: add test for condition 'mangle_dupe_cols=False'
    # once it is actually supported (gh-12935)
    data = """A,A,B,B,B
1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    wanted = ['A', 'A.1', 'B', 'B.1', 'B.2']

    # Both reader entry points are looked up by name on the test instance.
    for reader_name in ('read_csv', 'read_table'):
        # Default behavior: no mangling option passed.
        frame = getattr(self, reader_name)(StringIO(data), sep=',')
        assert list(frame.columns) == wanted

        # Same outcome when mangling is requested explicitly.
        frame = getattr(self, reader_name)(StringIO(data), sep=',',
                                           mangle_dupe_cols=True)
        assert list(frame.columns) == wanted

def test_csv_mixed_type(self):
data = """A,B,C
a,1,2
Expand Down
41 changes: 41 additions & 0 deletions pandas/tests/io/parser/mangle_dupes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-

"""
Tests that duplicate columns are handled appropriately when parsed by the
CSV engine. They are either ignored or are thoroughly de-duplicated.
"""

from pandas.compat import StringIO


class DupeColumnTests(object):
    """
    Mixin of checks on how duplicated header names are renamed.

    The tests call ``self.read_csv`` and ``self.read_table``, neither of
    which is defined here; the class that uses this mixin must supply them.
    """

    def test_basic(self):
        """Plain repeats are numbered ``.1``, ``.2``, ... in order."""
        # TODO: add test for condition "mangle_dupe_cols=False"
        # once it is actually supported (gh-12935)
        csv_text = "a,a,b,b,b\n1,2,3,4,5"
        wanted = ["a", "a.1", "b", "b.1", "b.2"]

        for reader_name in ("read_csv", "read_table"):
            # Default behavior: no mangling option passed.
            frame = getattr(self, reader_name)(StringIO(csv_text), sep=",")
            assert list(frame.columns) == wanted

            # Same outcome when mangling is requested explicitly.
            frame = getattr(self, reader_name)(StringIO(csv_text), sep=",",
                                               mangle_dupe_cols=True)
            assert list(frame.columns) == wanted

    def test_thorough_mangle(self):
        """A generated name must not collide with a name already present."""
        # see gh-17060
        cases = (
            ("a,a,a.1\n1,2,3",
             ["a", "a.1", "a.1.1"]),
            ("a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6",
             ["a", "a.1", "a.1.1", "a.1.1.1",
              "a.1.1.1.1", "a.1.1.1.1.1"]),
            ("a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7",
             ["a", "a.1", "a.3", "a.1.1",
              "a.2", "a.2.1", "a.3.1"]),
        )

        # Each case pairs raw CSV text with the column labels expected
        # after de-duplication.
        for csv_text, wanted in cases:
            frame = self.read_csv(StringIO(csv_text), sep=",",
                                  mangle_dupe_cols=True)
            assert list(frame.columns) == wanted
4 changes: 3 additions & 1 deletion pandas/tests/io/parser/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,20 @@
from .c_parser_only import CParserTests
from .parse_dates import ParseDatesTests
from .compression import CompressionTests
from .mangle_dupes import DupeColumnTests
from .multithread import MultithreadTests
from .python_parser_only import PythonParserTests
from .dtypes import DtypeTests


class BaseParser(CommentTests, CompressionTests,
ConverterTests, DialectTests,
DtypeTests, DupeColumnTests,
HeaderTests, IndexColTests,
MultithreadTests, NAvaluesTests,
ParseDatesTests, ParserTests,
SkipRowsTests, UsecolsTests,
QuotingTests, DtypeTests):
QuotingTests):

def read_csv(self, *args, **kwargs):
raise NotImplementedError
Expand Down

0 comments on commit 97636ec

Please sign in to comment.