diff --git a/doc/source/release.rst b/doc/source/release.rst index ce08a1ca0a175..056292322c297 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -480,6 +480,7 @@ Bug Fixes - Fixed wrong check for overlapping in ``DatetimeIndex.union`` (:issue:`4564`) - Fixed conflict between thousands separator and date parser in csv_parser (:issue:`4678`) - Fix appending when dtypes are not the same (error showing mixing float/np.datetime64) (:issue:`4993`) + - Fixed wrong index name during ``read_csv`` if using ``usecols``. Applies to the C parser only. (:issue:`4201`) pandas 0.12.0 ------------- diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 21b791d2a1acc..426d71b05e30a 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2,7 +2,7 @@ Module contains tools for processing files into DataFrames or other objects """ from __future__ import print_function -from pandas.compat import range, lrange, StringIO, lzip, zip +from pandas.compat import range, lrange, StringIO, lzip, zip, string_types from pandas import compat import re import csv @@ -15,7 +15,6 @@ import datetime import pandas.core.common as com from pandas.core.config import get_option -from pandas import compat from pandas.io.date_converters import generic_parser from pandas.io.common import get_filepath_or_buffer @@ -24,7 +23,7 @@ import pandas.lib as lib import pandas.tslib as tslib import pandas.parser as _parser -from pandas.tseries.period import Period + _parser_params = """Also supports optionally iterating or breaking of the file into chunks. @@ -982,7 +981,19 @@ def __init__(self, src, **kwds): else: self.names = lrange(self._reader.table_width) - # XXX + # If the names were inferred (not passed by user) and usecols is defined, + # then ensure names refers to the used columns, not the document's columns. 
+ if self.usecols and passed_names: + col_indices = [] + for u in self.usecols: + if isinstance(u, string_types): + col_indices.append(self.names.index(u)) + else: + col_indices.append(u) + self.names = [n for i, n in enumerate(self.names) if i in col_indices] + if len(self.names) < len(self.usecols): + raise ValueError("Usecols do not match names.") + self._set_noconvert_columns() self.orig_names = self.names diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 48c47238aec6f..fadf70877409f 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -1865,6 +1865,32 @@ def test_parse_integers_above_fp_precision(self): self.assertTrue(np.array_equal(result['Numbers'], expected['Numbers'])) + def test_usecols_index_col_conflict(self): + # Issue 4201 Test that index_col as integer reflects usecols + data = """SecId,Time,Price,P2,P3 +10000,2013-5-11,100,10,1 +500,2013-5-12,101,11,1 +""" + expected = DataFrame({'Price': [100, 101]}, index=[datetime(2013, 5, 11), datetime(2013, 5, 12)]) + expected.index.name = 'Time' + + df = pd.read_csv(StringIO(data), usecols=['Time', 'Price'], parse_dates=True, index_col=0) + tm.assert_frame_equal(expected, df) + + df = pd.read_csv(StringIO(data), usecols=['Time', 'Price'], parse_dates=True, index_col='Time') + tm.assert_frame_equal(expected, df) + + df = pd.read_csv(StringIO(data), usecols=[1, 2], parse_dates=True, index_col='Time') + tm.assert_frame_equal(expected, df) + + df = pd.read_csv(StringIO(data), usecols=[1, 2], parse_dates=True, index_col=0) + tm.assert_frame_equal(expected, df) + + expected = DataFrame({'P3': [1, 1], 'Price': (100, 101), 'P2': (10, 11)}) + expected = expected.set_index(['Price', 'P2']) + df = pd.read_csv(StringIO(data), usecols=['Price', 'P2', 'P3'], parse_dates=True, index_col=['Price', 'P2']) + tm.assert_frame_equal(expected, df) + class TestPythonParser(ParserTests, unittest.TestCase):