diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 33b7e128ef8bf..87896778bea14 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -273,6 +273,30 @@ named ``.isna()`` and ``.notna()``, these are included for classes ``Categorical The configuration option ``pd.options.mode.use_inf_as_null`` is deprecated, and ``pd.options.mode.use_inf_as_na`` is added as a replacement. +.. _whatsnew_0210.api.multiindex_single: + +MultiIndex Constructor with a Single Level +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``MultiIndex`` constructors no longer squeeze a MultiIndex with a single +level down to a regular ``Index``. This affects all the +``MultiIndex`` constructors. (:issue:`17178`) + +Previous behavior: + +.. code-block:: ipython + + In [2]: pd.MultiIndex.from_tuples([('a',), ('b',)]) + Out[2]: Index(['a', 'b'], dtype='object') + +A single level is no longer special-cased. It behaves exactly as if you had +two or more levels, so a :class:`MultiIndex` is always returned from all of the +``MultiIndex`` constructors: + +.. ipython:: python + + pd.MultiIndex.from_tuples([('a',), ('b',)]) + ..
_whatsnew_0210.api: Other API Changes diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b5b3df64d24c0..5991ec825c841 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -67,7 +67,8 @@ _dict_compat, standardize_mapping) from pandas.core.generic import NDFrame, _shared_docs -from pandas.core.index import Index, MultiIndex, _ensure_index +from pandas.core.index import (Index, MultiIndex, _ensure_index, + _ensure_index_from_sequences) from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable, check_bool_indexer) from pandas.core.internals import (BlockManager, @@ -1155,9 +1156,9 @@ def from_records(cls, data, index=None, exclude=None, columns=None, else: try: to_remove = [arr_columns.get_loc(field) for field in index] - - result_index = MultiIndex.from_arrays( - [arrays[i] for i in to_remove], names=index) + index_data = [arrays[i] for i in to_remove] + result_index = _ensure_index_from_sequences(index_data, + names=index) exclude.update(index) except Exception: @@ -3000,7 +3001,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False, to_remove.append(col) arrays.append(level) - index = MultiIndex.from_arrays(arrays, names=names) + index = _ensure_index_from_sequences(arrays, names) if verify_integrity and not index.is_unique: duplicates = index.get_duplicates() diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 323d50166e7b6..d20a0b0a2c73d 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -1,6 +1,9 @@ -from pandas.core.indexes.base import (Index, _new_Index, # noqa - _ensure_index, _get_na_value, - InvalidIndexError) +from pandas.core.indexes.base import (Index, + _new_Index, + _ensure_index, + _ensure_index_from_sequences, + _get_na_value, + InvalidIndexError) # noqa from pandas.core.indexes.category import CategoricalIndex # noqa from pandas.core.indexes.multi import MultiIndex # noqa from pandas.core.indexes.interval import IntervalIndex # noqa @@ -22,7 
+25,8 @@ 'InvalidIndexError', 'TimedeltaIndex', 'PeriodIndex', 'DatetimeIndex', '_new_Index', 'NaT', - '_ensure_index', '_get_na_value', '_get_combined_index', + '_ensure_index', '_ensure_index_from_sequences', '_get_na_value', + '_get_combined_index', '_get_objs_combined_axis', '_union_indexes', '_get_consensus_names', '_all_indexes_same'] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 31cf1e48b8529..6a30eaefaaae7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4012,7 +4012,76 @@ def invalid_op(self, other=None): Index._add_comparison_methods() +def _ensure_index_from_sequences(sequences, names=None): + """Construct an index from sequences of data. + + A single sequence returns an Index. Many sequences return a + MultiIndex. + + Parameters + ---------- + sequences : sequence of sequences + names : sequence of str + + Returns + ------- + index : Index or MultiIndex + + Examples + -------- + >>> _ensure_index_from_sequences([[1, 2, 3]], names=['name']) + Int64Index([1, 2, 3], dtype='int64', name='name') + + >>> _ensure_index_from_sequences([['a', 'a'], ['a', 'b']], + names=['L1', 'L2']) + MultiIndex(levels=[['a'], ['a', 'b']], + labels=[[0, 0], [0, 1]], + names=['L1', 'L2']) + + See Also + -------- + _ensure_index + """ + from .multi import MultiIndex + + if len(sequences) == 1: + if names is not None: + names = names[0] + return Index(sequences[0], name=names) + else: + return MultiIndex.from_arrays(sequences, names=names) + + def _ensure_index(index_like, copy=False): + """ + Ensure that we have an index from some index-like object + + Parameters + ---------- + index_like : sequence + An Index or other sequence + copy : bool + + Returns + ------- + index : Index or MultiIndex + + Examples + -------- + >>> _ensure_index(['a', 'b']) + Index(['a', 'b'], dtype='object') + + >>> _ensure_index([('a', 'a'), ('b', 'c')]) + Index([('a', 'a'), ('b', 'c')], dtype='object') + + >>> _ensure_index([['a', 'a'], ['b',
'c']]) + MultiIndex(levels=[['a'], ['b', 'c']], + labels=[[0, 0], [0, 1]]) + + See Also + -------- + _ensure_index_from_sequences + """ if isinstance(index_like, Index): if copy: index_like = index_like.copy() diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ea45b4700172f..d7d5b6d128a2c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -91,12 +91,6 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None, raise ValueError('Length of levels and labels must be the same.') if len(levels) == 0: raise ValueError('Must pass non-zero number of levels/labels') - if len(levels) == 1: - if names: - name = names[0] - else: - name = None - return Index(levels[0], name=name, copy=True).take(labels[0]) result = object.__new__(MultiIndex) @@ -1084,10 +1078,6 @@ def from_arrays(cls, arrays, sortorder=None, names=None): MultiIndex.from_product : Make a MultiIndex from cartesian product of iterables """ - if len(arrays) == 1: - name = None if names is None else names[0] - return Index(arrays[0], name=name) - # Check if lengths of all arrays are equal or not, # raise ValueError, if not for i in range(1, len(arrays)): diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 455da9246783c..b4abba8026b35 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -31,7 +31,7 @@ from pandas.core.frame import _shared_docs from pandas.util._decorators import Appender -from pandas.core.index import MultiIndex, _get_na_value +from pandas.core.index import Index, MultiIndex, _get_na_value class _Unstacker(object): @@ -311,10 +311,14 @@ def _unstack_multiple(data, clocs): recons_labels = decons_obs_group_ids(comp_ids, obs_ids, shape, clabels, xnull=False) - dummy_index = MultiIndex(levels=rlevels + [obs_ids], - labels=rlabels + [comp_ids], - names=rnames + ['__placeholder__'], - verify_integrity=False) + if rlocs == []: + # Everything is in clocs, so the dummy df has a 
regular index + dummy_index = Index(obs_ids, name='__placeholder__') + else: + dummy_index = MultiIndex(levels=rlevels + [obs_ids], + labels=rlabels + [comp_ids], + names=rnames + ['__placeholder__'], + verify_integrity=False) if isinstance(data, Series): dummy = data.copy() @@ -446,7 +450,12 @@ def _slow_pivot(index, columns, values): def unstack(obj, level, fill_value=None): if isinstance(level, (tuple, list)): - return _unstack_multiple(obj, level) + if len(level) != 1: + # _unstack_multiple only handles MultiIndexes, + # and isn't needed for a single level + return _unstack_multiple(obj, level) + else: + level = level[0] if isinstance(obj, DataFrame): if isinstance(obj.index, MultiIndex): diff --git a/pandas/core/sparse/scipy_sparse.py b/pandas/core/sparse/scipy_sparse.py index ea108e3e89935..d2b9583d8efe5 100644 --- a/pandas/core/sparse/scipy_sparse.py +++ b/pandas/core/sparse/scipy_sparse.py @@ -71,7 +71,11 @@ def robust_get_level_values(i): labels_to_i = Series(labels_to_i) if len(subset) > 1: labels_to_i.index = MultiIndex.from_tuples(labels_to_i.index) - labels_to_i.index.names = [index.names[i] for i in subset] + labels_to_i.index.names = [index.names[i] for i in subset] + else: + labels_to_i.index = Index(x[0] for x in labels_to_i.index) + labels_to_i.index.name = index.names[subset[0]] + labels_to_i.name = 'value' return (labels_to_i) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 2f95e510bba5e..48bc2ee05dd68 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1452,7 +1452,12 @@ def cons_row(x): if expand: result = list(result) - return MultiIndex.from_tuples(result, names=name) + out = MultiIndex.from_tuples(result, names=name) + if out.nlevels == 1: + # We had all tuples of length-one, which are + # better represented as a regular Index. 
+ out = out.get_level_values(0) + return out else: return Index(result, name=name) else: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a9821be3fa5e2..8b1a921536a1d 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -23,7 +23,8 @@ is_scalar, is_categorical_dtype) from pandas.core.dtypes.missing import isna from pandas.core.dtypes.cast import astype_nansafe -from pandas.core.index import Index, MultiIndex, RangeIndex +from pandas.core.index import (Index, MultiIndex, RangeIndex, + _ensure_index_from_sequences) from pandas.core.series import Series from pandas.core.frame import DataFrame from pandas.core.categorical import Categorical @@ -1444,7 +1445,8 @@ def _agg_index(self, index, try_parse_dates=True): arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues) arrays.append(arr) - index = MultiIndex.from_arrays(arrays, names=self.index_names) + names = self.index_names + index = _ensure_index_from_sequences(arrays, names) return index @@ -1808,7 +1810,7 @@ def read(self, nrows=None): try_parse_dates=True) arrays.append(values) - index = MultiIndex.from_arrays(arrays) + index = _ensure_index_from_sequences(arrays) if self.usecols is not None: names = self._filter_usecols(names) @@ -3138,9 +3140,8 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None): if index_col is None or index_col is False: index = Index([]) else: - index = [Series([], dtype=dtype[index_name]) - for index_name in index_names] - index = MultiIndex.from_arrays(index, names=index_names) + data = [Series([], dtype=dtype[name]) for name in index_names] + index = _ensure_index_from_sequences(data, names=index_names) index_col.sort() for i, n in enumerate(index_col): columns.pop(n - i) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 07e98c326bcaa..aa32e75ba0d58 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -17,7 +17,7 @@ DataFrame, Float64Index, Int64Index, 
CategoricalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex, isna) -from pandas.core.index import _get_combined_index +from pandas.core.index import _get_combined_index, _ensure_index_from_sequences from pandas.util.testing import assert_almost_equal from pandas.compat.numpy import np_datetime64_compat @@ -2112,3 +2112,19 @@ def test_intersect_str_dates(self): res = i2.intersection(i1) assert len(res) == 0 + + +class TestIndexUtils(object): + + @pytest.mark.parametrize('data, names, expected', [ + ([[1, 2, 3]], None, Index([1, 2, 3])), + ([[1, 2, 3]], ['name'], Index([1, 2, 3], name='name')), + ([['a', 'a'], ['c', 'd']], None, + MultiIndex([['a'], ['c', 'd']], [[0, 0], [0, 1]])), + ([['a', 'a'], ['c', 'd']], ['L1', 'L2'], + MultiIndex([['a'], ['c', 'd']], [[0, 0], [0, 1]], + names=['L1', 'L2'])), + ]) + def test_ensure_index_from_sequences(self, data, names, expected): + result = _ensure_index_from_sequences(data, names) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index c66775f4690cc..798d244468961 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -537,15 +537,12 @@ def test_astype(self): self.index.astype(np.dtype(int)) def test_constructor_single_level(self): - single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], - labels=[[0, 1, 2, 3]], names=['first']) - assert isinstance(single_level, Index) - assert not isinstance(single_level, MultiIndex) - assert single_level.name == 'first' - - single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], - labels=[[0, 1, 2, 3]]) - assert single_level.name is None + result = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], + labels=[[0, 1, 2, 3]], names=['first']) + assert isinstance(result, MultiIndex) + expected = Index(['foo', 'bar', 'baz', 'qux'], name='first') + tm.assert_index_equal(result.levels[0], expected) + assert result.names == ['first'] def 
test_constructor_no_levels(self): tm.assert_raises_regex(ValueError, "non-zero number " @@ -768,8 +765,9 @@ def test_from_arrays_empty(self): # 1 level result = MultiIndex.from_arrays(arrays=[[]], names=['A']) + assert isinstance(result, MultiIndex) expected = Index([], name='A') - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result.levels[0], expected) # N levels for N in [2, 3]: @@ -830,7 +828,7 @@ def test_from_product_empty(self): # 1 level result = MultiIndex.from_product([[]], names=['A']) expected = pd.Index([], name='A') - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result.levels[0], expected) # 2 levels l1 = [[], ['foo', 'bar', 'baz'], []] diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 5a17cb6d7dc47..7dac83953ad8f 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1909,7 +1909,11 @@ def keyfunc(x): # convert tuples to index if nentries == 1: + # we have a single level of tuples, i.e. a regular Index index = Index(tuples[0], name=names[0]) + elif nlevels == 1: + name = None if names is None else names[0] + index = Index((x[0] for x in tuples), name=name) else: index = MultiIndex.from_tuples(tuples, names=names) return index