API: Have MultiIndex consturctors always return a MI (pandas-dev#17236)

* API: Have MultiIndex constructors return MI This removes the special case for MultiIndex constructors returning an Index if all the levels are length-1. Now this will return a MultiIndex with a single level. This is a backwards incompatabile change, with no clear method for deprecation, so we're making a clean break. Closes pandas-dev#17178 * fixup! API: Have MultiIndex constructors return MI * Update for comments
jowens · Sep 20, 2017 · 0f8205c · 0f8205c
1 parent c33af56
commit 0f8205c
Show file tree

Hide file tree

Showing 12 changed files with 170 additions and 45 deletions.
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -275,6 +275,30 @@ named ``.isna()`` and ``.notna()``, these are included for classes ``Categorical
 
 The configuration option ``pd.options.mode.use_inf_as_null`` is deprecated, and ``pd.options.mode.use_inf_as_na`` is added as a replacement.
 
+.. _whatsnew_210.api.multiindex_single:
+
+MultiIndex Constructor with a Single Level
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``MultiIndex`` constructors no longer squeeze a MultiIndex with all
+length-one levels down to a regular ``Index``. This affects all the
+``MultiIndex`` constructors. (:issue:`17178`)
+
+Previous behavior:
+
+.. code-block:: ipython
+
+   In [2]: pd.MultiIndex.from_tuples([('a',), ('b',)])
+   Out[2]: Index(['a', 'b'], dtype='object')
+
+Length 1 levels are no longer special-cased. They behave exactly as if you had
+length 2+ levels, so a :class:`MultiIndex` is always returned from all of the
+``MultiIndex`` constructors:
+
+.. ipython:: python
+
+   pd.MultiIndex.from_tuples([('a',), ('b',)])
+
 .. _whatsnew_0210.api:
 
 Other API Changes

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -67,7 +67,8 @@
                                 _dict_compat,
                                 standardize_mapping)
 from pandas.core.generic import NDFrame, _shared_docs
-from pandas.core.index import Index, MultiIndex, _ensure_index
+from pandas.core.index import (Index, MultiIndex, _ensure_index,
+                               _ensure_index_from_sequences)
 from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable,
                                   check_bool_indexer)
 from pandas.core.internals import (BlockManager,
@@ -1155,9 +1156,9 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
             else:
                 try:
                     to_remove = [arr_columns.get_loc(field) for field in index]
-
-                    result_index = MultiIndex.from_arrays(
-                        [arrays[i] for i in to_remove], names=index)
+                    index_data = [arrays[i] for i in to_remove]
+                    result_index = _ensure_index_from_sequences(index_data,
+                                                                names=index)
 
                     exclude.update(index)
                 except Exception:
@@ -3000,7 +3001,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
                     to_remove.append(col)
             arrays.append(level)
 
-        index = MultiIndex.from_arrays(arrays, names=names)
+        index = _ensure_index_from_sequences(arrays, names)
 
         if verify_integrity and not index.is_unique:
             duplicates = index.get_duplicates()

diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py
@@ -1,6 +1,9 @@
-from pandas.core.indexes.base import (Index, _new_Index,  # noqa
-                                 _ensure_index, _get_na_value,
-                                 InvalidIndexError)
+from pandas.core.indexes.base import (Index,
+                                      _new_Index,
+                                      _ensure_index,
+                                      _ensure_index_from_sequences,
+                                      _get_na_value,
+                                      InvalidIndexError)  # noqa
 from pandas.core.indexes.category import CategoricalIndex  # noqa
 from pandas.core.indexes.multi import MultiIndex  # noqa
 from pandas.core.indexes.interval import IntervalIndex  # noqa
@@ -22,7 +25,8 @@
            'InvalidIndexError', 'TimedeltaIndex',
            'PeriodIndex', 'DatetimeIndex',
            '_new_Index', 'NaT',
-           '_ensure_index', '_get_na_value', '_get_combined_index',
+           '_ensure_index', '_ensure_index_from_sequences', '_get_na_value',
+           '_get_combined_index',
            '_get_objs_combined_axis', '_union_indexes',
            '_get_consensus_names',
            '_all_indexes_same']

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -4012,7 +4012,76 @@ def invalid_op(self, other=None):
 Index._add_comparison_methods()
 
 
+def _ensure_index_from_sequences(sequences, names=None):
+    """Construct an index from sequences of data.
+
+    A single sequence returns an Index. Many sequences returns a
+    MultiIndex.
+
+    Parameters
+    ----------
+    sequences : sequence of sequences
+    names : sequence of str
+
+    Returns
+    -------
+    index : Index or MultiIndex
+
+    Examples
+    --------
+    >>> _ensure_index_from_sequences([[1, 2, 3]], names=['name'])
+    Int64Index([1, 2, 3], dtype='int64', name='name')
+
+    >>> _ensure_index_from_sequences([['a', 'a'], ['a', 'b']],
+                                     names=['L1', 'L2'])
+    MultiIndex(levels=[['a'], ['a', 'b']],
+               labels=[[0, 0], [0, 1]],
+               names=['L1', 'L2'])
+
+    See Also
+    --------
+    _ensure_index
+    """
+    from .multi import MultiIndex
+
+    if len(sequences) == 1:
+        if names is not None:
+            names = names[0]
+        return Index(sequences[0], name=names)
+    else:
+        return MultiIndex.from_arrays(sequences, names=names)
+
+
 def _ensure_index(index_like, copy=False):
+    """
+    Ensure that we have an index from some index-like object
+
+    Parameters
+    ----------
+    index : sequence
+        An Index or other sequence
+    copy : bool
+
+    Returns
+    -------
+    index : Index or MultiIndex
+
+    Examples
+    --------
+    >>> _ensure_index(['a', 'b'])
+    Index(['a', 'b'], dtype='object')
+
+    >>> _ensure_index([('a', 'a'),  ('b', 'c')])
+    Index([('a', 'a'), ('b', 'c')], dtype='object')
+
+    >>> _ensure_index([['a', 'a'], ['b', 'c']])
+    MultiIndex(levels=[['a'], ['b', 'c']],
+               labels=[[0, 0], [0, 1]])
+
+    See Also
+    --------
+    _ensure_index_from_sequences
+    """
     if isinstance(index_like, Index):
         if copy:
             index_like = index_like.copy()

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -91,12 +91,6 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
             raise ValueError('Length of levels and labels must be the same.')
         if len(levels) == 0:
             raise ValueError('Must pass non-zero number of levels/labels')
-        if len(levels) == 1:
-            if names:
-                name = names[0]
-            else:
-                name = None
-            return Index(levels[0], name=name, copy=True).take(labels[0])
 
         result = object.__new__(MultiIndex)
 
@@ -1084,10 +1078,6 @@ def from_arrays(cls, arrays, sortorder=None, names=None):
         MultiIndex.from_product : Make a MultiIndex from cartesian product
                                   of iterables
         """
-        if len(arrays) == 1:
-            name = None if names is None else names[0]
-            return Index(arrays[0], name=name)
-
         # Check if lengths of all arrays are equal or not,
         # raise ValueError, if not
         for i in range(1, len(arrays)):

diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
@@ -31,7 +31,7 @@
 
 from pandas.core.frame import _shared_docs
 from pandas.util._decorators import Appender
-from pandas.core.index import MultiIndex, _get_na_value
+from pandas.core.index import Index, MultiIndex, _get_na_value
 
 
 class _Unstacker(object):
@@ -311,10 +311,14 @@ def _unstack_multiple(data, clocs):
     recons_labels = decons_obs_group_ids(comp_ids, obs_ids, shape, clabels,
                                          xnull=False)
 
-    dummy_index = MultiIndex(levels=rlevels + [obs_ids],
-                             labels=rlabels + [comp_ids],
-                             names=rnames + ['__placeholder__'],
-                             verify_integrity=False)
+    if rlocs == []:
+        # Everything is in clocs, so the dummy df has a regular index
+        dummy_index = Index(obs_ids, name='__placeholder__')
+    else:
+        dummy_index = MultiIndex(levels=rlevels + [obs_ids],
+                                 labels=rlabels + [comp_ids],
+                                 names=rnames + ['__placeholder__'],
+                                 verify_integrity=False)
 
     if isinstance(data, Series):
         dummy = data.copy()
@@ -446,7 +450,12 @@ def _slow_pivot(index, columns, values):
 
 def unstack(obj, level, fill_value=None):
     if isinstance(level, (tuple, list)):
-        return _unstack_multiple(obj, level)
+        if len(level) != 1:
+            # _unstack_multiple only handles MultiIndexes,
+            # and isn't needed for a single level
+            return _unstack_multiple(obj, level)
+        else:
+            level = level[0]
 
     if isinstance(obj, DataFrame):
         if isinstance(obj.index, MultiIndex):

diff --git a/pandas/core/sparse/scipy_sparse.py b/pandas/core/sparse/scipy_sparse.py
@@ -71,7 +71,11 @@ def robust_get_level_values(i):
             labels_to_i = Series(labels_to_i)
             if len(subset) > 1:
                 labels_to_i.index = MultiIndex.from_tuples(labels_to_i.index)
-            labels_to_i.index.names = [index.names[i] for i in subset]
+                labels_to_i.index.names = [index.names[i] for i in subset]
+            else:
+                labels_to_i.index = Index(x[0] for x in labels_to_i.index)
+                labels_to_i.index.name = index.names[subset[0]]
+
             labels_to_i.name = 'value'
             return (labels_to_i)
 

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -1452,7 +1452,12 @@ def cons_row(x):
 
             if expand:
                 result = list(result)
-                return MultiIndex.from_tuples(result, names=name)
+                out = MultiIndex.from_tuples(result, names=name)
+                if out.nlevels == 1:
+                    # We had all tuples of length-one, which are
+                    # better represented as a regular Index.
+                    out = out.get_level_values(0)
+                return out
             else:
                 return Index(result, name=name)
         else:

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -23,7 +23,8 @@
     is_scalar, is_categorical_dtype)
 from pandas.core.dtypes.missing import isna
 from pandas.core.dtypes.cast import astype_nansafe
-from pandas.core.index import Index, MultiIndex, RangeIndex
+from pandas.core.index import (Index, MultiIndex, RangeIndex,
+                               _ensure_index_from_sequences)
 from pandas.core.series import Series
 from pandas.core.frame import DataFrame
 from pandas.core.categorical import Categorical
@@ -1444,7 +1445,8 @@ def _agg_index(self, index, try_parse_dates=True):
             arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues)
             arrays.append(arr)
 
-        index = MultiIndex.from_arrays(arrays, names=self.index_names)
+        names = self.index_names
+        index = _ensure_index_from_sequences(arrays, names)
 
         return index
 
@@ -1808,7 +1810,7 @@ def read(self, nrows=None):
                                                  try_parse_dates=True)
                 arrays.append(values)
 
-            index = MultiIndex.from_arrays(arrays)
+            index = _ensure_index_from_sequences(arrays)
 
             if self.usecols is not None:
                 names = self._filter_usecols(names)
@@ -3138,9 +3140,8 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None):
     if index_col is None or index_col is False:
         index = Index([])
     else:
-        index = [Series([], dtype=dtype[index_name])
-                 for index_name in index_names]
-        index = MultiIndex.from_arrays(index, names=index_names)
+        data = [Series([], dtype=dtype[name]) for name in index_names]
+        index = _ensure_index_from_sequences(data, names=index_names)
         index_col.sort()
         for i, n in enumerate(index_col):
             columns.pop(n - i)

diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
@@ -17,7 +17,7 @@
                     DataFrame, Float64Index, Int64Index,
                     CategoricalIndex, DatetimeIndex, TimedeltaIndex,
                     PeriodIndex, isna)
-from pandas.core.index import _get_combined_index
+from pandas.core.index import _get_combined_index, _ensure_index_from_sequences
 from pandas.util.testing import assert_almost_equal
 from pandas.compat.numpy import np_datetime64_compat
 
@@ -2112,3 +2112,19 @@ def test_intersect_str_dates(self):
         res = i2.intersection(i1)
 
         assert len(res) == 0
+
+
+class TestIndexUtils(object):
+
+    @pytest.mark.parametrize('data, names, expected', [
+        ([[1, 2, 3]], None, Index([1, 2, 3])),
+        ([[1, 2, 3]], ['name'], Index([1, 2, 3], name='name')),
+        ([['a', 'a'], ['c', 'd']], None,
+         MultiIndex([['a'], ['c', 'd']], [[0, 0], [0, 1]])),
+        ([['a', 'a'], ['c', 'd']], ['L1', 'L2'],
+         MultiIndex([['a'], ['c', 'd']], [[0, 0], [0, 1]],
+                    names=['L1', 'L2'])),
+    ])
+    def test_ensure_index_from_sequences(self, data, names, expected):
+        result = _ensure_index_from_sequences(data, names)
+        tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py
@@ -537,15 +537,12 @@ def test_astype(self):
             self.index.astype(np.dtype(int))
 
     def test_constructor_single_level(self):
-        single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']],
-                                  labels=[[0, 1, 2, 3]], names=['first'])
-        assert isinstance(single_level, Index)
-        assert not isinstance(single_level, MultiIndex)
-        assert single_level.name == 'first'
-
-        single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']],
-                                  labels=[[0, 1, 2, 3]])
-        assert single_level.name is None
+        result = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']],
+                            labels=[[0, 1, 2, 3]], names=['first'])
+        assert isinstance(result, MultiIndex)
+        expected = Index(['foo', 'bar', 'baz', 'qux'], name='first')
+        tm.assert_index_equal(result.levels[0], expected)
+        assert result.names == ['first']
 
     def test_constructor_no_levels(self):
         tm.assert_raises_regex(ValueError, "non-zero number "
@@ -768,8 +765,9 @@ def test_from_arrays_empty(self):
 
         # 1 level
         result = MultiIndex.from_arrays(arrays=[[]], names=['A'])
+        assert isinstance(result, MultiIndex)
         expected = Index([], name='A')
-        tm.assert_index_equal(result, expected)
+        tm.assert_index_equal(result.levels[0], expected)
 
         # N levels
         for N in [2, 3]:
@@ -830,7 +828,7 @@ def test_from_product_empty(self):
         # 1 level
         result = MultiIndex.from_product([[]], names=['A'])
         expected = pd.Index([], name='A')
-        tm.assert_index_equal(result, expected)
+        tm.assert_index_equal(result.levels[0], expected)
 
         # 2 levels
         l1 = [[], ['foo', 'bar', 'baz'], []]

diff --git a/pandas/util/testing.py b/pandas/util/testing.py
@@ -1909,7 +1909,11 @@ def keyfunc(x):
 
     # convert tuples to index
     if nentries == 1:
+        # we have a single level of tuples, i.e. a regular Index
         index = Index(tuples[0], name=names[0])
+    elif nlevels == 1:
+        name = None if names is None else names[0]
+        index = Index((x[0] for x in tuples), name=name)
     else:
         index = MultiIndex.from_tuples(tuples, names=names)
     return index