From f8484a3fe63669ca50feff620e555431c1b4ee77 Mon Sep 17 00:00:00 2001 From: Bryce Guinta Date: Wed, 4 Apr 2018 20:51:53 -0600 Subject: [PATCH] Stop concat from attempting to sort mismatched columns by default Preserve column order upon concatenation to obey the principle of least astonishment. Allow the old behavior to be re-enabled through a new boolean switch, mismatch_sort, on concat and DataFrame.append; it is disabled by default. Closes #4588 --- pandas/_libs/lib.pyx | 11 ++++++----- pandas/core/frame.py | 8 ++++++-- pandas/core/indexes/api.py | 13 +++++++------ pandas/core/reshape/concat.py | 13 +++++++++---- pandas/tests/reshape/test_concat.py | 18 ++++++++++++++---- 5 files changed, 42 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 30521760327b4..cd698a8f102ff 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -157,7 +157,7 @@ def fast_unique_multiple(list arrays): @cython.wraparound(False) @cython.boundscheck(False) -def fast_unique_multiple_list(list lists): +def fast_unique_multiple_list(list lists, bint sort = True): cdef: list buf Py_ssize_t k = len(lists) @@ -174,10 +174,11 @@ def fast_unique_multiple_list(list lists): if val not in table: table[val] = stub uniques.append(val) - try: - uniques.sort() - except Exception: - pass + if sort: + try: + uniques.sort() + except Exception: + pass return uniques diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f33ef9597f456..4a53d7fb48494 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5982,7 +5982,8 @@ def infer(x): # ---------------------------------------------------------------------- # Merging / joining methods - def append(self, other, ignore_index=False, verify_integrity=False): + def append(self, other, ignore_index=False, + verify_integrity=False, mismatch_sort=False): """ Append rows of `other` to the end of this frame, returning a new object. Columns not in this frame are added as new columns. 
@@ -5995,6 +5996,8 @@ def append(self, other, ignore_index=False, verify_integrity=False): If True, do not use the index labels. verify_integrity : boolean, default False If True, raise ValueError on creating index with duplicates. + mismatch_sort: boolean, default False + Sort columns if given object doesn't have the same columns Returns ------- @@ -6103,7 +6106,8 @@ def append(self, other, ignore_index=False, verify_integrity=False): else: to_concat = [self, other] return concat(to_concat, ignore_index=ignore_index, - verify_integrity=verify_integrity) + verify_integrity=verify_integrity, + mismatch_sort=mismatch_sort) def join(self, other, on=None, how='left', lsuffix='', rsuffix='', sort=False): diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 2e5ec8b554ce7..75232e3db7e55 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -31,17 +31,17 @@ '_all_indexes_same'] -def _get_objs_combined_axis(objs, intersect=False, axis=0): +def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): # Extract combined index: return intersection or union (depending on the # value of "intersect") of indexes on given axis, or None if all objects # lack indexes (e.g. they are numpy arrays) obs_idxes = [obj._get_axis(axis) for obj in objs if hasattr(obj, '_get_axis')] if obs_idxes: - return _get_combined_index(obs_idxes, intersect=intersect) + return _get_combined_index(obs_idxes, intersect=intersect, sort=sort) -def _get_combined_index(indexes, intersect=False): +def _get_combined_index(indexes, intersect=False, sort=True): # TODO: handle index names! 
indexes = com._get_distinct_objs(indexes) if len(indexes) == 0: @@ -53,11 +53,11 @@ def _get_combined_index(indexes, intersect=False): for other in indexes[1:]: index = index.intersection(other) return index - union = _union_indexes(indexes) + union = _union_indexes(indexes, sort=sort) return _ensure_index(union) -def _union_indexes(indexes): +def _union_indexes(indexes, sort=True): if len(indexes) == 0: raise AssertionError('Must have at least 1 Index to union') if len(indexes) == 1: @@ -74,7 +74,8 @@ def conv(i): i = i.tolist() return i - return Index(lib.fast_unique_multiple_list([conv(i) for i in inds])) + return Index( + lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort)) if kind == 'special': result = indexes[0] diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 20f4384a3d698..c1068a2c5cc08 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -20,7 +20,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, - copy=True): + copy=True, mismatch_sort=False): """ Concatenate pandas objects along a particular axis with optional set logic along the other axes. 
@@ -62,6 +62,8 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, be very expensive relative to the actual data concatenation copy : boolean, default True If False, do not copy data unnecessarily + mismatch_sort : boolean, default False + Sort columns if all passed object columns are not the same Returns ------- @@ -209,7 +211,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, ignore_index=ignore_index, join=join, keys=keys, levels=levels, names=names, verify_integrity=verify_integrity, - copy=copy) + copy=copy, sort=mismatch_sort) return op.get_result() @@ -220,7 +222,8 @@ class _Concatenator(object): def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None, levels=None, names=None, - ignore_index=False, verify_integrity=False, copy=True): + ignore_index=False, verify_integrity=False, copy=True, + sort=False): if isinstance(objs, (NDFrame, compat.string_types)): raise TypeError('first argument must be an iterable of pandas ' 'objects, you passed an object of type ' @@ -355,6 +358,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, self.keys = keys self.names = names or getattr(keys, 'names', None) self.levels = levels + self.sort = sort self.ignore_index = ignore_index self.verify_integrity = verify_integrity @@ -447,7 +451,8 @@ def _get_comb_axis(self, i): data_axis = self.objs[0]._get_block_manager_axis(i) try: return _get_objs_combined_axis(self.objs, axis=data_axis, - intersect=self.intersect) + intersect=self.intersect, + sort=self.sort) except IndexError: types = [type(x).__name__ for x in self.objs] raise TypeError("Cannot concatenate list of {types}" diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 437b4179c580a..8d98f0001be23 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -852,8 +852,9 @@ def test_append_dtype_coerce(self): dt.datetime(2013, 1, 2, 0, 0), dt.datetime(2013, 1, 3, 
0, 0), dt.datetime(2013, 1, 4, 0, 0)], - name='start_time')], axis=1) - result = df1.append(df2, ignore_index=True) + name='start_time')], + axis=1, mismatch_sort=True) + result = df1.append(df2, ignore_index=True, mismatch_sort=True) assert_frame_equal(result, expected) def test_append_missing_column_proper_upcast(self): @@ -1011,7 +1012,8 @@ def test_concat_ignore_index(self): frame1.index = Index(["x", "y", "z"]) frame2.index = Index(["x", "y", "q"]) - v1 = concat([frame1, frame2], axis=1, ignore_index=True) + v1 = concat([frame1, frame2], axis=1, + ignore_index=True, mismatch_sort=True) nan = np.nan expected = DataFrame([[nan, nan, nan, 4.3], @@ -1463,7 +1465,7 @@ def test_concat_series_axis1(self): # must reindex, #2603 s = Series(randn(3), index=['c', 'a', 'b'], name='A') s2 = Series(randn(4), index=['d', 'a', 'b', 'c'], name='B') - result = concat([s, s2], axis=1) + result = concat([s, s2], axis=1, mismatch_sort=True) expected = DataFrame({'A': s, 'B': s2}) assert_frame_equal(result, expected) @@ -2155,3 +2157,11 @@ def test_concat_empty_and_non_empty_series_regression(): expected = s1 result = pd.concat([s1, s2]) tm.assert_series_equal(result, expected) + + +def test_concat_preserve_column_order_differing_columns(): + # GH 4588 regression test + dfa = pd.DataFrame(columns=['C', 'A'], data=[[1, 2]]) + dfb = pd.DataFrame(columns=['C', 'Z'], data=[[5, 6]]) + result = pd.concat([dfa, dfb]) + assert result.columns.tolist() == ['C', 'A', 'Z']