Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Stop concat from attempting to sort mismatched columns by default
Browse files Browse the repository at this point in the history
Preserve column order upon concatenation, following the
principle of least astonishment.

The old sorting behavior can be re-enabled through a new boolean parameter,
mismatch_sort, on concat and DataFrame.append; it is disabled by default.

Close pandas-dev#4588
brycepg committed Apr 5, 2018

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
1 parent 6d610a4 commit db4c521
Showing 5 changed files with 38 additions and 21 deletions.
11 changes: 6 additions & 5 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
@@ -157,7 +157,7 @@ def fast_unique_multiple(list arrays):

@cython.wraparound(False)
@cython.boundscheck(False)
def fast_unique_multiple_list(list lists):
def fast_unique_multiple_list(list lists, bint sort = True):
cdef:
list buf
Py_ssize_t k = len(lists)
@@ -174,10 +174,11 @@ def fast_unique_multiple_list(list lists):
if val not in table:
table[val] = stub
uniques.append(val)
try:
uniques.sort()
except Exception:
pass
if sort:
try:
uniques.sort()
except Exception:
pass

return uniques

7 changes: 5 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
@@ -5982,7 +5982,7 @@ def infer(x):
# ----------------------------------------------------------------------
# Merging / joining methods

def append(self, other, ignore_index=False, verify_integrity=False):
def append(self, other, ignore_index=False, verify_integrity=False, mismatch_sort=False):
"""
Append rows of `other` to the end of this frame, returning a new
object. Columns not in this frame are added as new columns.
@@ -5995,6 +5995,8 @@ def append(self, other, ignore_index=False, verify_integrity=False):
If True, do not use the index labels.
verify_integrity : boolean, default False
If True, raise ValueError on creating index with duplicates.
mismatch_sort : boolean, default False
If True, sort the columns when `other` does not have the same columns as this frame.
Returns
-------
@@ -6103,7 +6105,8 @@ def append(self, other, ignore_index=False, verify_integrity=False):
else:
to_concat = [self, other]
return concat(to_concat, ignore_index=ignore_index,
verify_integrity=verify_integrity)
verify_integrity=verify_integrity,
mismatch_sort=mismatch_sort)

def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
sort=False):
12 changes: 6 additions & 6 deletions pandas/core/indexes/api.py
Original file line number Diff line number Diff line change
@@ -31,17 +31,17 @@
'_all_indexes_same']


def _get_objs_combined_axis(objs, intersect=False, axis=0):
def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True):
# Extract combined index: return intersection or union (depending on the
# value of "intersect") of indexes on given axis, or None if all objects
# lack indexes (e.g. they are numpy arrays)
obs_idxes = [obj._get_axis(axis) for obj in objs
if hasattr(obj, '_get_axis')]
if obs_idxes:
return _get_combined_index(obs_idxes, intersect=intersect)
return _get_combined_index(obs_idxes, intersect=intersect, sort=sort)


def _get_combined_index(indexes, intersect=False):
def _get_combined_index(indexes, intersect=False, sort=True):
# TODO: handle index names!
indexes = com._get_distinct_objs(indexes)
if len(indexes) == 0:
@@ -53,11 +53,11 @@ def _get_combined_index(indexes, intersect=False):
for other in indexes[1:]:
index = index.intersection(other)
return index
union = _union_indexes(indexes)
union = _union_indexes(indexes, sort=sort)
return _ensure_index(union)


def _union_indexes(indexes):
def _union_indexes(indexes, sort=True):
if len(indexes) == 0:
raise AssertionError('Must have at least 1 Index to union')
if len(indexes) == 1:
@@ -74,7 +74,7 @@ def conv(i):
i = i.tolist()
return i

return Index(lib.fast_unique_multiple_list([conv(i) for i in inds]))
return Index(lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort))

if kind == 'special':
result = indexes[0]
13 changes: 9 additions & 4 deletions pandas/core/reshape/concat.py
Original file line number Diff line number Diff line change
@@ -20,7 +20,7 @@

def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
keys=None, levels=None, names=None, verify_integrity=False,
copy=True):
copy=True, mismatch_sort=False):
"""
Concatenate pandas objects along a particular axis with optional set logic
along the other axes.
@@ -62,6 +62,8 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
be very expensive relative to the actual data concatenation
copy : boolean, default True
If False, do not copy data unnecessarily
mismatch_sort : boolean, default False
If True, sort the columns when the passed objects do not all share the same columns
Returns
-------
@@ -209,7 +211,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
ignore_index=ignore_index, join=join,
keys=keys, levels=levels, names=names,
verify_integrity=verify_integrity,
copy=copy)
copy=copy, sort=mismatch_sort)
return op.get_result()


@@ -220,7 +222,8 @@ class _Concatenator(object):

def __init__(self, objs, axis=0, join='outer', join_axes=None,
keys=None, levels=None, names=None,
ignore_index=False, verify_integrity=False, copy=True):
ignore_index=False, verify_integrity=False, copy=True,
sort=False):
if isinstance(objs, (NDFrame, compat.string_types)):
raise TypeError('first argument must be an iterable of pandas '
'objects, you passed an object of type '
@@ -355,6 +358,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None,
self.keys = keys
self.names = names or getattr(keys, 'names', None)
self.levels = levels
self.sort = sort

self.ignore_index = ignore_index
self.verify_integrity = verify_integrity
@@ -447,7 +451,8 @@ def _get_comb_axis(self, i):
data_axis = self.objs[0]._get_block_manager_axis(i)
try:
return _get_objs_combined_axis(self.objs, axis=data_axis,
intersect=self.intersect)
intersect=self.intersect,
sort=self.sort)
except IndexError:
types = [type(x).__name__ for x in self.objs]
raise TypeError("Cannot concatenate list of {types}"
16 changes: 12 additions & 4 deletions pandas/tests/reshape/test_concat.py
Original file line number Diff line number Diff line change
@@ -852,8 +852,8 @@ def test_append_dtype_coerce(self):
dt.datetime(2013, 1, 2, 0, 0),
dt.datetime(2013, 1, 3, 0, 0),
dt.datetime(2013, 1, 4, 0, 0)],
name='start_time')], axis=1)
result = df1.append(df2, ignore_index=True)
name='start_time')], axis=1, mismatch_sort=True)
result = df1.append(df2, ignore_index=True, mismatch_sort=True)
assert_frame_equal(result, expected)

def test_append_missing_column_proper_upcast(self):
@@ -1011,7 +1011,7 @@ def test_concat_ignore_index(self):
frame1.index = Index(["x", "y", "z"])
frame2.index = Index(["x", "y", "q"])

v1 = concat([frame1, frame2], axis=1, ignore_index=True)
v1 = concat([frame1, frame2], axis=1, ignore_index=True, mismatch_sort=True)

nan = np.nan
expected = DataFrame([[nan, nan, nan, 4.3],
@@ -1463,7 +1463,7 @@ def test_concat_series_axis1(self):
# must reindex, #2603
s = Series(randn(3), index=['c', 'a', 'b'], name='A')
s2 = Series(randn(4), index=['d', 'a', 'b', 'c'], name='B')
result = concat([s, s2], axis=1)
result = concat([s, s2], axis=1, mismatch_sort=True)
expected = DataFrame({'A': s, 'B': s2})
assert_frame_equal(result, expected)

@@ -2155,3 +2155,11 @@ def test_concat_empty_and_non_empty_series_regression():
expected = s1
result = pd.concat([s1, s2])
tm.assert_series_equal(result, expected)


def test_concat_preserve_column_order_differing_columns():
    """Check that concat keeps first-seen column order for mismatched frames.

    Regression test for GH 4588: when the frames being concatenated do not
    share the same columns, the result's columns must appear in the order
    they were first encountered (``C, A, Z``) rather than sorted
    (``A, C, Z``).
    """
    # GH 4588 regression test
    dfa = pd.DataFrame(columns=['C', 'A'], data=[[1, 2]])
    dfb = pd.DataFrame(columns=['C', 'Z'], data=[[5, 6]])
    result = pd.concat([dfa, dfb])

    assert result.columns.tolist() == ['C', 'A', 'Z']

    # Label order alone is not enough: the values must still sit under the
    # label they came from, with NaN filling the column each frame lacks.
    assert result['C'].tolist() == [1, 5]
    assert result['A'].isna().tolist() == [False, True]
    assert result['Z'].isna().tolist() == [True, False]

0 comments on commit db4c521

Please sign in to comment.