Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/master' into multi-index-join
Browse files Browse the repository at this point in the history
# Conflicts:
#	doc/source/whatsnew/v0.24.0.txt
#	pandas/core/indexes/base.py
#	pandas/core/reshape/merge.py
#	pandas/tests/reshape/merge/test_merge.py
  • Loading branch information
harisbal committed Sep 19, 2018
2 parents b581789 + 40dfadd commit 5689f0a
Showing 1 changed file with 112 additions and 10 deletions.
122 changes: 112 additions & 10 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -896,17 +896,119 @@ def _check_merge(x, y):
assert_frame_equal(result, expected, check_names=False)


class TestMergeDtypes(object):
class TestMergeMulti(object):

@pytest.mark.parametrize('right_vals', [
['foo', 'bar'],
Series(['foo', 'bar']).astype('category'),
[1, 2],
[1.0, 2.0],
Series([1, 2], dtype='uint64'),
Series([1, 2], dtype='int32')
])
def test_different(self, right_vals):
def setup_method(self, method):
self.index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
['one', 'two', 'three']],
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
[0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=['first', 'second'])
self.to_join = DataFrame(np.random.randn(10, 3), index=self.index,
columns=['j_one', 'j_two', 'j_three'])

# a little relevant example with NAs
key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
'qux', 'snap']
key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
'three', 'one']

data = np.random.randn(len(key1))
self.data = DataFrame({'key1': key1, 'key2': key2,
'data': data})

def test_merge_on_multikey(self):
joined = self.data.join(self.to_join, on=['key1', 'key2'])

join_key = Index(lzip(self.data['key1'], self.data['key2']))
indexer = self.to_join.index.get_indexer(join_key)
ex_values = self.to_join.values.take(indexer, axis=0)
ex_values[indexer == -1] = np.nan
expected = self.data.join(DataFrame(ex_values,
columns=self.to_join.columns))

# TODO: columns aren't in the same order yet
assert_frame_equal(joined, expected.loc[:, joined.columns])

left = self.data.join(self.to_join, on=['key1', 'key2'], sort=True)
right = expected.loc[:, joined.columns].sort_values(['key1', 'key2'],
kind='mergesort')
assert_frame_equal(left, right)

def test_left_join_multi_index(self):
icols = ['1st', '2nd', '3rd']

def bind_cols(df):
iord = lambda a: 0 if a != a else ord(a)
f = lambda ts: ts.map(iord) - ord('a')
return (f(df['1st']) + f(df['3rd']) * 1e2 +
df['2nd'].fillna(0) * 1e4)

def run_asserts(left, right):
for sort in [False, True]:
res = left.join(right, on=icols, how='left', sort=sort)

assert len(left) < len(res) + 1
assert not res['4th'].isna().any()
assert not res['5th'].isna().any()

tm.assert_series_equal(
res['4th'], - res['5th'], check_names=False)
result = bind_cols(res.iloc[:, :-2])
tm.assert_series_equal(res['4th'], result, check_names=False)
assert result.name is None

if sort:
tm.assert_frame_equal(
res, res.sort_values(icols, kind='mergesort'))

out = merge(left, right.reset_index(), on=icols,
sort=sort, how='left')

res.index = np.arange(len(res))
tm.assert_frame_equal(out, res)

lc = list(map(chr, np.arange(ord('a'), ord('z') + 1)))
left = DataFrame(np.random.choice(lc, (5000, 2)),
columns=['1st', '3rd'])
left.insert(1, '2nd', np.random.randint(0, 1000, len(left)))

i = np.random.permutation(len(left))
right = left.iloc[i].copy()

left['4th'] = bind_cols(left)
right['5th'] = - bind_cols(right)
right.set_index(icols, inplace=True)

run_asserts(left, right)

# inject some nulls
left.loc[1::23, '1st'] = np.nan
left.loc[2::37, '2nd'] = np.nan
left.loc[3::43, '3rd'] = np.nan
left['4th'] = bind_cols(left)

i = np.random.permutation(len(left))
right = left.iloc[i, :-1]
right['5th'] = - bind_cols(right)
right.set_index(icols, inplace=True)

run_asserts(left, right)

def test_merge_right_vs_left(self):
# compare left vs right merge with multikey
for sort in [False, True]:
merged1 = self.data.merge(self.to_join, left_on=['key1', 'key2'],
right_index=True, how='left', sort=sort)

merged2 = self.to_join.merge(self.data, right_on=['key1', 'key2'],
left_index=True, how='right',
sort=sort)

merged2 = merged2.loc[:, merged1.columns]
assert_frame_equal(merged1, merged2)

def test_compress_group_combinations(self):

# ~ 40000000 possible unique groups
key1 = tm.rands_array(10, 10000)
Expand Down

0 comments on commit 5689f0a

Please sign in to comment.