From da94fdb51c4ae206d1a527d7acb3bf5ee0e75534 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Wed, 19 Jul 2017 14:36:06 -0400 Subject: [PATCH 01/34] Support merging frames on a combo of columns and index levels (GH 14355) --- doc/source/merging.rst | 61 +++++++++++-- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/frame.py | 30 +++++++ pandas/core/reshape/merge.py | 80 +++++++++++++++-- pandas/tests/reshape/test_merge.py | 133 +++++++++++++++++++++++++++++ 5 files changed, 292 insertions(+), 13 deletions(-) diff --git a/doc/source/merging.rst b/doc/source/merging.rst index a5ee1b1a9384c..e7ce8dc242ca8 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -518,14 +518,16 @@ standard database join operations between DataFrame objects: - ``left``: A DataFrame object - ``right``: Another DataFrame object -- ``on``: Columns (names) to join on. Must be found in both the left and - right DataFrame objects. If not passed and ``left_index`` and +- ``on``: Column or index level names to join on. Must be found in both the left + and right DataFrame objects. If not passed and ``left_index`` and ``right_index`` are ``False``, the intersection of the columns in the DataFrames will be inferred to be the join keys -- ``left_on``: Columns from the left DataFrame to use as keys. Can either be - column names or arrays with length equal to the length of the DataFrame -- ``right_on``: Columns from the right DataFrame to use as keys. Can either be - column names or arrays with length equal to the length of the DataFrame +- ``left_on``: Columns or index levels from the left DataFrame to use as + keys. Can either be column names, index level names, or arrays with length + equal to the length of the DataFrame +- ``right_on``: Columns or index levels from the right DataFrame to use as + keys. 
Can either be column names, index level names, or arrays with length + equal to the length of the DataFrame - ``left_index``: If ``True``, use the index (row labels) from the left DataFrame as its join key(s). In the case of a DataFrame with a MultiIndex (hierarchical), the number of levels must match the number of join keys @@ -1120,6 +1122,53 @@ This is not Implemented via ``join`` at-the-moment, however it can be done using labels=['left', 'right'], vertical=False); plt.close('all'); +Merging on a combination of columns and index levels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. versionadded:: 0.21 + +Strings passed as the ``on``, ``left_on``, and ``right_on`` parameters +may refer to either column names or index level names. This enables +the merging of DataFrames on a combination of index levels and columns without +resetting indexes. + +.. ipython:: python + + left_index = pd.Index(['K0', 'K0', 'K1', 'K2'], name='key1') + + left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], + 'B': ['B0', 'B1', 'B2', 'B3'], + 'key2': ['K0', 'K1', 'K0', 'K1']}, + index=left_index) + + right_index = pd.Index(['K0', 'K1', 'K2', 'K2'], name='key1') + + right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], + 'D': ['D0', 'D1', 'D2', 'D3'], + 'key2': ['K0', 'K0', 'K0', 'K1']}, + index=right_index) + + result = left.merge(right, on=['key1', 'key2']) + +.. ipython:: python + :suppress: + + @savefig merge_on_index_and_column.png + p.plot([left, right], result, + labels=['left', 'right'], vertical=False); + plt.close('all'); + +.. note:: + + When DataFrames are merged on a string that matches an index level in both + frames then the index level is preserved as an index level in the resulting + DataFrame. + +.. note:: + + If a string matches both a column name and an index level name then a warning is + issued and the column takes precedence. This will result in an ambiguity error + in a future version. 
+ Overlapping value columns ~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index f50052347cfb5..2d8d10165a030 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -109,6 +109,7 @@ Other Enhancements - :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`) - Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here `. (:issue:`15838`, :issue:`17438`) - :func:`DataFrame.add_prefix` and :func:`DataFrame.add_suffix` now accept strings containing the '%' character. (:issue:`17151`) +- :func:`DataFrame.merge` now accepts index level names as `on`, `left_on`, and `right_on` parameters allowing frames to be merged on a combination of columns and index levels (:issue:`14355`) - `read_*` methods can now infer compression from non-string paths, such as ``pathlib.Path`` objects (:issue:`17206`). - :func:`pd.read_sas()` now recognizes much more of the most frequently used date (datetime) formats in SAS7BDAT files (:issue:`15871`). 
- :func:`DataFrame.items` and :func:`Series.items` is now present in both Python 2 and 3 and is lazy in all cases (:issue:`13918`, :issue:`17213`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5991ec825c841..1d128fec9c4f8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3437,6 +3437,36 @@ def f(vals): # ---------------------------------------------------------------------- # Sorting + def _get_column_or_level_values(self, key, axis=1, + op_description='retrieve'): + if (is_integer(key) or + (axis == 1 and key in self) or + (axis == 0 and key in self.index)): + + if axis == 1 and key in self.index.names: + warnings.warn( + ("'%s' is both a column name and an index level.\n" + "Defaulting to column but " + "this will raise an ambiguity error in a " + "future version") % key, + FutureWarning, stacklevel=2) + + k = self.xs(key, axis=axis)._values + if k.ndim == 2: + + # try to be helpful + if isinstance(self.columns, MultiIndex): + raise ValueError('Cannot %s column "%s" in a multi-index. 
' + 'All levels must be provided explicitly' + % (op_description, str(key))) + + raise ValueError('Cannot %s duplicate column "%s"' % + (op_description, str(key))) + elif key in self.index.names: + k = self.index.get_level_values(key).values + else: + raise KeyError(key) + return k @Appender(_shared_docs['sort_values'] % _shared_doc_kwargs) def sort_values(self, by, axis=0, ascending=True, inplace=False, diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 947300a28e510..9bf40df56f027 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -524,6 +524,7 @@ def __init__(self, left, right, how='inner', on=None, self.right_index = right_index self.indicator = indicator + self.has_common_index_levels = False if isinstance(self.indicator, compat.string_types): self.indicator_name = self.indicator @@ -650,6 +651,7 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): left_has_missing = None right_has_missing = None + new_index_values = {} keys = zip(self.join_names, self.left_on, self.right_on) for i, (name, lname, rname) in enumerate(keys): if not _should_fill(lname, rname): @@ -717,7 +719,25 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): if name in result: result[name] = key_col else: - result.insert(i, name or 'key_{i}'.format(i=i), key_col) + if name and name in result.index.names: + new_index_values[name] = key_col + else: + result.insert( + i, name or 'key_{i}'.format(i=i), key_col) + + if new_index_values: + # Create new index for result + index_arrays = [new_index_values[n] + if n in new_index_values + else result.index.get_level_values(i) + for (i, n) in enumerate(result.index.names)] + + if len(index_arrays) == 1: + new_index = Index(index_arrays[0], name=result.index.name) + else: + new_index = MultiIndex.from_arrays(index_arrays, + names=result.index.names) + result.index = new_index def _get_join_indexers(self): """ return the join indexers """ @@ -760,7 +780,10 @@ 
def _get_join_info(self): join_index = self.left.index.take(left_indexer) right_indexer = np.array([-1] * len(join_index)) else: - join_index = Index(np.arange(len(left_indexer))) + if not self.has_common_index_levels: + join_index = Index(np.arange(len(left_indexer))) + else: + join_index = self.left.index.take(left_indexer) if len(join_index) == 0: join_index = join_index.astype(object) @@ -792,6 +815,10 @@ def _get_merge_keys(self): is_rkey = lambda x: isinstance( x, (np.ndarray, Series)) and len(x) == len(right) + def get_key_vals(df, key): + return df._get_column_or_level_values(key, axis=self.axis, + op_description="merge on") + # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A # user could, for example, request 'left_index' and 'left_by'. In a # regular pd.merge(), users cannot specify both 'left_index' and @@ -812,7 +839,7 @@ def _get_merge_keys(self): join_names.append(None) # what to do? else: if rk is not None: - right_keys.append(right[rk]._values) + right_keys.append(get_key_vals(right, rk)) join_names.append(rk) else: # work-around for merge_asof(right_index=True) @@ -821,7 +848,7 @@ def _get_merge_keys(self): else: if not is_rkey(rk): if rk is not None: - right_keys.append(right[rk]._values) + right_keys.append(get_key_vals(right, rk)) else: # work-around for merge_asof(right_index=True) right_keys.append(right.index) @@ -834,7 +861,7 @@ def _get_merge_keys(self): else: right_keys.append(rk) if lk is not None: - left_keys.append(left[lk]._values) + left_keys.append(get_key_vals(left, lk)) join_names.append(lk) else: # work-around for merge_asof(left_index=True) @@ -846,7 +873,7 @@ def _get_merge_keys(self): left_keys.append(k) join_names.append(None) else: - left_keys.append(left[k]._values) + left_keys.append(get_key_vals(left, k)) join_names.append(k) if isinstance(self.right.index, MultiIndex): right_keys = [lev._values.take(lab) @@ -860,7 +887,7 @@ def _get_merge_keys(self): right_keys.append(k) join_names.append(None) else: - 
right_keys.append(right[k]._values) + right_keys.append(get_key_vals(right, k)) join_names.append(k) if isinstance(self.left.index, MultiIndex): left_keys = [lev._values.take(lab) @@ -869,10 +896,49 @@ def _get_merge_keys(self): else: left_keys = [self.left.index.values] + # Reset index levels that are not common to both DataFrames + common_index_levels = [(li, ri) for (li, ri) in + zip(self.left_on, self.right_on) if + isinstance(li, compat.string_types) and + li not in self.left and + isinstance(ri, compat.string_types) and + ri not in self.right] + + if common_index_levels: + common_levels_right, common_levels_left = ( + zip(*common_index_levels) + ) + + reset_left = [lev for lev in self.left.index.names + if lev not in common_levels_left] + if reset_left: + self.left.reset_index( + reset_left, + inplace=True) + + reset_right = [lev for lev in self.right.index.names + if lev not in common_levels_right] + if reset_right: + self.right.reset_index( + reset_right, + inplace=True) + + self.has_common_index_levels = True + if left_drop: + # Determine index levels to reset before dropping + levels_to_reset = [level for level in left_drop + if level not in self.left] + if levels_to_reset: + self.left = self.left.reset_index(levels_to_reset) self.left = self.left.drop(left_drop, axis=1) if right_drop: + # Determine index levels to reset before dropping + levels_to_reset = [level for level in right_drop + if level not in self.right] + if levels_to_reset: + self.right = self.right.reset_index(levels_to_reset) self.right = self.right.drop(right_drop, axis=1) return left_keys, right_keys, join_names diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index 338596d1523e4..dafb740cf2fa1 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -68,6 +68,14 @@ def test_merge_common(self): exp = merge(self.df, self.df2, on=['key1', 'key2']) tm.assert_frame_equal(joined, exp) + def 
test_merge_index_as_on_arg(self): + # GH14355 + left = self.df.set_index('key1') + right = self.df2.set_index('key1') + result = merge(left, right, on='key1') + expected = merge(self.df, self.df2, on='key1').set_index('key1') + assert_frame_equal(result, expected) + def test_merge_index_singlekey_right_vs_left(self): left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], 'v1': np.random.randn(7)}) @@ -1350,6 +1358,131 @@ def f(): household.join(log_return, how='outer') pytest.raises(NotImplementedError, f) + def test_merge_on_index_and_column(self): + # Construct DataFrames + df1 = DataFrame(dict( + outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], + inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2], + v1=np.linspace(0, 1, 11) + )).set_index(['outer']) + + df2 = DataFrame(dict( + outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], + inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], + v2=np.linspace(10, 11, 12) + )).set_index(['outer']) + + # Test merge on outer (index) and inner (column) + for how in ['inner', 'left', 'right', 'outer']: + expected = (df1.reset_index() + .merge(df2.reset_index(), + on=['outer', 'inner'], how=how) + .set_index('outer')) + + result = df1.merge(df2, on=['outer', 'inner'], how=how) + assert_frame_equal(result, expected) + + # Same result when index/column order is flipped + result = df1.merge(df2, on=['inner', 'outer'], how=how) + assert_frame_equal(result, expected) + + def test_merge_on_index_and_column_multi(self): + # GH14355 + + # Construct DataFrames + df1 = DataFrame(dict( + outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], + inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2], + v1=np.linspace(0, 1, 11) + )).set_index(['outer', 'inner']) + + df2 = DataFrame(dict( + outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], + inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], + v2=np.linspace(10, 11, 12) + )).set_index(['outer']) + + # Test merge on outer (both index) and inner (one as index and + # one as column) + for how in ['inner', 'left', 'right', 'outer']: + expected = (df1.reset_index() + 
.merge(df2.reset_index(), + on=['outer', 'inner'], how=how) + .set_index('outer')) + + result = df1.merge(df2, + on=['outer', 'inner'], how=how) + assert_frame_equal(result, expected) + + # Same result when index/column order is flipped + result = df1.merge(df2, + on=['inner', 'outer'], how=how) + assert_frame_equal(result, expected) + + def test_merge_on_index_and_column_multi2(self): + # GH14355 + + # Construct DataFrames + df1 = DataFrame(dict( + outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], + inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2], + v1=np.linspace(0, 1, 11) + )).set_index(['outer', 'inner']) + + df2 = DataFrame(dict( + outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], + inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], + v2=np.linspace(10, 11, 12) + )).set_index(['outer', 'inner']) + + # Test merge on outer (both index) and inner (both index) + for how in ['inner', 'left', 'right', 'outer']: + expected = (df1.reset_index() + .merge(df2.reset_index(), + on=['outer', 'inner'], how=how) + .set_index(['outer', 'inner'])) + + result = df1.merge(df2, on=['outer', 'inner'], how=how) + assert_frame_equal(result, expected) + + # Same result when index/column order is flipped + result = df1.merge(df2, on=['inner', 'outer'], how=how) + assert_frame_equal(result, expected) + + def test_merge_index_column_precedence(self): + # GH14355 + + # Construct DataFrames + df1 = DataFrame(dict( + outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], + inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2], + v1=np.linspace(0, 1, 11) + )).set_index(['outer']) + # - df1 has both a column and index named 'outer' + df1['outer'] = df1['inner'] + + df2 = DataFrame(dict( + outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], + inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], + v2=np.linspace(10, 11, 12) + )).set_index(['outer']) + + # Merge df1 and df2 on 'outer' and 'inner' + # - 'outer' for df1 should refer to the 'outer' column + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = df1.merge(df2, on=['outer', 
'inner']) + + # Remove 'outer' index from df1 prior to merge + expected = df1.reset_index(drop=True).merge(df2.reset_index(), + on=['outer', 'inner']) + # Remove 'outer' column from df1 prior to merge + not_expected = df1.drop('outer', axis=1).reset_index().merge( + df2.reset_index(), on=['outer', 'inner']) + + # Check results + assert_frame_equal(result, expected) + assert not result.equals(not_expected) + @pytest.fixture def df(): From f8c8c538981564ee84c05e831b3004ad4124bb31 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Sat, 9 Sep 2017 21:04:52 -0400 Subject: [PATCH 02/34] Cleanup for review --- doc/source/merging.rst | 8 ++--- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/core/frame.py | 58 ++++++++++++++++++++++++------ pandas/core/reshape/merge.py | 19 ++++++++++ pandas/tests/reshape/test_merge.py | 2 ++ 5 files changed, 74 insertions(+), 15 deletions(-) diff --git a/doc/source/merging.rst b/doc/source/merging.rst index e7ce8dc242ca8..dbd8bd0cc3981 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -1160,14 +1160,14 @@ resetting indexes. .. note:: When DataFrames are merged on a string that matches an index level in both - frames then the index level is preserved as an index level in the resulting + frames, the index level is preserved as an index level in the resulting DataFrame. .. note:: - If a string matches both a column name and an index level name then a warning is - issued and the column takes precedence. This will result in an ambiguity error - in a future version. + If a string matches both a column name and an index level name, then a + warning is issued and the column takes precedence. This will result in an + ambiguity error in a future version. 
Overlapping value columns ~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 2d8d10165a030..111ce5e922394 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -109,7 +109,7 @@ Other Enhancements - :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`) - Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here `. (:issue:`15838`, :issue:`17438`) - :func:`DataFrame.add_prefix` and :func:`DataFrame.add_suffix` now accept strings containing the '%' character. (:issue:`17151`) -- :func:`DataFrame.merge` now accepts index level names as `on`, `left_on`, and `right_on` parameters allowing frames to be merged on a combination of columns and index levels (:issue:`14355`) +- :func:`DataFrame.merge` now accepts index level names as `on`, `left_on`, and `right_on` parameters, allowing frames to be merged on a combination of columns and index levels (:issue:`14355`) - `read_*` methods can now infer compression from non-string paths, such as ``pathlib.Path`` objects (:issue:`17206`). - :func:`pd.read_sas()` now recognizes much more of the most frequently used date (datetime) formats in SAS7BDAT files (:issue:`15871`). - :func:`DataFrame.items` and :func:`Series.items` is now present in both Python 2 and 3 and is lazy in all cases (:issue:`13918`, :issue:`17213`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1d128fec9c4f8..bc80ec047b257 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3439,16 +3439,50 @@ def f(vals): # Sorting def _get_column_or_level_values(self, key, axis=1, op_description='retrieve'): + """ + Return an array of values from a column or index level. + + Parameters + ---------- + key: int or object + - int: integer location of the column (or row if `axis` is 0) to + retrieve. 
+ + - object: label of the column (or row if axis=0) to retrieve. If + `axis` is 1 then `key` may also refer to an index level. If + `axis` is 1 and `key` matches both a column and an index level + a ``FutureWarning`` is raised and the column takes precedence + + axis: int, default 0 + Axis to retrieve values from + + op_description: str, default 'retrieve' + Description of current user operation for use in error reporting + + Returns + ------- + values : array of requested values + + Raises + ------ + ValueError + if `key` matches duplicate columns + KeyError + if `key` not found + FutureWarning + if `axis` is 1 and `key` matches both a column and an index + level. The column takes precedence but this will result in an + ambiguity error in a future version + """ if (is_integer(key) or (axis == 1 and key in self) or (axis == 0 and key in self.index)): if axis == 1 and key in self.index.names: warnings.warn( - ("'%s' is both a column name and an index level.\n" - "Defaulting to column but " - "this will raise an ambiguity error in a " - "future version") % key, + ("'{key}' is both a column name and an index level.\n" + "Defaulting to column, but this will raise an ambiguity " + "error in a future version").format(key=key), FutureWarning, stacklevel=2) k = self.xs(key, axis=axis)._values @@ -3456,13 +3490,17 @@ def _get_column_or_level_values(self, key, axis=1, # try to be helpful if isinstance(self.columns, MultiIndex): - raise ValueError('Cannot %s column "%s" in a multi-index. ' - 'All levels must be provided explicitly' - % (op_description, str(key))) + raise ValueError(('Cannot {op} column "{key}" in a ' + 'multi-index. 
All levels must be ' + 'provided explicitly' + ).format(op=op_description, + key=key)) + + raise ValueError(('Cannot {op} duplicate column "{key}"' + ).format(op=op_description, + key=key)) - raise ValueError('Cannot %s duplicate column "%s"' % - (op_description, str(key))) - elif key in self.index.names: + elif axis == 1 and key in self.index.names: k = self.index.get_level_values(key).values else: raise KeyError(key) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 9bf40df56f027..5c6b5866ee2c0 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -816,6 +816,25 @@ def _get_merge_keys(self): x, (np.ndarray, Series)) and len(x) == len(right) def get_key_vals(df, key): + """ + Return an array of values from a DataFrame corresponding to a + given key. This is a wrapper for + ``DataFrame._get_column_or_level_values()`` with `axis` and + `op_description` defined by the merge operation + + Parameters + ---------- + df: DataFrame + key: int or object + + Returns + ------- + values: array + + See Also + -------- + DataFrame._get_column_or_level_values + """ return df._get_column_or_level_values(key, axis=self.axis, op_description="merge on") diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index dafb740cf2fa1..3f13f0b7926d3 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -1359,6 +1359,8 @@ def f(): pytest.raises(NotImplementedError, f) def test_merge_on_index_and_column(self): + # GH14355 + # Construct DataFrames df1 = DataFrame(dict( outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], From 368844ad09ae670887918bb38ec6a97c8e2b308d Mon Sep 17 00:00:00 2001 From: "Jon M. 
Mease" Date: Mon, 11 Sep 2017 12:07:57 -0400 Subject: [PATCH 03/34] revert implementation (but keep documentation and tests) --- pandas/core/frame.py | 68 ------------------------- pandas/core/reshape/merge.py | 99 +++--------------------------------- 2 files changed, 7 insertions(+), 160 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bc80ec047b257..5991ec825c841 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3437,74 +3437,6 @@ def f(vals): # ---------------------------------------------------------------------- # Sorting - def _get_column_or_level_values(self, key, axis=1, - op_description='retrieve'): - """ - Return an array of values from a column or index level. - - Parameters - ---------- - key: int or object - - int: integer location of the column (or row if `axis` is 0) to - retrieve. - - - object: label of the column (or row if axis=0) to retrieve. If - `axis` is 1 then `key` may also refer to an index level. If - `axis` is 1 and `key` matches both a column and an index level - a ``FutureWarning`` is raised and the column takes precedence - - axis: int, default 0 - Axis to retrieve values from - - op_description: str, default 'retrieve' - Description of current user operation for use in error reporting - - Returns - ------- - values : array of requested values - - Raises - ------ - ValueError - if `key` matches duplicate columns - KeyError - if `key` not found - FutureWarning - if `axis` is 1 and `key` matches both a column and an index - level. 
The column takes precedence but this will result in an - ambiguity error in a future version - """ - if (is_integer(key) or - (axis == 1 and key in self) or - (axis == 0 and key in self.index)): - - if axis == 1 and key in self.index.names: - warnings.warn( - ("'{key}' is both a column name and an index level.\n" - "Defaulting to column, but this will raise an ambiguity " - "error in a future version").format(key=key), - FutureWarning, stacklevel=2) - - k = self.xs(key, axis=axis)._values - if k.ndim == 2: - - # try to be helpful - if isinstance(self.columns, MultiIndex): - raise ValueError(('Cannot {op} column "{key}" in a ' - 'multi-index. All levels must be ' - 'provided explicitly' - ).format(op=op_description, - key=key)) - - raise ValueError(('Cannot {op} duplicate column "{key}"' - ).format(op=op_description, - key=key)) - - elif axis == 1 and key in self.index.names: - k = self.index.get_level_values(key).values - else: - raise KeyError(key) - return k @Appender(_shared_docs['sort_values'] % _shared_doc_kwargs) def sort_values(self, by, axis=0, ascending=True, inplace=False, diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 5c6b5866ee2c0..947300a28e510 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -524,7 +524,6 @@ def __init__(self, left, right, how='inner', on=None, self.right_index = right_index self.indicator = indicator - self.has_common_index_levels = False if isinstance(self.indicator, compat.string_types): self.indicator_name = self.indicator @@ -651,7 +650,6 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): left_has_missing = None right_has_missing = None - new_index_values = {} keys = zip(self.join_names, self.left_on, self.right_on) for i, (name, lname, rname) in enumerate(keys): if not _should_fill(lname, rname): @@ -719,25 +717,7 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): if name in result: result[name] = key_col else: - if name and 
name in result.index.names: - new_index_values[name] = key_col - else: - result.insert( - i, name or 'key_{i}'.format(i=i), key_col) - - if new_index_values: - # Create new index for result - index_arrays = [new_index_values[n] - if n in new_index_values - else result.index.get_level_values(i) - for (i, n) in enumerate(result.index.names)] - - if len(index_arrays) == 1: - new_index = Index(index_arrays[0], name=result.index.name) - else: - new_index = MultiIndex.from_arrays(index_arrays, - names=result.index.names) - result.index = new_index + result.insert(i, name or 'key_{i}'.format(i=i), key_col) def _get_join_indexers(self): """ return the join indexers """ @@ -780,10 +760,7 @@ def _get_join_info(self): join_index = self.left.index.take(left_indexer) right_indexer = np.array([-1] * len(join_index)) else: - if not self.has_common_index_levels: - join_index = Index(np.arange(len(left_indexer))) - else: - join_index = self.left.index.take(left_indexer) + join_index = Index(np.arange(len(left_indexer))) if len(join_index) == 0: join_index = join_index.astype(object) @@ -815,29 +792,6 @@ def _get_merge_keys(self): is_rkey = lambda x: isinstance( x, (np.ndarray, Series)) and len(x) == len(right) - def get_key_vals(df, key): - """ - Return an array of values from a DataFrame corresponding to a - given key. This is a wrapper for - ``DataFrame._get_column_or_level_values()`` with `axis` and - `op_description` defined by the merge operation - - Parameters - ---------- - df: DataFrame - key: int or object - - Returns - ------- - values: array - - See Also - -------- - DataFrame._get_column_or_level_values - """ - return df._get_column_or_level_values(key, axis=self.axis, - op_description="merge on") - # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A # user could, for example, request 'left_index' and 'left_by'. 
In a # regular pd.merge(), users cannot specify both 'left_index' and @@ -858,7 +812,7 @@ def get_key_vals(df, key): join_names.append(None) # what to do? else: if rk is not None: - right_keys.append(get_key_vals(right, rk)) + right_keys.append(right[rk]._values) join_names.append(rk) else: # work-around for merge_asof(right_index=True) @@ -867,7 +821,7 @@ def get_key_vals(df, key): else: if not is_rkey(rk): if rk is not None: - right_keys.append(get_key_vals(right, rk)) + right_keys.append(right[rk]._values) else: # work-around for merge_asof(right_index=True) right_keys.append(right.index) @@ -880,7 +834,7 @@ def get_key_vals(df, key): else: right_keys.append(rk) if lk is not None: - left_keys.append(get_key_vals(left, lk)) + left_keys.append(left[lk]._values) join_names.append(lk) else: # work-around for merge_asof(left_index=True) @@ -892,7 +846,7 @@ def get_key_vals(df, key): left_keys.append(k) join_names.append(None) else: - left_keys.append(get_key_vals(left, k)) + left_keys.append(left[k]._values) join_names.append(k) if isinstance(self.right.index, MultiIndex): right_keys = [lev._values.take(lab) @@ -906,7 +860,7 @@ def get_key_vals(df, key): right_keys.append(k) join_names.append(None) else: - right_keys.append(get_key_vals(right, k)) + right_keys.append(right[k]._values) join_names.append(k) if isinstance(self.left.index, MultiIndex): left_keys = [lev._values.take(lab) @@ -915,49 +869,10 @@ def get_key_vals(df, key): else: left_keys = [self.left.index.values] - # Reset index levels that are not common to both DataFrames - common_index_levels = [(li, ri) for (li, ri) in - zip(self.left_on, self.right_on) if - isinstance(li, compat.string_types) and - li not in self.left and - isinstance(ri, compat.string_types) and - ri not in self.right] - - if common_index_levels: - common_levels_right, common_levels_left = ( - zip(*common_index_levels) - ) - - reset_left = [lev for lev in self.left.index.names - if lev not in common_levels_left] - if reset_left: - 
self.left.reset_index( - reset_left, - inplace=True) - - reset_right = [lev for lev in self.right.index.names - if lev not in common_levels_right] - if reset_right: - self.right.reset_index( - reset_right, - inplace=True) - - self.has_common_index_levels = True - if left_drop: - # Determine index levels to reset before dropping - levels_to_reset = [level for level in left_drop - if level not in self.left] - if levels_to_reset: - self.left = self.left.reset_index(levels_to_reset) self.left = self.left.drop(left_drop, axis=1) if right_drop: - # Determine index levels to reset before dropping - levels_to_reset = [level for level in right_drop - if level not in self.right] - if levels_to_reset: - self.right = self.right.reset_index(levels_to_reset) self.right = self.right.drop(right_drop, axis=1) return left_keys, right_keys, join_names From 1c4699eca7992a9aff9d37478bc4c1f92a8ae2eb Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Mon, 11 Sep 2017 12:04:54 -0400 Subject: [PATCH 04/34] Simplify and refactor column/level logic in merge --- pandas/core/frame.py | 113 +++++++++++++++++++++++++++++ pandas/core/reshape/merge.py | 46 ++++++++++-- pandas/tests/reshape/test_merge.py | 5 ++ 3 files changed, 157 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5991ec825c841..2d804da9bbf78 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2162,6 +2162,119 @@ def _getitem_frame(self, key): raise ValueError('Must pass DataFrame with boolean values only') return self.where(key) + # ------------------------------------------------------------------------- + # Column and Index Mixing + # + # A collection of helpers methods for DataFrame operations that accept + # mix of column and index levels. All such operations should utilize + # these methods as much as possible so that we have consistent precedence + # and validation logic. 
+ # + # General Notes: + # + # - If a column and index level share the same name, the column takes + # precedence + # + # - These methods assume axis=1 + # + # - Only string keys may be used to reference index levels. + + def _get_column_or_level_values(self, key): + """ + Return an array of values from a DataFrame column or named index level + + Parameters + ---------- + key: str or object + Label of column or index level. If `key` is present in the frame as + a column label, the corresponding column is chosen. Otherwise, + if `key` is a string and the is present in the frame as the name + of an index level, the corresponding index level is chosen. + Otherwise, a ``KeyError`` is raised. + + Returns + ------- + values: np.ndarray + + Raises + ------ + KeyError + if `key` matches neither a column label nor an index level name + + See Also + -------- + DataFrame._get_column_or_level_values + """ + if key in self: + if key in self.index.names: + warnings.warn( + ("'{key}' is both a column name and an index level.\n" + "Defaulting to column, but this will raise an " + "ambiguity error in a future version" + ).format(key=key), + FutureWarning, stacklevel=2) + + values = self[key]._values + elif key in self.index.names: + values = self.index.get_level_values(key)._values + else: + raise KeyError(key) + + return values + + def _is_index_reference(self, key): + """ + Test whether a key is an index level reference + + To be considered an index level reference `key` must be a string that + matches the name of an index level and does NOT match the label + of any column. 
+ + Parameters + ---------- + key: str or object + Label of column or index level + + Returns + ------- + is_index: bool + """ + return (isinstance(key, compat.string_types) and + key not in self.columns and + key in self.index.names) + + def _drop_columns_or_levels(self, drop_keys): + """ + Drop columns or levels from the dataframe + + Parameters + ---------- + drop_keys: single label or list-like of labels + + Returns + ------- + dropped: DataFrame + """ + drop_keys = com._maybe_make_list(drop_keys) + + # Perform copy upfront and then use inplace operations below. + # This ensures that we always perform exactly one copy. + # ``copy`` and/or ``inplace`` options could be added in the future. + dropped = self.copy() + + # Handle dropping index levels + levels_to_reset = [k for k in drop_keys if self._is_index_reference(k)] + if levels_to_reset: + dropped.reset_index(levels_to_reset, inplace=True) + + # Handle dropping columns + cols_to_drop = [k for k in drop_keys if not self._is_index_reference(k)] + if cols_to_drop: + dropped.drop(drop_keys, axis=1, inplace=True) + + return dropped + + def query(self, expr, inplace=False, **kwargs): """Query the columns of a frame with a boolean expression. diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 947300a28e510..e5cf946266db8 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -603,6 +603,8 @@ def get_result(self): self._maybe_add_join_keys(result, left_indexer, right_indexer) + self._maybe_restore_index_levels(result) + return result def _indicator_pre_merge(self, left, right): @@ -645,6 +647,34 @@ def _indicator_post_merge(self, result): axis=1) return result + def _maybe_restore_index_levels(self, result): + """ + Restore index levels specified as `on` parameters to the index + + Here we check for cases where `self.left_on` and `self.right_on` pairs + each reference an index level in their respective DataFrames. 
The + joined columns corresponding to these pairs are then restored to the + index of `result`. + + **Note:** This method has side effects. It modifies `result` in-place + + Parameters + ---------- + result: DataFrame + merge result + """ + names_to_restore = [] + for name, left_key, right_key in zip(self.join_names, + self.left_on, + self.right_on): + if self.orig_left._is_index_reference(left_key) and \ + self.orig_right._is_index_reference(right_key): + + names_to_restore.append(name) + + if names_to_restore: + result.set_index(names_to_restore, inplace=True) + def _maybe_add_join_keys(self, result, left_indexer, right_indexer): left_has_missing = None @@ -812,7 +842,8 @@ def _get_merge_keys(self): join_names.append(None) # what to do? else: if rk is not None: - right_keys.append(right[rk]._values) + right_keys.append( + right._get_column_or_level_values(rk)) join_names.append(rk) else: # work-around for merge_asof(right_index=True) @@ -821,7 +852,8 @@ def _get_merge_keys(self): else: if not is_rkey(rk): if rk is not None: - right_keys.append(right[rk]._values) + right_keys.append( + right._get_column_or_level_values(rk)) else: # work-around for merge_asof(right_index=True) right_keys.append(right.index) @@ -834,7 +866,7 @@ def _get_merge_keys(self): else: right_keys.append(rk) if lk is not None: - left_keys.append(left[lk]._values) + left_keys.append(left._get_column_or_level_values(lk)) join_names.append(lk) else: # work-around for merge_asof(left_index=True) @@ -846,7 +878,7 @@ def _get_merge_keys(self): left_keys.append(k) join_names.append(None) else: - left_keys.append(left[k]._values) + left_keys.append(left._get_column_or_level_values(k)) join_names.append(k) if isinstance(self.right.index, MultiIndex): right_keys = [lev._values.take(lab) @@ -860,7 +892,7 @@ def _get_merge_keys(self): right_keys.append(k) join_names.append(None) else: - right_keys.append(right[k]._values) + right_keys.append(right._get_column_or_level_values(k)) join_names.append(k) if 
isinstance(self.left.index, MultiIndex): left_keys = [lev._values.take(lab) @@ -870,10 +902,10 @@ def _get_merge_keys(self): left_keys = [self.left.index.values] if left_drop: - self.left = self.left.drop(left_drop, axis=1) + self.left = self.left._drop_columns_or_levels(left_drop) if right_drop: - self.right = self.right.drop(right_drop, axis=1) + self.right = self.right._drop_columns_or_levels(right_drop) return left_keys, right_keys, join_names diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index 3f13f0b7926d3..e6f87a07c9362 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -1448,6 +1448,11 @@ def test_merge_on_index_and_column_multi2(self): assert_frame_equal(result, expected) # Same result when index/column order is flipped + expected = (df1.reset_index() + .merge(df2.reset_index(), + on=['inner', 'outer'], how=how) + .set_index(['inner', 'outer'])) + result = df1.merge(df2, on=['inner', 'outer'], how=how) assert_frame_equal(result, expected) From ac1189b5e9496f8cc8316dd5f5e4ae70b0a2c652 Mon Sep 17 00:00:00 2001 From: "Jon M. 
Mease" Date: Mon, 11 Sep 2017 12:21:31 -0400 Subject: [PATCH 05/34] PEP8 cleanup --- pandas/core/frame.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2d804da9bbf78..72e1ea94369dd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2263,18 +2263,19 @@ def _drop_columns_or_levels(self, drop_keys): dropped = self.copy() # Handle dropping index levels - levels_to_reset = [k for k in drop_keys if self._is_index_reference(k)] + levels_to_reset = [k for k in drop_keys + if self._is_index_reference(k)] if levels_to_reset: dropped.reset_index(levels_to_reset, inplace=True) # Handle dropping columns - cols_to_drop = [k for k in drop_keys if not self._is_index_reference(k)] + cols_to_drop = [k for k in drop_keys + if not self._is_index_reference(k)] if cols_to_drop: dropped.drop(drop_keys, axis=1, inplace=True) return dropped - def query(self, expr, inplace=False, **kwargs): """Query the columns of a frame with a boolean expression. From d90ed78d2e2692e6a171c700562de085b3b4d4e0 Mon Sep 17 00:00:00 2001 From: "Jon M. 
Mease" Date: Mon, 11 Sep 2017 13:13:38 -0400 Subject: [PATCH 06/34] Extract column/level ambiguity warning logic into utility method --- pandas/core/frame.py | 49 +++++++++++++++++++++++++++++------- pandas/core/groupby.py | 10 ++------ pandas/core/reshape/merge.py | 4 +++ 3 files changed, 46 insertions(+), 17 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 72e1ea94369dd..09ff1df82d167 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2206,22 +2206,53 @@ def _get_column_or_level_values(self, key): DataFrame._get_column_or_level_values """ if key in self: - if key in self.index.names: - warnings.warn( - ("'{key}' is both a column name and an index level.\n" - "Defaulting to column, but this will raise an " - "ambiguity error in a future version" - ).format(key=key), - FutureWarning, stacklevel=2) - + self._check_column_or_level_ambiguity(key) values = self[key]._values - elif key in self.index.names: + elif self._is_index_reference(key): values = self.index.get_level_values(key)._values else: raise KeyError(key) return values + def _check_column_or_level_ambiguity(self, key): + """ + Check whether `key` matches both a column label and an index level + and issue a ``FutureWarning`` if this is the case. + + Note: This method will be altered to raise an ambiguity exception in + a future version. + + Parameters + ---------- + key: str or object + Label of column or index level + + Returns + ------- + ambiguous: bool + + Raises + ------ + FutureWarning + if `key` is ambiguous. 
This will become an ambiguity error in a + future version + + """ + if (isinstance(key, compat.string_types) and + key in self.columns and + key in self.index.names): + + warnings.warn( + ("'{key}' is both a column name and an index level.\n" + "Defaulting to column, but this will raise an " + "ambiguity error in a future version" + ).format(key=key), FutureWarning) + + return True + else: + return False + def _is_index_reference(self, key): """ Test whether a key is an index level reference diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 248f3b2095a78..8283da2e805ec 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2679,16 +2679,10 @@ def is_in_obj(gpr): elif is_in_axis(gpr): # df.groupby('name') if gpr in obj: - if gpr in obj.index.names: - warnings.warn( - ("'%s' is both a column name and an index level.\n" - "Defaulting to column but " - "this will raise an ambiguity error in a " - "future version") % gpr, - FutureWarning, stacklevel=5) + obj._check_column_or_level_ambiguity(gpr) in_axis, name, gpr = True, gpr, obj[gpr] exclusions.append(name) - elif gpr in obj.index.names: + elif obj._is_index_reference(gpr): in_axis, name, level, gpr = False, None, gpr, None else: raise KeyError(gpr) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index e5cf946266db8..d505de9e5fc9d 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -662,6 +662,10 @@ def _maybe_restore_index_levels(self, result): ---------- result: DataFrame merge result + + Returns + ------- + None """ names_to_restore = [] for name, left_key, right_key in zip(self.join_names, From 27b2d25d1724a7a0343e16230b0bcee39a6d6b2a Mon Sep 17 00:00:00 2001 From: "Jon M. 
Mease" Date: Mon, 11 Sep 2017 13:24:07 -0400 Subject: [PATCH 07/34] Add newline and add :ref: entry for new doc section --- doc/source/merging.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/source/merging.rst b/doc/source/merging.rst index dbd8bd0cc3981..b5001daa344c5 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -504,7 +504,7 @@ the data in DataFrame. See the :ref:`cookbook` for some advanced strategies. Users who are familiar with SQL but new to pandas might be interested in a -:ref:`comparison with SQL`. +:ref:`comparison with SQL`. pandas provides a single function, ``merge``, as the entry point for all standard database join operations between DataFrame objects: @@ -1122,8 +1122,11 @@ This is not Implemented via ``join`` at-the-moment, however it can be done using labels=['left', 'right'], vertical=False); plt.close('all'); +.. _merging.merge_on_columns_and_levels: + Merging on a combination of columns and index levels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + .. versionadded:: 0.21 Strings passed as the ``on``, ``left_on``, and ``right_on`` parameters From de6f4b1aeffd4bad0e0ffd77e4dd4602c93b9822 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Mon, 11 Sep 2017 14:21:12 -0400 Subject: [PATCH 08/34] docstring / comment cleanup --- pandas/core/frame.py | 19 +++++++++---------- pandas/core/reshape/merge.py | 2 +- pandas/tests/reshape/test_merge.py | 3 ++- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 09ff1df82d167..6fbcc8abb1b60 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2163,21 +2163,24 @@ def _getitem_frame(self, key): return self.where(key) # ------------------------------------------------------------------------- - # Column and Index Mixing + # Column and Index Combination Helpers # - # A collection of helpers methods for DataFrame operations that accept - # mix of column and index levels. 
All such operations should utilize - # these methods as much as possible so that we have consistent precedence - # and validation logic. + # A collection of helper methods for DataFrame operations that accept a + # combination of columns and index levels. All such operations should + # utilize/extend these methods when possible so that we have consistent + # precedence and validation logic throughout the library. # # General Notes: # # - If a column and index level share the same name, the column takes - # precedence + # precedence. Currently a ``FutureWarning`` should be issued in this + # situation. In a future version we will convert this into an + # exception. # # - These methods assume axis=1 # # - Only string keys may be used to reference index levels. + # def _get_column_or_level_values(self, key): """ @@ -2200,10 +2203,6 @@ def _get_column_or_level_values(self, key): ------ KeyError if `key` matches neither a column label nor an index level name - - See Also - -------- - DataFrame._get_column_or_level_values """ if key in self: self._check_column_or_level_ambiguity(key) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index d505de9e5fc9d..f621f7f96f1bc 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -649,7 +649,7 @@ def _indicator_post_merge(self, result): def _maybe_restore_index_levels(self, result): """ - Restore index levels specified as `on` parameters to the index + Restore index levels specified as `on` parameters Here we check for cases where `self.left_on` and `self.right_on` pairs each reference an index level in their respective DataFrames. 
The diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index e6f87a07c9362..0525d5967b928 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -70,6 +70,7 @@ def test_merge_common(self): def test_merge_index_as_on_arg(self): # GH14355 + left = self.df.set_index('key1') right = self.df2.set_index('key1') result = merge(left, right, on='key1') @@ -1447,7 +1448,7 @@ def test_merge_on_index_and_column_multi2(self): result = df1.merge(df2, on=['outer', 'inner'], how=how) assert_frame_equal(result, expected) - # Same result when index/column order is flipped + # Flip index/column order expected = (df1.reset_index() .merge(df2.reset_index(), on=['inner', 'outer'], how=how) From 5b1b1005beea70f61e9a6f1625cea065c26a0169 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Mon, 9 Oct 2017 15:30:33 -0400 Subject: [PATCH 09/34] Documentation updates - Added whatsnew 0.21 subsection - Updated DataFrame.merge() API docstring --- doc/source/merging.rst | 4 ++-- doc/source/whatsnew/v0.21.0.txt | 31 +++++++++++++++++++++++++++++++ pandas/core/frame.py | 17 +++++++++-------- 3 files changed, 42 insertions(+), 10 deletions(-) diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 855c2bbf24b54..b6da7e8b0c002 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -1135,8 +1135,8 @@ Merging on a combination of columns and index levels .. versionadded:: 0.21 Strings passed as the ``on``, ``left_on``, and ``right_on`` parameters -may refer to either column names or index level names. This enables -the merging of DataFrames on a combination of index levels and columns without +may refer to either column names or index level names. This enables merging +``DataFrame`` instances on a combination of index levels and columns without resetting indexes. .. 
ipython:: python diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index f3587510e2207..994d7d887efbe 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -165,6 +165,37 @@ and new ``CategoricalDtype``. See the :ref:`CategoricalDtype docs ` for more. + +.. _whatsnew_0210.enhancements.merge_on_columns_and_levels: + +Merging on a combination of columns and index levels +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Strings passed to :meth:`DataFrame.merge` as the ``on``, ``left_on``, and ``right_on`` +parameters may now refer to either column names or index level names. This enables +merging ``DataFrame`` instances on a combination of index levels and columns +without resetting indexes. See the :ref:`Merge on columns and levels +<merging.merge_on_columns_and_levels>` documentation section. + +.. ipython:: python + + left_index = pd.Index(['K0', 'K0', 'K1', 'K2'], name='key1') + + left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], + 'B': ['B0', 'B1', 'B2', 'B3'], + 'key2': ['K0', 'K1', 'K0', 'K1']}, + index=left_index) + + right_index = pd.Index(['K0', 'K1', 'K2', 'K2'], name='key1') + + right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], + 'D': ['D0', 'D1', 'D2', 'D3'], + 'key2': ['K0', 'K0', 'K0', 'K1']}, + index=right_index) + + left.merge(right, on=['key1', 'key2']) + + .. _whatsnew_0210.enhancements.other: Other Enhancements diff --git a/pandas/core/frame.py b/pandas/core/frame.py index babd2d9b8e6a3..5b4162ad0db39 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -139,16 +139,17 @@ * inner: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys on : label or list - Field names to join on. Must be found in both DataFrames. If on is - None and not merging on indexes, then it merges on the intersection of - the columns by default. + Column or index level names to join on. These must be found in both + DataFrames.
If on is None and not merging on indexes then this defaults to + the intersection of the columns in both DataFrames. left_on : label or list, or array-like - Field names to join on in left DataFrame. Can be a vector or list of - vectors of the length of the DataFrame to use a particular vector as - the join key instead of columns + Column or index level names to join on in the left DataFrame. Can also + be a vector or list of vectors of the length of the left DataFrame. + These vectors are treated as though they are columns. right_on : label or list, or array-like - Field names to join on in right DataFrame or vector/list of vectors per - left_on docs + Column or index level names to join on in the right DataFrame. Can also + be a vector or list of vectors of the length of the right DataFrame. + These vectors are treated as though they are columns. left_index : boolean, default False Use the index from the left DataFrame as the join key(s). If it is a MultiIndex, the number of keys in the other DataFrame (either the index From dfc6cf79cc119cc8907305d811036b7e14a330b2 Mon Sep 17 00:00:00 2001 From: "Jon M. 
Mease" Date: Mon, 9 Oct 2017 19:32:07 -0400 Subject: [PATCH 10/34] Fix errors in _drop_columns_or_levels --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5b4162ad0db39..36946a6b66f6f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2295,13 +2295,13 @@ def _drop_columns_or_levels(self, drop_keys): levels_to_reset = [k for k in drop_keys if self._is_index_reference(k)] if levels_to_reset: - dropped.reset_index(levels_to_reset, inplace=True) + dropped.reset_index(levels_to_reset, drop=True, inplace=True) # Handle dropping columns cols_to_drop = [k for k in drop_keys if not self._is_index_reference(k)] if cols_to_drop: - dropped.drop(drop_keys, axis=1, inplace=True) + dropped.drop(cols_to_drop, axis=1, inplace=True) return dropped From 03e3c2ed7efadc304d62c72fa5d50b099fc537fd Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Mon, 9 Oct 2017 19:35:41 -0400 Subject: [PATCH 11/34] Refactor and parametrize test cases --- pandas/tests/reshape/test_merge.py | 281 ++++++++++++++++++----------- 1 file changed, 171 insertions(+), 110 deletions(-) diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index db5fe71564a4a..4b487f10ea5cb 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -1359,137 +1359,198 @@ def f(): household.join(log_return, how='outer') pytest.raises(NotImplementedError, f) - def test_merge_on_index_and_column(self): - # GH14355 - - # Construct DataFrames - df1 = DataFrame(dict( - outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], - inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2], - v1=np.linspace(0, 1, 11) - )).set_index(['outer']) - - df2 = DataFrame(dict( - outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], - inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], - v2=np.linspace(10, 11, 12) - )).set_index(['outer']) - # Test merge on outer (index) and inner (column) - for how in ['inner', 'left', 'right', 
'outer']: - expected = (df1.reset_index() - .merge(df2.reset_index(), - on=['outer', 'inner'], how=how) - .set_index('outer')) +class TestMergeColumnAndIndex(object): + # GH14355 - result = df1.merge(df2, on=['outer', 'inner'], how=how) - assert_frame_equal(result, expected) - - # Same result when index/column order is flipped - result = df1.merge(df2, on=['inner', 'outer'], how=how) - assert_frame_equal(result, expected) - - def test_merge_on_index_and_column_multi(self): - # GH14355 - - # Construct DataFrames - df1 = DataFrame(dict( + @staticmethod + def df_left_with_index(levels): + """ Construct left test DataFrame with specified levels + (any of 'outer', 'inner', and 'v1')""" + res = DataFrame(dict( outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2], v1=np.linspace(0, 1, 11) - )).set_index(['outer', 'inner']) + )) - df2 = DataFrame(dict( - outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], - inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], - v2=np.linspace(10, 11, 12) - )).set_index(['outer']) - - # Test merge on outer (both index) and inner (one as index and - # one as column) - for how in ['inner', 'left', 'right', 'outer']: - expected = (df1.reset_index() - .merge(df2.reset_index(), - on=['outer', 'inner'], how=how) - .set_index('outer')) - - result = df1.merge(df2, - on=['outer', 'inner'], how=how) - assert_frame_equal(result, expected) - - # Same result when index/column order is flipped - result = df1.merge(df2, - on=['inner', 'outer'], how=how) - assert_frame_equal(result, expected) - - def test_merge_on_index_and_column_multi2(self): - # GH14355 + if levels: + res = res.set_index(levels) - # Construct DataFrames - df1 = DataFrame(dict( - outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], - inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2], - v1=np.linspace(0, 1, 11) - )).set_index(['outer', 'inner']) + return res - df2 = DataFrame(dict( + @staticmethod + def df_right_with_index(levels): + """ Construct right test DataFrame with specified levels + (any of 
'outer', 'inner', and 'v2')""" + res = DataFrame(dict( outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], v2=np.linspace(10, 11, 12) - )).set_index(['outer', 'inner']) + )) + + if levels: + res = res.set_index(levels) + + return res + + @staticmethod + def compute_expected(df_left, df_right, + on=None, left_on=None, right_on=None, how=None): + """ + Compute the expected merge result for the test case. + + This method computes the expected result of merging two DataFrames on + a combination of their columns and index levels. It does so by + explicitly dropping/resetting their named index levels, performing a + merge on their columns, and then finally restoring the appropriate + index in the result. + + Parameters + ---------- + df_left : DataFrame + The left DataFrame (may have zero or more named index levels) + df_right : DataFrame + The right DataFrame (may have zero or more named index levels) + on : list of str + The on parameter to the merge operation + left_on : list of str + The left_on parameter to the merge operation + right_on : list of str + The right_on parameter to the merge operation + how : str + The how parameter to the merge operation + + Returns + ------- + DataFrame + The expected merge result + """ + + # Handle on param if specified + if on is not None: + left_on, right_on = on, on + + # Compute input named index levels + left_levels = [n for n in df_left.index.names if n is not None] + right_levels = [n for n in df_right.index.names if n is not None] + + # Compute output named index levels + output_levels = [i for i in left_on + if i in right_levels and i in left_levels] + + # Drop index levels that aren't involved in the merge + drop_left = [n for n in left_levels if n not in left_on] + if drop_left: + df_left = df_left.reset_index(drop_left, drop=True) + + drop_right = [n for n in right_levels if n not in right_on] + if drop_right: + df_right = df_right.reset_index(drop_right, drop=True) + + # Convert 
remaining index levels to columns + reset_left = [n for n in left_levels if n in left_on] + if reset_left: + df_left = df_left.reset_index(level=reset_left) + + reset_right = [n for n in right_levels if n in right_on] + if reset_right: + df_right = df_right.reset_index(level=reset_right) + + # Perform merge + expected = df_left.merge(df_right, + left_on=left_on, + right_on=right_on, + how=how) + + # Restore index levels + if output_levels: + expected = expected.set_index(output_levels) + + return expected + + @pytest.mark.parametrize('left_levels', + [[], ['outer'], ['outer', 'inner']]) + @pytest.mark.parametrize('right_levels', + [[], ['outer'], ['outer', 'inner']]) + @pytest.mark.parametrize('on', + [['outer'], ['inner'], + ['outer', 'inner'], + ['inner', 'outer']]) + @pytest.mark.parametrize('how', ['inner', 'left', 'right', 'outer']) + def test_merge_indexes_and_columns_on( + self, left_levels, right_levels, on, how): + + # Construct test DataFrames + df_left = self.df_left_with_index(left_levels) + df_right = self.df_right_with_index(right_levels) + + # Construct expected result + expected = self.compute_expected(df_left, df_right, on=on, how=how) + + # Perform merge + result = df_left.merge(df_right, on=on, how=how) + assert_frame_equal(result, expected, check_like=True) + + @pytest.mark.parametrize('left_levels', + [[], ['outer'], ['outer', 'inner']]) + @pytest.mark.parametrize('right_levels', + [[], ['outer'], ['outer', 'inner']]) + @pytest.mark.parametrize('left_on,right_on', + [(['outer'], ['outer']), (['inner'], ['inner']), + (['outer', 'inner'], ['outer', 'inner']), + (['inner', 'outer'], ['inner', 'outer'])]) + @pytest.mark.parametrize('how', ['inner', 'left', 'right', 'outer']) + def test_merge_indexes_and_columns_lefton_righton( + self, left_levels, right_levels, left_on, right_on, how): + + # Construct test DataFrames + df_left = self.df_left_with_index(left_levels) + df_right = self.df_right_with_index(right_levels) + + # Construct expected result 
+ expected = self.compute_expected(df_left, df_right, + left_on=left_on, + right_on=right_on, + how=how) + + # Perform merge + result = df_left.merge(df_right, + left_on=left_on, right_on=right_on, how=how) + assert_frame_equal(result, expected, check_like=True) - # Test merge on outer (both index) and inner (both index) - for how in ['inner', 'left', 'right', 'outer']: - expected = (df1.reset_index() - .merge(df2.reset_index(), - on=['outer', 'inner'], how=how) - .set_index(['outer', 'inner'])) + def test_merge_index_column_precedence(self): - result = df1.merge(df2, on=['outer', 'inner'], how=how) - assert_frame_equal(result, expected) + # Construct df_left with both an index and a column named 'outer'. + # We make this 'outer' column equal to the 'inner' column so that we + # can verify that the correct values are used by the merge operation + df_left = self.df_left_with_index(['outer']) + df_left['outer'] = df_left['inner'] - # Flip index/column order - expected = (df1.reset_index() - .merge(df2.reset_index(), - on=['inner', 'outer'], how=how) - .set_index(['inner', 'outer'])) + # Construct df_right with an index level named 'outer' + df_right = self.df_right_with_index(['outer']) - result = df1.merge(df2, on=['inner', 'outer'], how=how) - assert_frame_equal(result, expected) + # Construct expected result. 
+ # The 'outer' column from df_left is chosen and the resulting + # frame has no index levels + expected = (df_left.reset_index(level='outer', drop=True) + .merge(df_right.reset_index(), on=['outer', 'inner'])) - def test_merge_index_column_precedence(self): - # GH14355 - - # Construct DataFrames - df1 = DataFrame(dict( - outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], - inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2], - v1=np.linspace(0, 1, 11) - )).set_index(['outer']) - # - df1 has both a column and index named 'outer' - df1['outer'] = df1['inner'] + # Merge df_left and df_right on 'outer' and 'inner' + # 'outer' for df_left should refer to the 'outer' column, not the + # 'outer' index level and a FutureWarning should be raised + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = df_left.merge(df_right, on=['outer', 'inner']) - df2 = DataFrame(dict( - outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], - inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], - v2=np.linspace(10, 11, 12) - )).set_index(['outer']) + # Check results + assert_frame_equal(result, expected) - # Merge df1 and df2 on 'outer' and 'inner' - # - 'outer' for df1 should refer to the 'outer' column + # Perform the same using the left_on and right_on parameters with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df1.merge(df2, on=['outer', 'inner']) + result = df_left.merge(df_right, + left_on=['outer', 'inner'], + right_on=['outer', 'inner']) - # Remove 'outer' index from df1 prior to merge - expected = df1.reset_index(drop=True).merge(df2.reset_index(), - on=['outer', 'inner']) - # Remove 'outer' column from df1 prior to merge - not_expected = df1.drop('outer', axis=1).reset_index().merge( - df2.reset_index(), on=['outer', 'inner']) - - # Check results assert_frame_equal(result, expected) - assert not result.equals(not_expected) @pytest.fixture From bf5d3498f67af7a6761fa560098ca20cd7a4d060 Mon Sep 17 00:00:00 2001 From: "Jon M. 
Mease" Date: Tue, 10 Oct 2017 16:36:22 -0400 Subject: [PATCH 12/34] Moved label/level helpers up to NDFrame, added axis support, and added test suite --- pandas/core/frame.py | 239 +++++------ pandas/core/generic.py | 178 +++++++++ pandas/core/groupby.py | 4 +- pandas/core/reshape/merge.py | 18 +- pandas/core/series.py | 54 +++ .../indexes/test_label_or_level_utils.py | 378 ++++++++++++++++++ 6 files changed, 745 insertions(+), 126 deletions(-) create mode 100644 pandas/tests/indexes/test_label_or_level_utils.py diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 36946a6b66f6f..1e61f34b5197d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -68,7 +68,7 @@ standardize_mapping) from pandas.core.generic import NDFrame, _shared_docs from pandas.core.index import (Index, MultiIndex, _ensure_index, - _ensure_index_from_sequences) + _ensure_index_from_sequences, RangeIndex) from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable, check_bool_indexer) from pandas.core.internals import (BlockManager, @@ -2162,146 +2162,155 @@ def _getitem_frame(self, key): return self.where(key) # ------------------------------------------------------------------------- - # Column and Index Combination Helpers - # - # A collection of helper methods for DataFrame operations that accept a - # combination of columns and index levels. All such operations should - # utilize/extend these methods when possible so that we have consistent - # precedence and validation logic throughout the library. - # - # General Notes: - # - # - If a column and index level share the same name, the column takes - # precedence. Currently a ``FutureWarning`` should be issued in this - # situation. In a future version we will convert this into an - # exception. - # - # - These methods assume axis=1 - # - # - Only string keys may be used to reference index levels. 
- # - - def _get_column_or_level_values(self, key): - """ - Return an array of values from a DataFrame column or named index level + # Label or Level Combination Helpers - Parameters - ---------- - key: str or object - Label of column or index level. If `key` is present in the frame as - a column label, the corresponding column is chosen. Otherwise, - if `key` is a string and the is present in the frame as the name - of an index level, the corresponding index level is chosen. - Otherwise, a ``KeyError`` is raised. + @Appender(_shared_docs['_is_level_reference']) + def _is_level_reference(self, key, axis=0): + axis = self._get_axis_number(axis) + if axis == 0: + return (isinstance(key, compat.string_types) and + key not in self.columns and + key in self.index.names) + elif axis == 1: + return (isinstance(key, compat.string_types) and + key not in self.index and + key in self.columns.names) - Returns - ------- - values: np.ndarray + @Appender(_shared_docs['_is_label_reference']) + def _is_label_reference(self, key, axis=0): + axis = self._get_axis_number(axis) + if axis == 0: + return (isinstance(key, compat.string_types) and + key in self.columns) + elif axis == 1: + return (isinstance(key, compat.string_types) and + key in self.index) - Raises - ------ - KeyError - if `key` matches neither a column label nor an index level name - """ - if key in self: - self._check_column_or_level_ambiguity(key) - values = self[key]._values - elif self._is_index_reference(key): - values = self.index.get_level_values(key)._values - else: - raise KeyError(key) + @Appender(_shared_docs['_check_label_or_level_ambiguity']) + def _check_label_or_level_ambiguity(self, key, axis=0): - return values + axis = self._get_axis_number(axis) - def _check_column_or_level_ambiguity(self, key): - """ - Check whether `key` matches both a column label and an index level - and issue a ``FutureWarning`` if this is the case. 
+ def raise_warning(): - Note: This method will be altered to raise an ambiguity exception in - a future version. + # Build an informative and grammatical warning + level_article, level_type = (('an', 'index') + if axis==0 else + ('a', 'column')) - Parameters - ---------- - key: str or object - Label of column or index level + label_article, label_type = (('a', 'column') + if axis==0 else + ('an', 'index')) - Returns - ------- - ambiguous: bool + warnings.warn( + ("'{key}' is both {level_article} {level_type} level and " + "{label_article} {label_type} label.\n" + "Defaulting to {label_type}, but this will raise an " + "ambiguity error in a future version" + ).format(key=key, + level_article=level_article, + level_type=level_type, + label_article=label_article, + label_type=label_type), FutureWarning) - Raises - ------ - FutureWarning - if `key` is ambiguous. This will become an ambiguity error in a - future version + if axis == 0: + if (isinstance(key, compat.string_types) and + key in self.columns and + key in self.index.names): - """ - if (isinstance(key, compat.string_types) and - key in self.columns and - key in self.index.names): + raise_warning() + return True + else: + return False + else: + if (isinstance(key, compat.string_types) and + key in self.index and + key in self.columns.names): - warnings.warn( - ("'{key}' is both a column name and an index level.\n" - "Defaulting to column, but this will raise an " - "ambiguity error in a future version" - ).format(key=key), FutureWarning) + raise_warning() + return True + else: + return False - return True + @Appender(_shared_docs['_get_label_or_level_values']) + def _get_label_or_level_values(self, key, axis=0): + axis = self._get_axis_number(axis) + if axis == 0: + if key in self: + self._check_label_or_level_ambiguity(key, axis=axis) + values = self[key]._values + elif self._is_level_reference(key, axis=axis): + values = self.index.get_level_values(key)._values + else: + raise KeyError(key) else: - return 
False + if key in self.index: + self._check_label_or_level_ambiguity(key, axis=axis) + values = self.loc[key]._values + elif self._is_level_reference(key, axis=axis): + values = self.columns.get_level_values(key)._values + else: + raise KeyError(key) - def _is_index_reference(self, key): - """ - Test whether a key is an index level reference + # Check for duplicates + if values.ndim > 1: + label_axis_name = 'column' if axis == 0 else 'index' + raise ValueError(("The {label_axis_name} label '{key}' " + "is not unique") + .format(key=key, + label_axis_name=label_axis_name)) - To be considered an index level reference `key` must be a string that - matches the name of an index level and does NOT match the label - of any column. + return values - Parameters - ---------- - key: str or object - Label of column or index level + @Appender(_shared_docs['_drop_labels_or_levels']) + def _drop_labels_or_levels(self, keys, axis=0): + axis = self._get_axis_number(axis) + keys = com._maybe_make_list(keys) - Returns - ------- - is_index: bool - """ - return (isinstance(key, compat.string_types) and - key not in self.columns and - key in self.index.names) + # Validate keys + invalid_keys = [k for k in keys if not + self._is_label_or_level_reference(k, axis=axis)] - def _drop_columns_or_levels(self, drop_keys): - """ - Drop columns or levels from the dataframe + if invalid_keys: + raise ValueError(("The following keys are not valid labels or " + "levels for {axis}: {invalid_keys}") + .format(axis=axis, + invalid_keys=invalid_keys)) - Parameters - ---------- - drop_keys: single label or list-like of labels + # Compute levels and labels to drop + levels_to_drop = [k for k in keys + if self._is_level_reference(k, axis=axis)] - Returns - ------- - dropped: DataFrame - """ - drop_keys = com._maybe_make_list(drop_keys) + labels_to_drop = [k for k in keys + if not self._is_level_reference(k, axis=axis)] # Perform copy upfront and then use inplace operations below. 
# This ensures that we always perform exactly one copy. # ``copy`` and/or ``inplace`` options could be added in the future. dropped = self.copy() - # Handle dropping index levels - levels_to_reset = [k for k in drop_keys - if self._is_index_reference(k)] - if levels_to_reset: - dropped.reset_index(levels_to_reset, drop=True, inplace=True) - - # Handle dropping columns - cols_to_drop = [k for k in drop_keys - if not self._is_index_reference(k)] - if cols_to_drop: - dropped.drop(cols_to_drop, axis=1, inplace=True) + if axis == 0: + # Handle dropping index levels + if levels_to_drop: + dropped.reset_index(levels_to_drop, drop=True, inplace=True) + + # Handle dropping columns labels + if labels_to_drop: + dropped.drop(labels_to_drop, axis=1, inplace=True) + else: + # Handle dropping column levels + if levels_to_drop: + if isinstance(dropped.columns, MultiIndex): + # Drop the specified levels from the MultiIndex + dropped.columns = dropped.columns.droplevel(levels_to_drop) + else: + # Drop the last level of Index by replacing with + # a RangeIndex + dropped.columns = RangeIndex(dropped.columns.size) + + # Handle dropping index labels + if labels_to_drop: + dropped.drop(labels_to_drop, axis=0, inplace=True) return dropped diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5dd770b2600a0..7076938a10a8e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1001,6 +1001,184 @@ def equals(self, other): return False return self._data.equals(other._data) + # ------------------------------------------------------------------------- + # Label or Level Combination Helpers + # + # A collection of helper methods for DataFrame/Series operations that + # accept a combination of column/index labels and levels. All such + # operations should utilize/extend these methods when possible so that we + # have consistent precedence and validation logic throughout the library. 
+ + _shared_docs['_is_level_reference'] = """ + Test whether a key is a level reference for a given axis. + + To be considered a level reference, `key` must be a string that: + - (axis=0): Matches the name of an index level and does NOT match + a column label. + - (axis=1): Matches the name of a column level and does NOT match + an index label. + + Parameters + ---------- + key: str + Potential level name for the given axis + axis: int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + is_level: bool""" + + @Appender(_shared_docs['_is_level_reference']) + def _is_level_reference(self, key, axis=0): + raise NotImplementedError() + + + _shared_docs['_is_label_reference'] = """ + Test whether a key is a label reference for a given axis. + + To be considered a label reference, `key` must be a string that: + - (axis=0): Matches a column label + - (axis=1): Matches an index label + + Parameters + ---------- + key: str + Potential label name + axis: int, default 0 + Axis perpendicular to the axis that labels are associated with + (0 means search for column labels, 1 means search for index labels) + + Returns + ------- + is_label: bool""" + + @Appender(_shared_docs['_is_label_reference']) + def _is_label_reference(self, key, axis=0): + raise NotImplementedError() + + + _shared_docs['_is_label_or_level_reference'] = """ + Test whether a key is a label or level reference for a given axis. 
+ + To be considered either a label or a level reference, `key` must be a + string that: + - (axis=0): Matches a column label or an index level + - (axis=1): Matches an index label or a column level + + Parameters + ---------- + key: str + Potential label or level name + axis: int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + is_label_or_level: bool""" + + @Appender(_shared_docs['_is_label_or_level_reference']) + def _is_label_or_level_reference(self, key, axis=0): + return (self._is_level_reference(key, axis=axis) or + self._is_label_reference(key, axis=axis)) + + + _shared_docs['_check_label_or_level_ambiguity'] = """ + Check whether `key` matches both a level of the input `axis` and a + label of the other axis and raise a ``FutureWarning`` if this is the + case. + + Note: This method will be altered to raise an ambiguity exception in + a future version. + + Parameters + ---------- + key: str or object + label or level name + + axis: int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + ambiguous: bool + + Raises + ------ + FutureWarning + if `key` is ambiguous. This will become an ambiguity error in a + future version + """ + + @Appender(_shared_docs['_check_label_or_level_ambiguity']) + def _check_label_or_level_ambiguity(self, key, axis=0): + return False + + _shared_docs['_get_label_or_level_values'] = """ + + Return a 1-D array of values associated with `key`, a label or level + from the given `axis`. + + Retrieval logic: + - (axis=0): Return column values if `key` matches a column label. + Otherwise return index level values if `key` matches an index + level. + - (axis=1): Return row values if `key` matches an index label. + Otherwise return column level values if 'key' matches a column + level + + Parameters + ---------- + key: str + Label or level name. 
+ axis: int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + values: np.ndarray + + Raises + ------ + KeyError + if `key` matches neither a label nor a level + ValueError + if `key` matches multiple labels""" + + @Appender(_shared_docs['_get_label_or_level_values']) + def _get_label_or_level_values(self, key, axis=0): + raise NotImplementedError() + + _shared_docs['_drop_labels_or_levels'] = """ + + Drop labels and/or levels for the given `axis`. + + For each key in `keys`: + - (axis=0): If key matches a column label then drop the column. + Otherwise if key matches an index level then drop the level. + - (axis=1): If key matches an index label then drop the row. + Otherwise if key matches a column level then drop the level. + + Parameters + ---------- + keys: str or list of str + labels or levels to drop + axis: int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + dropped: DataFrame + + Raises + ------ + ValueError + if any `keys` match neither a label nor a level""" + + @Appender(_shared_docs['_drop_labels_or_levels']) + def _drop_labels_or_levels(self, key, axis=0): + raise NotImplementedError() + # ---------------------------------------------------------------------- # Iteration diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index ca1bc2697fe37..31969378c60b4 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2777,10 +2777,10 @@ def is_in_obj(gpr): elif is_in_axis(gpr): # df.groupby('name') if gpr in obj: if validate: - obj._check_column_or_level_ambiguity(gpr) + obj._check_label_or_level_ambiguity(gpr) in_axis, name, gpr = True, gpr, obj[gpr] exclusions.append(name) - elif obj._is_index_reference(gpr): + elif obj._is_level_reference(gpr): in_axis, name, level, gpr = False, None, gpr, None else: raise KeyError(gpr) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 320e86c8de20f..c53569e7a3acc 
100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -671,8 +671,8 @@ def _maybe_restore_index_levels(self, result): for name, left_key, right_key in zip(self.join_names, self.left_on, self.right_on): - if self.orig_left._is_index_reference(left_key) and \ - self.orig_right._is_index_reference(right_key): + if self.orig_left._is_level_reference(left_key) and \ + self.orig_right._is_level_reference(right_key): names_to_restore.append(name) @@ -847,7 +847,7 @@ def _get_merge_keys(self): else: if rk is not None: right_keys.append( - right._get_column_or_level_values(rk)) + right._get_label_or_level_values(rk)) join_names.append(rk) else: # work-around for merge_asof(right_index=True) @@ -857,7 +857,7 @@ def _get_merge_keys(self): if not is_rkey(rk): if rk is not None: right_keys.append( - right._get_column_or_level_values(rk)) + right._get_label_or_level_values(rk)) else: # work-around for merge_asof(right_index=True) right_keys.append(right.index) @@ -870,7 +870,7 @@ def _get_merge_keys(self): else: right_keys.append(rk) if lk is not None: - left_keys.append(left._get_column_or_level_values(lk)) + left_keys.append(left._get_label_or_level_values(lk)) join_names.append(lk) else: # work-around for merge_asof(left_index=True) @@ -882,7 +882,7 @@ def _get_merge_keys(self): left_keys.append(k) join_names.append(None) else: - left_keys.append(left._get_column_or_level_values(k)) + left_keys.append(left._get_label_or_level_values(k)) join_names.append(k) if isinstance(self.right.index, MultiIndex): right_keys = [lev._values.take(lab) @@ -896,7 +896,7 @@ def _get_merge_keys(self): right_keys.append(k) join_names.append(None) else: - right_keys.append(right._get_column_or_level_values(k)) + right_keys.append(right._get_label_or_level_values(k)) join_names.append(k) if isinstance(self.left.index, MultiIndex): left_keys = [lev._values.take(lab) @@ -906,10 +906,10 @@ def _get_merge_keys(self): left_keys = [self.left.index.values] if left_drop: - 
self.left = self.left._drop_columns_or_levels(left_drop) + self.left = self.left._drop_labels_or_levels(left_drop) if right_drop: - self.right = self.right._drop_columns_or_levels(right_drop) + self.right = self.right._drop_labels_or_levels(right_drop) return left_keys, right_keys, join_names diff --git a/pandas/core/series.py b/pandas/core/series.py index 97f39a680c8c9..93606c976a686 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -410,6 +410,60 @@ def get_values(self): """ same as values (but handles sparseness conversions); is a view """ return self._data.get_values() + # ------------------------------------------------------------------------- + # Label or Level Combination Helpers + + @Appender(generic._shared_docs['_is_level_reference']) + def _is_level_reference(self, key, axis=0): + axis = self._get_axis_number(axis) + return (isinstance(key, compat.string_types) and + key in self.index.names) + + @Appender(generic._shared_docs['_is_label_reference']) + def _is_label_reference(self, key, axis=0): + axis = self._get_axis_number(axis) + return False + + @Appender(generic._shared_docs['_get_label_or_level_values']) + def _get_label_or_level_values(self, key, axis=0): + axis = self._get_axis_number(axis) + if self._is_level_reference(key, axis=axis): + values = self.index.get_level_values(key)._values + else: + raise KeyError(key) + + return values + + @Appender(generic._shared_docs['_drop_labels_or_levels']) + def _drop_labels_or_levels(self, keys, axis=0): + axis = self._get_axis_number(axis) + keys = com._maybe_make_list(keys) + + # Validate keys + invalid_keys = [k for k in keys + if not self._is_level_reference(k, axis=axis)] + + if invalid_keys: + raise ValueError(("The following keys are not valid index levels: " + "{invalid_keys}") + .format(axis=axis, + invalid_keys=invalid_keys)) + + # Compute levels and labels to drop + levels_to_drop = [k for k in keys + if self._is_level_reference(k, axis=axis)] + + # Perform copy upfront and then 
use inplace operations below. + # This ensures that we always perform exactly one copy. + # ``copy`` and/or ``inplace`` options could be added in the future. + dropped = self.copy() + + # Handle dropping index levels + if levels_to_drop: + dropped.reset_index(levels_to_drop, drop=True, inplace=True) + + return dropped + @property def asobject(self): """ diff --git a/pandas/tests/indexes/test_label_or_level_utils.py b/pandas/tests/indexes/test_label_or_level_utils.py new file mode 100644 index 0000000000000..0a5ac098d6277 --- /dev/null +++ b/pandas/tests/indexes/test_label_or_level_utils.py @@ -0,0 +1,378 @@ +import pytest +import pandas as pd +from pandas import compat +import pandas.util.testing as tm +from pandas.core.dtypes.missing import array_equivalent + + +class TestLabelOrLevelUtils(object): + """ + Test NDFrame utility methods used by operations that allow users to + specify a mixture of levels and labels + """ + + # Setup + # ===== + def setup_method(self): + self.df1 = pd.DataFrame({'L1': [1, 2, 3], + 'L2': [11, 12, 13], + 'L3': ['A', 'B', 'C']}) + + # Data preparation helpers + # ======================== + @staticmethod + def prepare_df(df, levels=None, axis=0): + if levels: + if isinstance(levels, compat.string_types): + levels = [levels] + df = df.set_index(levels) + + if axis == 1: + # Transpose so index levels become column levels + df = df.T + return df + + def prepare_df1(self, levels=None, axis=0): + """Return DataFrame with specified levels (list of any of 'L1', 'L2', + and 'L3'). 
Remaining keys are left as labels""" + return self.prepare_df(self.df1, levels=levels, axis=axis) + + def prepare_df_ambig(self, axis=0): + """Return DataFrame with levels 'L1' and 'L2' and + labels 'L1' and 'L3' """ + df = self.df1.set_index(['L1', 'L2']) + df['L1'] = df['L3'] + + if axis == 1: + df = df.T + + return df + + def prepare_df_duplabels(self, axis=0): + """Return DataFrame with level 'L1' and labels 'L2', 'L3', and 'L2' """ + df = self.df1.set_index(['L1']) + df = pd.concat([df, df['L2']], axis=1) + + if axis == 1: + df = df.T + + return df + + # Test is label/level reference + # ============================= + @staticmethod + def check_level_reference(frame, levels, axis): + for level in levels: + assert frame._is_level_reference(level, axis=axis) + assert not frame._is_label_reference(level, axis=axis) + assert frame._is_label_or_level_reference(level, axis=axis) + + @staticmethod + def check_label_reference(frame, labels, axis): + for label in labels: + assert frame._is_label_reference(label, axis=axis) + assert not frame._is_level_reference(label, axis=axis) + assert frame._is_label_or_level_reference(label, axis=axis) + + # DataFrame + # --------- + @pytest.mark.parametrize('axis', [0, 1]) + def test_is_level_or_label_reference_df_simple(self, axis): + + # df has no named levels on axis + df = self.prepare_df1(axis=axis) + self.check_label_reference(df, ['L1', 'L2', 'L3'], axis=axis) + + # Set L1 as level on axis + df = self.prepare_df1('L1', axis=axis) + self.check_level_reference(df, ['L1'], axis=axis) + self.check_label_reference(df, ['L2', 'L3'], axis=axis) + + # Set L1 and L2 as levels on axis + df = self.prepare_df1(['L1', 'L2'], axis=axis) + self.check_level_reference(df, ['L1', 'L2'], axis=axis) + self.check_label_reference(df, ['L3'], axis=axis) + + # Set L1, L2, and L3 as levels on axis + df = self.prepare_df1(['L1', 'L2', 'L3'], axis=axis) + self.check_level_reference(df, ['L1', 'L2', 'L3'], axis=axis) + + 
@pytest.mark.parametrize('axis', [0, 1]) + def test_is_level_reference_df_ambig(self, axis): + + df = self.prepare_df_ambig(axis=axis) + + # df has both an on-axis level and off-axis label named L1 + # Therefore L1 should reference the label, not the level + self.check_label_reference(df, ['L1'], axis=axis) + + # df has an on-axis level named L2 and it is not ambiguous + # Therefore L2 is an level reference + self.check_level_reference(df, ['L2'], axis=axis) + + # df has a column named L3 and it not an level reference + self.check_label_reference(df, ['L3'], axis=axis) + + # Series + # ------ + def test_is_level_reference_series_simple_axis0(self): + + # Make series with L1 as index + s = self.df1.set_index('L1').L2 + self.check_level_reference(s, ['L1'], axis=0) + assert not s._is_level_reference('L2') + + # Make series with L1 and L2 as index + s = self.df1.set_index(['L1', 'L2']).L3 + self.check_level_reference(s, ['L1', 'L2'], axis=0) + assert not s._is_level_reference('L3') + + def test_is_level_reference_series_axis1_error(self): + + # Make series with L1 as index + s = self.df1.set_index('L1').L2 + + with tm.assert_raises_regex(ValueError, "No axis named 1"): + s._is_level_reference('L1', axis=1) + + # Test _check_label_or_level_ambiguity_df + # ======================================= + + # DataFrame + # --------- + @pytest.mark.parametrize('axis', [0, 1]) + def test_check_label_or_level_ambiguity_df(self, axis): + + df = self.prepare_df_ambig(axis=axis) + + # df has both an on-axis level and off-axis label named L1 + # Therefore L1 is ambiguous + with tm.assert_produces_warning(FutureWarning, + clear=True, + check_stacklevel=False) as w: + + assert df._check_label_or_level_ambiguity('L1', axis=axis) + warning_msg = w[0].message.args[0] + if axis == 0: + assert warning_msg.startswith("'L1' is both an index level " + "and a column label") + else: + assert warning_msg.startswith("'L1' is both a column level " + "and an index label") + + # df has an on-axis 
level named L2 and it is not ambiguous + # No warning should be raised + with tm.assert_produces_warning(None): + assert not df._check_label_or_level_ambiguity('L2', axis=axis) + + # df has an off-axis label named L3 and it is not ambiguous + with tm.assert_produces_warning(None): + assert not df._is_level_reference('L3', axis=axis) + + # Series + # ------ + @pytest.mark.parametrize('axis', [0, 1]) + def test_check_label_or_level_ambiguity_series(self, axis): + + # A series has only one axis and references are never ambiguous, + # regardless of what axis is considered + + # Make series with L1 as index + s = self.df1.set_index('L1').L2 + with tm.assert_produces_warning(None): + assert not s._check_label_or_level_ambiguity('L1', axis=axis) + assert not s._check_label_or_level_ambiguity('L2', axis=axis) + + # Make series with L1 and L2 as index + s = self.df1.set_index(['L1', 'L2']).L3 + with tm.assert_produces_warning(None): + assert not s._check_label_or_level_ambiguity('L1', axis=axis) + assert not s._check_label_or_level_ambiguity('L2', axis=axis) + assert not s._check_label_or_level_ambiguity('L3', axis=axis) + + # Test _get_label_or_level_values + # =============================== + + # DataFrame + # --------- + @staticmethod + def check_labels(frame, labels, axis): + for label in labels: + if axis == 0: + expected = frame[label]._values + else: + expected = frame.loc[label]._values + + result = frame._get_label_or_level_values(label, axis=axis) + assert array_equivalent(expected, result) + + @staticmethod + def check_levels(frame, levels, axis): + for level in levels: + if axis == 0: + expected = frame.index.get_level_values(level=level)._values + else: + expected = (frame.columns + .get_level_values(level=level) + ._values) + + result = frame._get_label_or_level_values(level, axis=axis) + assert array_equivalent(expected, result) + + @pytest.mark.parametrize('axis', [0, 1]) + def test_get_label_or_level_values_df_simple(self, axis): + + # ### df has no named 
index levels ### + df = self.prepare_df1(axis=axis) + self.check_labels(df, ['L1', 'L2', 'L3'], axis=axis) + + # ### Set L1 as index level ### + df = self.prepare_df1('L1', axis=axis) + self.check_labels(df, ['L2', 'L3'], axis=axis) + self.check_levels(df, ['L1'], axis=axis) + + # ### Set L1 and L2 as index levels ### + df = self.prepare_df1(['L1', 'L2'], axis=axis) + self.check_labels(df, ['L3'], axis=axis) + self.check_levels(df, ['L1', 'L2'], axis=axis) + + # ### Set L1, L2, and L3 as index levels ### + df = self.prepare_df1(['L1', 'L2', 'L3'], axis=axis) + self.check_levels(df, ['L1', 'L2', 'L3'], axis=axis) + + @pytest.mark.parametrize('axis', [0, 1]) + def test_get_label_or_level_values_df_ambig(self, axis): + df = self.prepare_df_ambig(axis=axis) + + # df has both an on-axis level and off-axis label named L1 + # Therefore L1 is ambiguous but will default to label + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + self.check_labels(df, ['L1'], axis=axis) + + # df has an on-axis level named L2 and it is not ambiguous + with tm.assert_produces_warning(None): + self.check_levels(df, ['L2'], axis=axis) + + # df has an off-axis label named L3 and it is not ambiguous + with tm.assert_produces_warning(None): + self.check_labels(df, ['L3'], axis=axis) + + @pytest.mark.parametrize('axis', [0, 1]) + def test_get_label_or_level_values_df_duplabels(self, axis): + + df = self.prepare_df_duplabels(axis=axis) + + # df has unambiguous level 'L1' + self.check_levels(df, ['L1'], axis=axis) + + # df has unique label 'L3' + self.check_labels(df, ['L3'], axis=axis) + + # df has duplicate labels 'L2' + if axis == 0: + expected_msg = "The column label 'L2' is not unique" + else: + expected_msg = "The index label 'L2' is not unique" + + with tm.assert_raises_regex(ValueError, expected_msg): + self.check_labels(df, ['L2'], axis=axis) + + # Series + # ------ + def test_get_label_or_level_values_series_axis0(self): + + # Make series with L1 as index + s = 
self.df1.set_index('L1').L2 + self.check_levels(s, ['L1'], axis=0) + + # Make series with L1 and L2 as index + s = self.df1.set_index(['L1', 'L2']).L3 + self.check_levels(s, ['L1', 'L2'], axis=0) + + def test_get_label_or_level_values_series_axis1_error(self): + + # Make series with L1 as index + s = self.df1.set_index('L1').L2 + + with tm.assert_raises_regex(ValueError, "No axis named 1"): + s._get_label_or_level_values('L1', axis=1) + + # Test _drop_labels_or_levels + # =========================== + @staticmethod + def check_labels_dropped(frame, labels, axis): + for label in labels: + df_dropped = frame._drop_labels_or_levels(label, axis=axis) + + if axis == 0: + assert label in frame.columns + assert label not in df_dropped.columns + else: + assert label in frame.index + assert label not in df_dropped.index + + @staticmethod + def check_levels_dropped(frame, levels, axis): + for level in levels: + df_dropped = frame._drop_labels_or_levels(level, axis=axis) + + if axis == 0: + assert level in frame.index.names + assert level not in df_dropped.index.names + else: + assert level in frame.columns.names + assert level not in df_dropped.columns.names + + # DataFrame + # --------- + @pytest.mark.parametrize('axis', [0, 1]) + def test_drop_labels_or_levels_df(self, axis): + + # ### df has no named index levels ### + df = self.prepare_df1(axis=axis) + self.check_labels_dropped(df, ['L1', 'L2', 'L3'], axis=axis) + + with tm.assert_raises_regex(ValueError, "not valid labels or levels"): + df._drop_labels_or_levels('L4', axis=axis) + + # ### Set L1 as index level ### + df = self.prepare_df1('L1', axis=axis) + self.check_labels_dropped(df, ['L2', 'L3'], axis=axis) + self.check_levels_dropped(df, ['L1'], axis=axis) + + with tm.assert_raises_regex(ValueError, "not valid labels or levels"): + df._drop_labels_or_levels('L4', axis=axis) + + # ### Set L1 and L2 as index levels ### + df = self.prepare_df1(['L1', 'L2'], axis=axis) + self.check_labels_dropped(df, ['L3'], axis=axis) 
+ self.check_levels_dropped(df, ['L1', 'L2'], axis=axis) + + with tm.assert_raises_regex(ValueError, "not valid labels or levels"): + df._drop_labels_or_levels('L4', axis=axis) + + # ### Set L1, L2, and L3 as index levels ### + df = self.prepare_df1(['L1', 'L2', 'L3'], axis=axis) + self.check_levels_dropped(df, ['L1', 'L2', 'L3'], axis=axis) + + with tm.assert_raises_regex(ValueError, "not valid labels or levels"): + df._drop_labels_or_levels('L4', axis=axis) + + # Series + # ------ + def test_drop_labels_or_levels_series(self): + + # Make series with L1 as index + s = self.df1.set_index('L1').L2 + self.check_levels_dropped(s, ['L1'], axis=0) + + with tm.assert_raises_regex(ValueError, "not valid index levels"): + s._drop_labels_or_levels('L4', axis=0) + + # Make series with L1 and L2 as index + s = self.df1.set_index(['L1', 'L2']).L3 + self.check_levels_dropped(s, ['L1', 'L2'], axis=0) + + with tm.assert_raises_regex(ValueError, "not valid index levels"): + s._drop_labels_or_levels('L4', axis=0) From 7da39aa42e689a430db23d89d8490aad4595b734 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Wed, 11 Oct 2017 19:19:01 -0400 Subject: [PATCH 13/34] PEP8 --- pandas/core/frame.py | 20 ++++++++++---------- pandas/core/generic.py | 33 +++++++++++++++------------------ 2 files changed, 25 insertions(+), 28 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1e61f34b5197d..e8878db66507f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -139,16 +139,16 @@ * inner: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys on : label or list - Column or index level names to join on. These must be found in both - DataFrames. If on is None and not merging on indexes then this defaults to + Column or index level names to join on. These must be found in both + DataFrames. If on is None and not merging on indexes then this defaults to the intersection of the columns in both DataFrames. 
left_on : label or list, or array-like - Column or index level names to join on in the left DataFrame. Can also - be a vector or list of vectors of the length of the left DataFrame. + Column or index level names to join on in the left DataFrame. Can also + be a vector or list of vectors of the length of the left DataFrame. These vectors are treated as though they are columns. right_on : label or list, or array-like - Column or index level names to join on in the right DataFrame. Can also - be a vector or list of vectors of the length of the right DataFrame. + Column or index level names to join on in the right DataFrame. Can also + be a vector or list of vectors of the length of the right DataFrame. These vectors are treated as though they are columns. left_index : boolean, default False Use the index from the left DataFrame as the join key(s). If it is a @@ -2195,11 +2195,11 @@ def raise_warning(): # Build an informative and grammatical warning level_article, level_type = (('an', 'index') - if axis==0 else + if axis == 0 else ('a', 'column')) label_article, label_type = (('a', 'column') - if axis==0 else + if axis == 0 else ('an', 'index')) warnings.warn( @@ -2224,8 +2224,8 @@ def raise_warning(): return False else: if (isinstance(key, compat.string_types) and - key in self.index and - key in self.columns.names): + key in self.index and + key in self.columns.names): raise_warning() return True diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7076938a10a8e..96f841dd76c1f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1033,7 +1033,6 @@ def equals(self, other): def _is_level_reference(self, key, axis=0): raise NotImplementedError() - _shared_docs['_is_label_reference'] = """ Test whether a key is a label reference for a given axis. 
@@ -1057,11 +1056,10 @@ def _is_level_reference(self, key, axis=0): def _is_label_reference(self, key, axis=0): raise NotImplementedError() - _shared_docs['_is_label_or_level_reference'] = """ Test whether a key is a label or level reference for a given axis. - To be considered either a label or a level reference, `key` must be a + To be considered either a label or a level reference, `key` must be a string that: - (axis=0): Matches a column label or an index level - (axis=1): Matches an index label or a column level @@ -1082,7 +1080,6 @@ def _is_label_or_level_reference(self, key, axis=0): return (self._is_level_reference(key, axis=axis) or self._is_label_reference(key, axis=axis)) - _shared_docs['_check_label_or_level_ambiguity'] = """ Check whether `key` matches both a level of the input `axis` and a label of the other axis and raise a ``FutureWarning`` if this is the @@ -1116,17 +1113,17 @@ def _check_label_or_level_ambiguity(self, key, axis=0): _shared_docs['_get_label_or_level_values'] = """ - Return a 1-D array of values associated with `key`, a label or level + Return a 1-D array of values associated with `key`, a label or level from the given `axis`. Retrieval logic: - - (axis=0): Return column values if `key` matches a column label. - Otherwise return index level values if `key` matches an index + - (axis=0): Return column values if `key` matches a column label. + Otherwise return index level values if `key` matches an index level. - (axis=1): Return row values if `key` matches an index label. - Otherwise return column level values if 'key' matches a column - level - + Otherwise return column level values if 'key' matches a column + level + Parameters ---------- key: str @@ -1151,25 +1148,25 @@ def _get_label_or_level_values(self, key, axis=0): _shared_docs['_drop_labels_or_levels'] = """ - Drop labels and/or levels for the given `axis`. - + Drop labels and/or levels for the given `axis`. 
+ For each key in `keys`: - - (axis=0): If key matches a column label then drop the column. - Otherwise if key matches an index level then drop the level. - - (axis=1): If key matches an index label then drop the row. + - (axis=0): If key matches a column label then drop the column. + Otherwise if key matches an index level then drop the level. + - (axis=1): If key matches an index label then drop the row. Otherwise if key matches a column level then drop the level. - + Parameters ---------- keys: str or list of str labels or levels to drop axis: int, default 0 Axis that levels are associated with (0 for index, 1 for columns) - + Returns ------- dropped: DataFrame - + Raises ------ ValueError From f5a16ff4c20a6d37281c921f78d8df466d6131b4 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Thu, 12 Oct 2017 13:15:18 -0400 Subject: [PATCH 14/34] Revert accidental change to merging.rst --- doc/source/merging.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/merging.rst b/doc/source/merging.rst index b6da7e8b0c002..ef8d073019f9a 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -504,7 +504,7 @@ the data in DataFrame. See the :ref:`cookbook` for some advanced strategies. Users who are familiar with SQL but new to pandas might be interested in a -:ref:`comparison with SQL`. +:ref:`comparison with SQL`. pandas provides a single function, ``merge``, as the entry point for all standard database join operations between DataFrame objects: From aa099eafd186f1a3ad06fac1f192f75d551572f3 Mon Sep 17 00:00:00 2001 From: "Jon M. 
Mease" Date: Fri, 13 Oct 2017 13:22:47 -0400 Subject: [PATCH 15/34] Use fixtures for new TestMergeColumnAndIndex tests --- pandas/tests/reshape/test_merge.py | 90 ++++++++++++++---------------- 1 file changed, 42 insertions(+), 48 deletions(-) diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index 4b487f10ea5cb..64bdd1e0fe5ac 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -1363,36 +1363,48 @@ def f(): class TestMergeColumnAndIndex(object): # GH14355 - @staticmethod - def df_left_with_index(levels): - """ Construct left test DataFrame with specified levels - (any of 'outer', 'inner', and 'v1')""" - res = DataFrame(dict( + def setup_method(self): + # Construct test DataFrames + self.df1 = DataFrame(dict( outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2], v1=np.linspace(0, 1, 11) )) + self.df2 = DataFrame(dict( + outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], + inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], + v2=np.linspace(10, 11, 12) + )) + + @pytest.fixture(params=[[], ['outer'], ['outer', 'inner']]) + def left_df(self, request): + """ Construct left test DataFrame with specified levels + (any of 'outer', 'inner', and 'v1')""" + levels = request.param + res = self.df1 + if levels: res = res.set_index(levels) return res - @staticmethod - def df_right_with_index(levels): + @pytest.fixture(params=[[], ['outer'], ['outer', 'inner']]) + def right_df(self, request): """ Construct right test DataFrame with specified levels (any of 'outer', 'inner', and 'v2')""" - res = DataFrame(dict( - outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], - inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], - v2=np.linspace(10, 11, 12) - )) + levels = request.param + res = self.df2 if levels: res = res.set_index(levels) return res + @pytest.fixture(params=['inner', 'left', 'right', 'outer']) + def how(self, request): + return request.param + @staticmethod def compute_expected(df_left, df_right, 
on=None, left_on=None, right_on=None, how=None): @@ -1468,85 +1480,67 @@ def compute_expected(df_left, df_right, return expected - @pytest.mark.parametrize('left_levels', - [[], ['outer'], ['outer', 'inner']]) - @pytest.mark.parametrize('right_levels', - [[], ['outer'], ['outer', 'inner']]) @pytest.mark.parametrize('on', [['outer'], ['inner'], ['outer', 'inner'], ['inner', 'outer']]) - @pytest.mark.parametrize('how', ['inner', 'left', 'right', 'outer']) def test_merge_indexes_and_columns_on( - self, left_levels, right_levels, on, how): - - # Construct test DataFrames - df_left = self.df_left_with_index(left_levels) - df_right = self.df_right_with_index(right_levels) + self, left_df, right_df, on, how): # Construct expected result - expected = self.compute_expected(df_left, df_right, on=on, how=how) + expected = self.compute_expected(left_df, right_df, on=on, how=how) # Perform merge - result = df_left.merge(df_right, on=on, how=how) + result = left_df.merge(right_df, on=on, how=how) assert_frame_equal(result, expected, check_like=True) - @pytest.mark.parametrize('left_levels', - [[], ['outer'], ['outer', 'inner']]) - @pytest.mark.parametrize('right_levels', - [[], ['outer'], ['outer', 'inner']]) @pytest.mark.parametrize('left_on,right_on', [(['outer'], ['outer']), (['inner'], ['inner']), (['outer', 'inner'], ['outer', 'inner']), (['inner', 'outer'], ['inner', 'outer'])]) - @pytest.mark.parametrize('how', ['inner', 'left', 'right', 'outer']) def test_merge_indexes_and_columns_lefton_righton( - self, left_levels, right_levels, left_on, right_on, how): - - # Construct test DataFrames - df_left = self.df_left_with_index(left_levels) - df_right = self.df_right_with_index(right_levels) + self, left_df, right_df, left_on, right_on, how): # Construct expected result - expected = self.compute_expected(df_left, df_right, + expected = self.compute_expected(left_df, right_df, left_on=left_on, right_on=right_on, how=how) # Perform merge - result = df_left.merge(df_right, + 
result = left_df.merge(right_df, left_on=left_on, right_on=right_on, how=how) assert_frame_equal(result, expected, check_like=True) def test_merge_index_column_precedence(self): - # Construct df_left with both an index and a column named 'outer'. + # Construct left_df with both an index and a column named 'outer'. # We make this 'outer' column equal to the 'inner' column so that we # can verify that the correct values are used by the merge operation - df_left = self.df_left_with_index(['outer']) - df_left['outer'] = df_left['inner'] + left_df = self.df1.set_index('outer') + left_df['outer'] = left_df['inner'] - # Construct df_right with an index level named 'outer' - df_right = self.df_right_with_index(['outer']) + # Construct right_df with an index level named 'outer' + right_df = self.df2.set_index('outer') # Construct expected result. - # The 'outer' column from df_left is chosen and the resulting + # The 'outer' column from left_df is chosen and the resulting # frame has no index levels - expected = (df_left.reset_index(level='outer', drop=True) - .merge(df_right.reset_index(), on=['outer', 'inner'])) + expected = (left_df.reset_index(level='outer', drop=True) + .merge(right_df.reset_index(), on=['outer', 'inner'])) - # Merge df_left and df_right on 'outer' and 'inner' - # 'outer' for df_left should refer to the 'outer' column, not the + # Merge left_df and right_df on 'outer' and 'inner' + # 'outer' for left_df should refer to the 'outer' column, not the # 'outer' index level and a FutureWarning should be raised with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df_left.merge(df_right, on=['outer', 'inner']) + result = left_df.merge(right_df, on=['outer', 'inner']) # Check results assert_frame_equal(result, expected) # Perform the same using the left_on and right_on parameters with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df_left.merge(df_right, + result = left_df.merge(right_df, 
left_on=['outer', 'inner'], right_on=['outer', 'inner']) From e9f02b1c28bbde353a66a276ef8b6a3bedb59040 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Wed, 1 Nov 2017 19:29:54 -0400 Subject: [PATCH 16/34] Update documentation for a 0.22 release --- doc/source/merging.rst | 2 +- doc/source/whatsnew/v0.21.0.txt | 30 ------------------------------ doc/source/whatsnew/v0.22.0.txt | 30 ++++++++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 4bf5effccd7fa..1fe592f85e4f5 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -1128,7 +1128,7 @@ This is not Implemented via ``join`` at-the-moment, however it can be done using Merging on a combination of columns and index levels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 0.21 +.. versionadded:: 0.22 Strings passed as the ``on``, ``left_on``, and ``right_on`` parameters may refer to either column names or index level names. This enables merging diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 7ac4df670e8ba..4c460eeb85b82 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -287,36 +287,6 @@ as in :meth:`DataFrame.rename`. Categories (2, int64): [0, 1] -.. _whatsnew_0210.enhancements.merge_on_columns_and_levels: - -Merging on a combination of columns and index levels -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Strings passed to :meth:`DataFrame.merge` as the ``on``, ``left_on``, and ``right_on`` -parameters may now refer to either column names or index level names. This enables -merging ``DataFrame`` instances on a combination of index levels and columns -without resetting indexes. See the :ref:`Merge on columns and levels -` documentation section. - -.. 
ipython:: python - - left_index = pd.Index(['K0', 'K0', 'K1', 'K2'], name='key1') - - left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], - 'B': ['B0', 'B1', 'B2', 'B3'], - 'key2': ['K0', 'K1', 'K0', 'K1']}, - index=left_index) - - right_index = pd.Index(['K0', 'K1', 'K2', 'K2'], name='key1') - - right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3'], - 'key2': ['K0', 'K0', 'K0', 'K1']}, - index=right_index) - - left.merge(right, on=['key1', 'key2']) - - .. _whatsnew_0210.enhancements.other: Other Enhancements diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index c41da4d67afe5..d43a94bf80a16 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -17,6 +17,36 @@ New features - - +.. _whatsnew_0220.enhancements.merge_on_columns_and_levels: + +Merging on a combination of columns and index levels +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Strings passed to :meth:`DataFrame.merge` as the ``on``, ``left_on``, and ``right_on`` +parameters may now refer to either column names or index level names. This enables +merging ``DataFrame`` instances on a combination of index levels and columns +without resetting indexes. See the :ref:`Merge on columns and levels +` documentation section. + +.. ipython:: python + + left_index = pd.Index(['K0', 'K0', 'K1', 'K2'], name='key1') + + left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], + 'B': ['B0', 'B1', 'B2', 'B3'], + 'key2': ['K0', 'K1', 'K0', 'K1']}, + index=left_index) + + right_index = pd.Index(['K0', 'K1', 'K2', 'K2'], name='key1') + + right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], + 'D': ['D0', 'D1', 'D2', 'D3'], + 'key2': ['K0', 'K0', 'K0', 'K1']}, + index=right_index) + + left.merge(right, on=['key1', 'key2']) + + .. _whatsnew_0220.enhancements.other: Other Enhancements From e029f7b26e60b3585d1cf73b74acc1dcf43452d1 Mon Sep 17 00:00:00 2001 From: "Jon M. 
Mease" Date: Mon, 6 Nov 2017 09:18:09 -0500 Subject: [PATCH 17/34] Documentation updates --- doc/source/merging.rst | 4 ++++ doc/source/whatsnew/v0.22.0.txt | 8 ++++---- pandas/core/frame.py | 5 +++++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 1fe592f85e4f5..86d2ec2254057 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -565,6 +565,10 @@ standard database join operations between DataFrame objects: .. versionadded:: 0.21.0 +.. note:: + + Support for specifying index levels as the ``on``, ``left_on``, and + ``right_on`` parameters was added in version 0.22.0. The return type will be the same as ``left``. If ``left`` is a ``DataFrame`` and ``right`` is a subclass of DataFrame, the return type will still be diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index d43a94bf80a16..8d5933cffc498 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -23,10 +23,10 @@ Merging on a combination of columns and index levels ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Strings passed to :meth:`DataFrame.merge` as the ``on``, ``left_on``, and ``right_on`` -parameters may now refer to either column names or index level names. This enables -merging ``DataFrame`` instances on a combination of index levels and columns -without resetting indexes. See the :ref:`Merge on columns and levels -` documentation section. +parameters may now refer to either column names or index level names (:issue:`14355`). +This enables merging ``DataFrame`` instances on a combination of index levels +and columns without resetting indexes. See the :ref:`Merge on columns and +levels ` documentation section. .. ipython:: python diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8379e9f31e3d8..b6df791c2b071 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -196,6 +196,11 @@ .. 
versionadded:: 0.21.0 +Note +---- +Support for specifying index levels as the ``on``, ``left_on``, and +``right_on`` parameters was added in version 0.22.0. + Examples -------- From fdddbd392c694ec74d60905899f7415d4c7d44c8 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Mon, 6 Nov 2017 09:21:41 -0500 Subject: [PATCH 18/34] Moved test_label_or_level_utils to pandas/tests/generic --- pandas/tests/{indexes => generic}/test_label_or_level_utils.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pandas/tests/{indexes => generic}/test_label_or_level_utils.py (100%) diff --git a/pandas/tests/indexes/test_label_or_level_utils.py b/pandas/tests/generic/test_label_or_level_utils.py similarity index 100% rename from pandas/tests/indexes/test_label_or_level_utils.py rename to pandas/tests/generic/test_label_or_level_utils.py From 89061b9f25953d8569bd0e4af8134467edec3b75 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Mon, 6 Nov 2017 11:57:38 -0500 Subject: [PATCH 19/34] Refactored level_or_level test cases to use fixtures --- .../generic/test_label_or_level_utils.py | 680 +++++++++--------- 1 file changed, 335 insertions(+), 345 deletions(-) diff --git a/pandas/tests/generic/test_label_or_level_utils.py b/pandas/tests/generic/test_label_or_level_utils.py index 0a5ac098d6277..96b08c7e62ad3 100644 --- a/pandas/tests/generic/test_label_or_level_utils.py +++ b/pandas/tests/generic/test_label_or_level_utils.py @@ -1,378 +1,368 @@ import pytest import pandas as pd -from pandas import compat import pandas.util.testing as tm from pandas.core.dtypes.missing import array_equivalent -class TestLabelOrLevelUtils(object): - """ - Test NDFrame utility methods used by operations that allow users to - specify a mixture of levels and labels - """ - - # Setup - # ===== - def setup_method(self): - self.df1 = pd.DataFrame({'L1': [1, 2, 3], - 'L2': [11, 12, 13], - 'L3': ['A', 'B', 'C']}) - - # Data preparation helpers - # ======================== - @staticmethod - def 
prepare_df(df, levels=None, axis=0): - if levels: - if isinstance(levels, compat.string_types): - levels = [levels] - df = df.set_index(levels) - - if axis == 1: - # Transpose so index levels become column levels - df = df.T - return df - - def prepare_df1(self, levels=None, axis=0): - """Return DataFrame with specified levels (list of any of 'L1', 'L2', - and 'L3'). Remaining keys are left as labels""" - return self.prepare_df(self.df1, levels=levels, axis=axis) - - def prepare_df_ambig(self, axis=0): - """Return DataFrame with levels 'L1' and 'L2' and - labels 'L1' and 'L3' """ - df = self.df1.set_index(['L1', 'L2']) - df['L1'] = df['L3'] - - if axis == 1: - df = df.T - - return df - - def prepare_df_duplabels(self, axis=0): - """Return DataFrame with level 'L1' and labels 'L2', 'L3', and 'L2' """ - df = self.df1.set_index(['L1']) - df = pd.concat([df, df['L2']], axis=1) - - if axis == 1: - df = df.T - - return df - - # Test is label/level reference - # ============================= - @staticmethod - def check_level_reference(frame, levels, axis): - for level in levels: - assert frame._is_level_reference(level, axis=axis) - assert not frame._is_label_reference(level, axis=axis) - assert frame._is_label_or_level_reference(level, axis=axis) - - @staticmethod - def check_label_reference(frame, labels, axis): - for label in labels: - assert frame._is_label_reference(label, axis=axis) - assert not frame._is_level_reference(label, axis=axis) - assert frame._is_label_or_level_reference(label, axis=axis) - - # DataFrame - # --------- - @pytest.mark.parametrize('axis', [0, 1]) - def test_is_level_or_label_reference_df_simple(self, axis): - - # df has no named levels on axis - df = self.prepare_df1(axis=axis) - self.check_label_reference(df, ['L1', 'L2', 'L3'], axis=axis) - - # Set L1 as level on axis - df = self.prepare_df1('L1', axis=axis) - self.check_level_reference(df, ['L1'], axis=axis) - self.check_label_reference(df, ['L2', 'L3'], axis=axis) - - # Set L1 and L2 as 
levels on axis - df = self.prepare_df1(['L1', 'L2'], axis=axis) - self.check_level_reference(df, ['L1', 'L2'], axis=axis) - self.check_label_reference(df, ['L3'], axis=axis) - - # Set L1, L2, and L3 as levels on axis - df = self.prepare_df1(['L1', 'L2', 'L3'], axis=axis) - self.check_level_reference(df, ['L1', 'L2', 'L3'], axis=axis) - - @pytest.mark.parametrize('axis', [0, 1]) - def test_is_level_reference_df_ambig(self, axis): - - df = self.prepare_df_ambig(axis=axis) - - # df has both an on-axis level and off-axis label named L1 - # Therefore L1 should reference the label, not the level - self.check_label_reference(df, ['L1'], axis=axis) - - # df has an on-axis level named L2 and it is not ambiguous - # Therefore L2 is an level reference - self.check_level_reference(df, ['L2'], axis=axis) - - # df has a column named L3 and it not an level reference - self.check_label_reference(df, ['L3'], axis=axis) - - # Series - # ------ - def test_is_level_reference_series_simple_axis0(self): - - # Make series with L1 as index - s = self.df1.set_index('L1').L2 - self.check_level_reference(s, ['L1'], axis=0) - assert not s._is_level_reference('L2') - - # Make series with L1 and L2 as index - s = self.df1.set_index(['L1', 'L2']).L3 - self.check_level_reference(s, ['L1', 'L2'], axis=0) - assert not s._is_level_reference('L3') - - def test_is_level_reference_series_axis1_error(self): - - # Make series with L1 as index - s = self.df1.set_index('L1').L2 - - with tm.assert_raises_regex(ValueError, "No axis named 1"): - s._is_level_reference('L1', axis=1) - - # Test _check_label_or_level_ambiguity_df - # ======================================= - - # DataFrame - # --------- - @pytest.mark.parametrize('axis', [0, 1]) - def test_check_label_or_level_ambiguity_df(self, axis): - - df = self.prepare_df_ambig(axis=axis) - - # df has both an on-axis level and off-axis label named L1 - # Therefore L1 is ambiguous - with tm.assert_produces_warning(FutureWarning, - clear=True, - 
check_stacklevel=False) as w: - - assert df._check_label_or_level_ambiguity('L1', axis=axis) - warning_msg = w[0].message.args[0] - if axis == 0: - assert warning_msg.startswith("'L1' is both an index level " - "and a column label") - else: - assert warning_msg.startswith("'L1' is both a column level " - "and an index label") - - # df has an on-axis level named L2 and it is not ambiguous - # No warning should be raised - with tm.assert_produces_warning(None): - assert not df._check_label_or_level_ambiguity('L2', axis=axis) - - # df has an off-axis label named L3 and it is not ambiguous - with tm.assert_produces_warning(None): - assert not df._is_level_reference('L3', axis=axis) - - # Series - # ------ - @pytest.mark.parametrize('axis', [0, 1]) - def test_check_label_or_level_ambiguity_series(self, axis): - - # A series has only one axis and references are never ambiguous, - # regardless of what axis is considered - - # Make series with L1 as index - s = self.df1.set_index('L1').L2 - with tm.assert_produces_warning(None): - assert not s._check_label_or_level_ambiguity('L1', axis=axis) - assert not s._check_label_or_level_ambiguity('L2', axis=axis) - - # Make series with L1 and L2 as index - s = self.df1.set_index(['L1', 'L2']).L3 - with tm.assert_produces_warning(None): - assert not s._check_label_or_level_ambiguity('L1', axis=axis) - assert not s._check_label_or_level_ambiguity('L2', axis=axis) - assert not s._check_label_or_level_ambiguity('L3', axis=axis) - - # Test _get_label_or_level_values - # =============================== - - # DataFrame - # --------- - @staticmethod - def check_labels(frame, labels, axis): - for label in labels: - if axis == 0: - expected = frame[label]._values - else: - expected = frame.loc[label]._values - - result = frame._get_label_or_level_values(label, axis=axis) - assert array_equivalent(expected, result) - - @staticmethod - def check_levels(frame, levels, axis): - for level in levels: - if axis == 0: - expected = 
frame.index.get_level_values(level=level)._values - else: - expected = (frame.columns - .get_level_values(level=level) - ._values) - - result = frame._get_label_or_level_values(level, axis=axis) - assert array_equivalent(expected, result) - - @pytest.mark.parametrize('axis', [0, 1]) - def test_get_label_or_level_values_df_simple(self, axis): - - # ### df has no named index levels ### - df = self.prepare_df1(axis=axis) - self.check_labels(df, ['L1', 'L2', 'L3'], axis=axis) - - # ### Set L1 as index level ### - df = self.prepare_df1('L1', axis=axis) - self.check_labels(df, ['L2', 'L3'], axis=axis) - self.check_levels(df, ['L1'], axis=axis) - - # ### Set L1 and L2 as index levels ### - df = self.prepare_df1(['L1', 'L2'], axis=axis) - self.check_labels(df, ['L3'], axis=axis) - self.check_levels(df, ['L1', 'L2'], axis=axis) - - # ### Set L1, L2, and L3 as index levels ### - df = self.prepare_df1(['L1', 'L2', 'L3'], axis=axis) - self.check_levels(df, ['L1', 'L2', 'L3'], axis=axis) - - @pytest.mark.parametrize('axis', [0, 1]) - def test_get_label_or_level_values_df_ambig(self, axis): - df = self.prepare_df_ambig(axis=axis) - - # df has both an on-axis level and off-axis label named L1 - # Therefore L1 is ambiguous but will default to label - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - self.check_labels(df, ['L1'], axis=axis) - - # df has an on-axis level named L2 and it is not ambiguous - with tm.assert_produces_warning(None): - self.check_levels(df, ['L2'], axis=axis) - - # df has an off-axis label named L3 and it is not ambiguous - with tm.assert_produces_warning(None): - self.check_labels(df, ['L3'], axis=axis) - - @pytest.mark.parametrize('axis', [0, 1]) - def test_get_label_or_level_values_df_duplabels(self, axis): - - df = self.prepare_df_duplabels(axis=axis) - - # df has unambiguous level 'L1' - self.check_levels(df, ['L1'], axis=axis) - - # df has unique label 'L3' - self.check_labels(df, ['L3'], axis=axis) - - # df has duplicate 
labels 'L2' +# Fixtures +# ======== +@pytest.fixture +def df(): + """DataFrame with columns 'L1', 'L2', and 'L3' """ + return pd.DataFrame({'L1': [1, 2, 3], + 'L2': [11, 12, 13], + 'L3': ['A', 'B', 'C']}) + + +@pytest.fixture(params=[[], ['L1'], ['L1', 'L2'], ['L1', 'L2', 'L3']]) +def df_levels(request, df): + """DataFrame with columns or index levels 'L1', 'L2', and 'L3' """ + levels = request.param + + if levels: + df = df.set_index(levels) + + return df + + +@pytest.fixture +def df_ambig(df): + """DataFrame with levels 'L1' and 'L2' and labels 'L1' and 'L3' """ + df = df.set_index(['L1', 'L2']) + + df['L1'] = df['L3'] + + return df + + +@pytest.fixture +def df_duplabels(df): + """DataFrame with level 'L1' and labels 'L2', 'L3', and 'L2' """ + df = df.set_index(['L1']) + df = pd.concat([df, df['L2']], axis=1) + + return df + + +# Test is label/level reference +# ============================= +def get_labels_levels(df_levels): + expected_labels = list(df_levels.columns) + expected_levels = [name for name in df_levels.index.names + if name is not None] + return expected_labels, expected_levels + + +def check_label_reference(frame, labels, axis): + for label in labels: + assert frame._is_label_reference(label, axis=axis) + assert not frame._is_level_reference(label, axis=axis) + assert frame._is_label_or_level_reference(label, axis=axis) + + +def check_level_reference(frame, levels, axis): + for level in levels: + assert frame._is_level_reference(level, axis=axis) + assert not frame._is_label_reference(level, axis=axis) + assert frame._is_label_or_level_reference(level, axis=axis) + + +# DataFrame +# --------- +@pytest.mark.parametrize('axis', [0, 1]) +def test_is_level_or_label_reference_df_simple(df_levels, axis): + + # Compute expected labels and levels + expected_labels, expected_levels = get_labels_levels(df_levels) + + # Transpose frame if axis == 1 + if axis == 1: + df_levels = df_levels.T + + # Perform checks + check_level_reference(df_levels, 
expected_levels, axis=axis) + check_label_reference(df_levels, expected_labels, axis=axis) + + +@pytest.mark.parametrize('axis', [0, 1]) +def test_is_level_reference_df_ambig(df_ambig, axis): + + # Transpose frame if axis == 1 + if axis == 1: + df_ambig = df_ambig.T + + # df has both an on-axis level and off-axis label named L1 + # Therefore L1 should reference the label, not the level + check_label_reference(df_ambig, ['L1'], axis=axis) + + # df has an on-axis level named L2 and it is not ambiguous + # Therefore L2 is an level reference + check_level_reference(df_ambig, ['L2'], axis=axis) + + # df has a column named L3 and it not an level reference + check_label_reference(df_ambig, ['L3'], axis=axis) + + +# Series +# ------ +def test_is_level_reference_series_simple_axis0(df): + + # Make series with L1 as index + s = df.set_index('L1').L2 + check_level_reference(s, ['L1'], axis=0) + assert not s._is_level_reference('L2') + + # Make series with L1 and L2 as index + s = df.set_index(['L1', 'L2']).L3 + check_level_reference(s, ['L1', 'L2'], axis=0) + assert not s._is_level_reference('L3') + + +def test_is_level_reference_series_axis1_error(df): + + # Make series with L1 as index + s = df.set_index('L1').L2 + + with tm.assert_raises_regex(ValueError, "No axis named 1"): + s._is_level_reference('L1', axis=1) + + +# Test _check_label_or_level_ambiguity_df +# ======================================= + +# DataFrame +# --------- +@pytest.mark.parametrize('axis', [0, 1]) +def test_check_label_or_level_ambiguity_df(df_ambig, axis): + + # Transpose frame if axis == 1 + if axis == 1: + df_ambig = df_ambig.T + + # df_ambig has both an on-axis level and off-axis label named L1 + # Therefore L1 is ambiguous + with tm.assert_produces_warning(FutureWarning, + clear=True, + check_stacklevel=False) as w: + + assert df_ambig._check_label_or_level_ambiguity('L1', axis=axis) + warning_msg = w[0].message.args[0] + if axis == 0: + assert warning_msg.startswith("'L1' is both an index level 
" + "and a column label") + else: + assert warning_msg.startswith("'L1' is both a column level " + "and an index label") + + # df_ambig has an on-axis level named L2 and it is not ambiguous + # No warning should be raised + with tm.assert_produces_warning(None): + assert not df_ambig._check_label_or_level_ambiguity('L2', axis=axis) + + # df_ambig has an off-axis label named L3 and it is not ambiguous + with tm.assert_produces_warning(None): + assert not df_ambig._is_level_reference('L3', axis=axis) + + +# Series +# ------ +def test_check_label_or_level_ambiguity_series(df): + + # A series has no columns and therefore references are never ambiguous + + # Make series with L1 as index + s = df.set_index('L1').L2 + with tm.assert_produces_warning(None): + assert not s._check_label_or_level_ambiguity('L1', axis=0) + assert not s._check_label_or_level_ambiguity('L2', axis=0) + + # Make series with L1 and L2 as index + s = df.set_index(['L1', 'L2']).L3 + with tm.assert_produces_warning(None): + assert not s._check_label_or_level_ambiguity('L1', axis=0) + assert not s._check_label_or_level_ambiguity('L2', axis=0) + assert not s._check_label_or_level_ambiguity('L3', axis=0) + + +def test_check_label_or_level_ambiguity_series_axis1_error(df): + + # Make series with L1 as index + s = df.set_index('L1').L2 + + with tm.assert_raises_regex(ValueError, "No axis named 1"): + s._check_label_or_level_ambiguity('L1', axis=1) + + +# Test _get_label_or_level_values +# =============================== +def check_label_values(frame, labels, axis): + for label in labels: if axis == 0: - expected_msg = "The column label 'L2' is not unique" + expected = frame[label]._values else: - expected_msg = "The index label 'L2' is not unique" + expected = frame.loc[label]._values + + result = frame._get_label_or_level_values(label, axis=axis) + assert array_equivalent(expected, result) + + +def check_level_values(frame, levels, axis): + for level in levels: + if axis == 0: + expected = 
frame.index.get_level_values(level=level)._values + else: + expected = (frame.columns + .get_level_values(level=level) + ._values) + + result = frame._get_label_or_level_values(level, axis=axis) + assert array_equivalent(expected, result) + + +# DataFrame +# --------- +@pytest.mark.parametrize('axis', [0, 1]) +def test_get_label_or_level_values_df_simple(df_levels, axis): - with tm.assert_raises_regex(ValueError, expected_msg): - self.check_labels(df, ['L2'], axis=axis) + # Compute expected labels and levels + expected_labels, expected_levels = get_labels_levels(df_levels) - # Series - # ------ - def test_get_label_or_level_values_series_axis0(self): + # Transpose frame if axis == 1 + if axis == 1: + df_levels = df_levels.T - # Make series with L1 as index - s = self.df1.set_index('L1').L2 - self.check_levels(s, ['L1'], axis=0) + # Perform checks + check_label_values(df_levels, expected_labels, axis=axis) + check_level_values(df_levels, expected_levels, axis=axis) - # Make series with L1 and L2 as index - s = self.df1.set_index(['L1', 'L2']).L3 - self.check_levels(s, ['L1', 'L2'], axis=0) - def test_get_label_or_level_values_series_axis1_error(self): +@pytest.mark.parametrize('axis', [0, 1]) +def test_get_label_or_level_values_df_ambig(df_ambig, axis): - # Make series with L1 as index - s = self.df1.set_index('L1').L2 + # Transpose frame if axis == 1 + if axis == 1: + df_ambig = df_ambig.T - with tm.assert_raises_regex(ValueError, "No axis named 1"): - s._get_label_or_level_values('L1', axis=1) + # df has both an on-axis level and off-axis label named L1 + # Therefore L1 is ambiguous but will default to label + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + check_label_values(df_ambig, ['L1'], axis=axis) - # Test _drop_labels_or_levels - # =========================== - @staticmethod - def check_labels_dropped(frame, labels, axis): - for label in labels: - df_dropped = frame._drop_labels_or_levels(label, axis=axis) + # df has an on-axis 
level named L2 and it is not ambiguous + with tm.assert_produces_warning(None): + check_level_values(df_ambig, ['L2'], axis=axis) - if axis == 0: - assert label in frame.columns - assert label not in df_dropped.columns - else: - assert label in frame.index - assert label not in df_dropped.index + # df has an off-axis label named L3 and it is not ambiguous + with tm.assert_produces_warning(None): + check_label_values(df_ambig, ['L3'], axis=axis) - @staticmethod - def check_levels_dropped(frame, levels, axis): - for level in levels: - df_dropped = frame._drop_labels_or_levels(level, axis=axis) - if axis == 0: - assert level in frame.index.names - assert level not in df_dropped.index.names - else: - assert level in frame.columns.names - assert level not in df_dropped.columns.names +@pytest.mark.parametrize('axis', [0, 1]) +def test_get_label_or_level_values_df_duplabels(df_duplabels, axis): - # DataFrame - # --------- - @pytest.mark.parametrize('axis', [0, 1]) - def test_drop_labels_or_levels_df(self, axis): + # Transpose frame if axis == 1 + if axis == 1: + df_duplabels = df_duplabels.T - # ### df has no named index levels ### - df = self.prepare_df1(axis=axis) - self.check_labels_dropped(df, ['L1', 'L2', 'L3'], axis=axis) + # df has unambiguous level 'L1' + check_level_values(df_duplabels, ['L1'], axis=axis) + + # df has unique label 'L3' + check_label_values(df_duplabels, ['L3'], axis=axis) + + # df has duplicate labels 'L2' + if axis == 0: + expected_msg = "The column label 'L2' is not unique" + else: + expected_msg = "The index label 'L2' is not unique" + + with tm.assert_raises_regex(ValueError, expected_msg): + check_label_values(df_duplabels, ['L2'], axis=axis) + + +# Series +# ------ +def test_get_label_or_level_values_series_axis0(df): + + # Make series with L1 as index + s = df.set_index('L1').L2 + check_level_values(s, ['L1'], axis=0) + + # Make series with L1 and L2 as index + s = df.set_index(['L1', 'L2']).L3 + check_level_values(s, ['L1', 'L2'], axis=0) 
+ + +def test_get_label_or_level_values_series_axis1_error(df): + + # Make series with L1 as index + s = df.set_index('L1').L2 + + with tm.assert_raises_regex(ValueError, "No axis named 1"): + s._get_label_or_level_values('L1', axis=1) + + +# Test _drop_labels_or_levels +# =========================== +def check_labels_dropped(frame, labels, axis): + for label in labels: + df_dropped = frame._drop_labels_or_levels(label, axis=axis) + + if axis == 0: + assert label in frame.columns + assert label not in df_dropped.columns + else: + assert label in frame.index + assert label not in df_dropped.index + + +def check_levels_dropped(frame, levels, axis): + for level in levels: + df_dropped = frame._drop_labels_or_levels(level, axis=axis) + + if axis == 0: + assert level in frame.index.names + assert level not in df_dropped.index.names + else: + assert level in frame.columns.names + assert level not in df_dropped.columns.names - with tm.assert_raises_regex(ValueError, "not valid labels or levels"): - df._drop_labels_or_levels('L4', axis=axis) - # ### Set L1 as index level ### - df = self.prepare_df1('L1', axis=axis) - self.check_labels_dropped(df, ['L2', 'L3'], axis=axis) - self.check_levels_dropped(df, ['L1'], axis=axis) +# DataFrame +# --------- +@pytest.mark.parametrize('axis', [0, 1]) +def test_drop_labels_or_levels_df(df_levels, axis): - with tm.assert_raises_regex(ValueError, "not valid labels or levels"): - df._drop_labels_or_levels('L4', axis=axis) + # Compute expected labels and levels + expected_labels, expected_levels = get_labels_levels(df_levels) - # ### Set L1 and L2 as index levels ### - df = self.prepare_df1(['L1', 'L2'], axis=axis) - self.check_labels_dropped(df, ['L3'], axis=axis) - self.check_levels_dropped(df, ['L1', 'L2'], axis=axis) + # Transpose frame if axis == 1 + if axis == 1: + df_levels = df_levels.T - with tm.assert_raises_regex(ValueError, "not valid labels or levels"): - df._drop_labels_or_levels('L4', axis=axis) + # Perform checks + 
check_labels_dropped(df_levels, expected_labels, axis=axis) + check_levels_dropped(df_levels, expected_levels, axis=axis) - # ### Set L1, L2, and L3 as index levels ### - df = self.prepare_df1(['L1', 'L2', 'L3'], axis=axis) - self.check_levels_dropped(df, ['L1', 'L2', 'L3'], axis=axis) + with tm.assert_raises_regex(ValueError, "not valid labels or levels"): + df_levels._drop_labels_or_levels('L4', axis=axis) - with tm.assert_raises_regex(ValueError, "not valid labels or levels"): - df._drop_labels_or_levels('L4', axis=axis) - # Series - # ------ - def test_drop_labels_or_levels_series(self): +# Series +# ------ +def test_drop_labels_or_levels_series(df): - # Make series with L1 as index - s = self.df1.set_index('L1').L2 - self.check_levels_dropped(s, ['L1'], axis=0) + # Make series with L1 as index + s = df.set_index('L1').L2 + check_levels_dropped(s, ['L1'], axis=0) - with tm.assert_raises_regex(ValueError, "not valid index levels"): - s._drop_labels_or_levels('L4', axis=0) + with tm.assert_raises_regex(ValueError, "not valid index levels"): + s._drop_labels_or_levels('L4', axis=0) - # Make series with L1 and L2 as index - s = self.df1.set_index(['L1', 'L2']).L3 - self.check_levels_dropped(s, ['L1', 'L2'], axis=0) + # Make series with L1 and L2 as index + s = df.set_index(['L1', 'L2']).L3 + check_levels_dropped(s, ['L1', 'L2'], axis=0) - with tm.assert_raises_regex(ValueError, "not valid index levels"): - s._drop_labels_or_levels('L4', axis=0) + with tm.assert_raises_regex(ValueError, "not valid index levels"): + s._drop_labels_or_levels('L4', axis=0) From 090b3e845ddf0bef327f50cf033b28009cc406c5 Mon Sep 17 00:00:00 2001 From: "Jon M. 
Mease" Date: Mon, 6 Nov 2017 13:30:15 -0500 Subject: [PATCH 20/34] Moved label_or_level utils on Series and DataFrame to NDFrame --- pandas/core/frame.py | 155 +-------------- pandas/core/generic.py | 185 +++++++++++++++--- pandas/core/series.py | 54 ----- .../generic/test_label_or_level_utils.py | 4 +- 4 files changed, 159 insertions(+), 239 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b6df791c2b071..f9bef27c08f22 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -68,7 +68,7 @@ standardize_mapping) from pandas.core.generic import NDFrame, _shared_docs from pandas.core.index import (Index, MultiIndex, _ensure_index, - _ensure_index_from_sequences, RangeIndex) + _ensure_index_from_sequences) from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable, check_bool_indexer) from pandas.core.internals import (BlockManager, @@ -2220,159 +2220,6 @@ def _getitem_frame(self, key): raise ValueError('Must pass DataFrame with boolean values only') return self.where(key) - # ------------------------------------------------------------------------- - # Label or Level Combination Helpers - - @Appender(_shared_docs['_is_level_reference']) - def _is_level_reference(self, key, axis=0): - axis = self._get_axis_number(axis) - if axis == 0: - return (isinstance(key, compat.string_types) and - key not in self.columns and - key in self.index.names) - elif axis == 1: - return (isinstance(key, compat.string_types) and - key not in self.index and - key in self.columns.names) - - @Appender(_shared_docs['_is_label_reference']) - def _is_label_reference(self, key, axis=0): - axis = self._get_axis_number(axis) - if axis == 0: - return (isinstance(key, compat.string_types) and - key in self.columns) - elif axis == 1: - return (isinstance(key, compat.string_types) and - key in self.index) - - @Appender(_shared_docs['_check_label_or_level_ambiguity']) - def _check_label_or_level_ambiguity(self, key, axis=0): - - axis = 
self._get_axis_number(axis) - - def raise_warning(): - - # Build an informative and grammatical warning - level_article, level_type = (('an', 'index') - if axis == 0 else - ('a', 'column')) - - label_article, label_type = (('a', 'column') - if axis == 0 else - ('an', 'index')) - - warnings.warn( - ("'{key}' is both {level_article} {level_type} level and " - "{label_article} {label_type} label.\n" - "Defaulting to {label_type}, but this will raise an " - "ambiguity error in a future version" - ).format(key=key, - level_article=level_article, - level_type=level_type, - label_article=label_article, - label_type=label_type), FutureWarning) - - if axis == 0: - if (isinstance(key, compat.string_types) and - key in self.columns and - key in self.index.names): - - raise_warning() - return True - else: - return False - else: - if (isinstance(key, compat.string_types) and - key in self.index and - key in self.columns.names): - - raise_warning() - return True - else: - return False - - @Appender(_shared_docs['_get_label_or_level_values']) - def _get_label_or_level_values(self, key, axis=0): - axis = self._get_axis_number(axis) - if axis == 0: - if key in self: - self._check_label_or_level_ambiguity(key, axis=axis) - values = self[key]._values - elif self._is_level_reference(key, axis=axis): - values = self.index.get_level_values(key)._values - else: - raise KeyError(key) - else: - if key in self.index: - self._check_label_or_level_ambiguity(key, axis=axis) - values = self.loc[key]._values - elif self._is_level_reference(key, axis=axis): - values = self.columns.get_level_values(key)._values - else: - raise KeyError(key) - - # Check for duplicates - if values.ndim > 1: - label_axis_name = 'column' if axis == 0 else 'index' - raise ValueError(("The {label_axis_name} label '{key}' " - "is not unique") - .format(key=key, - label_axis_name=label_axis_name)) - - return values - - @Appender(_shared_docs['_drop_labels_or_levels']) - def _drop_labels_or_levels(self, keys, axis=0): - 
axis = self._get_axis_number(axis) - keys = com._maybe_make_list(keys) - - # Validate keys - invalid_keys = [k for k in keys if not - self._is_label_or_level_reference(k, axis=axis)] - - if invalid_keys: - raise ValueError(("The following keys are not valid labels or " - "levels for {axis}: {invalid_keys}") - .format(axis=axis, - invalid_keys=invalid_keys)) - - # Compute levels and labels to drop - levels_to_drop = [k for k in keys - if self._is_level_reference(k, axis=axis)] - - labels_to_drop = [k for k in keys - if not self._is_level_reference(k, axis=axis)] - - # Perform copy upfront and then use inplace operations below. - # This ensures that we always perform exactly one copy. - # ``copy`` and/or ``inplace`` options could be added in the future. - dropped = self.copy() - - if axis == 0: - # Handle dropping index levels - if levels_to_drop: - dropped.reset_index(levels_to_drop, drop=True, inplace=True) - - # Handle dropping columns labels - if labels_to_drop: - dropped.drop(labels_to_drop, axis=1, inplace=True) - else: - # Handle dropping column levels - if levels_to_drop: - if isinstance(dropped.columns, MultiIndex): - # Drop the specified levels from the MultiIndex - dropped.columns = dropped.columns.droplevel(levels_to_drop) - else: - # Drop the last level of Index by replacing with - # a RangeIndex - dropped.columns = RangeIndex(dropped.columns.size) - - # Handle dropping index labels - if labels_to_drop: - dropped.drop(labels_to_drop, axis=0, inplace=True) - - return dropped - def query(self, expr, inplace=False, **kwargs): """Query the columns of a frame with a boolean expression. 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ef4dd311cc9bf..e0affa0f19ecd 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -36,7 +36,7 @@ from pandas.core.base import PandasObject, SelectionMixin from pandas.core.index import (Index, MultiIndex, _ensure_index, - InvalidIndexError) + InvalidIndexError, RangeIndex) import pandas.core.indexing as indexing from pandas.core.indexing import maybe_convert_indices from pandas.core.indexes.datetimes import DatetimeIndex @@ -1046,7 +1046,8 @@ def equals(self, other): # operations should utilize/extend these methods when possible so that we # have consistent precedence and validation logic throughout the library. - _shared_docs['_is_level_reference'] = """ + def _is_level_reference(self, key, axis=0): + """ Test whether a key is a level reference for a given axis. To be considered a level reference, `key` must be a string that: @@ -1064,13 +1065,21 @@ def equals(self, other): Returns ------- - is_level: bool""" + is_level: bool + """ + axis = self._get_axis_number(axis) - @Appender(_shared_docs['_is_level_reference']) - def _is_level_reference(self, key, axis=0): - raise NotImplementedError() + if self.ndim > 2: + raise NotImplementedError( + "_is_level_reference is not implemented for {type}" + .format(type=type(self))) - _shared_docs['_is_label_reference'] = """ + return (isinstance(key, compat.string_types) and + key in self.axes[axis].names and + not self._is_label_reference(key, axis=axis)) + + def _is_label_reference(self, key, axis=0): + """ Test whether a key is a label reference for a given axis. 
To be considered a label reference, `key` must be a string that: @@ -1087,13 +1096,21 @@ def _is_level_reference(self, key, axis=0): Returns ------- - is_label: bool""" + is_label: bool + """ + axis = self._get_axis_number(axis) + other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis] - @Appender(_shared_docs['_is_label_reference']) - def _is_label_reference(self, key, axis=0): - raise NotImplementedError() + if self.ndim > 2: + raise NotImplementedError( + "_is_label_reference is not implemented for {type}" + .format(type=type(self))) + + return (isinstance(key, compat.string_types) and + any([key in self.axes[ax] for ax in other_axes])) - _shared_docs['_is_label_or_level_reference'] = """ + def _is_label_or_level_reference(self, key, axis=0): + """ Test whether a key is a label or level reference for a given axis. To be considered either a label or a level reference, `key` must be a @@ -1110,14 +1127,14 @@ def _is_label_reference(self, key, axis=0): Returns ------- - is_label_or_level: bool""" + is_label_or_level: bool + """ - @Appender(_shared_docs['_is_label_or_level_reference']) - def _is_label_or_level_reference(self, key, axis=0): return (self._is_level_reference(key, axis=axis) or self._is_label_reference(key, axis=axis)) - _shared_docs['_check_label_or_level_ambiguity'] = """ + def _check_label_or_level_ambiguity(self, key, axis=0): + """ Check whether `key` matches both a level of the input `axis` and a label of the other axis and raise a ``FutureWarning`` if this is the case. 
@@ -1144,12 +1161,47 @@ def _is_label_or_level_reference(self, key, axis=0): future version """ - @Appender(_shared_docs['_check_label_or_level_ambiguity']) - def _check_label_or_level_ambiguity(self, key, axis=0): - return False + axis = self._get_axis_number(axis) + other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis] + + if self.ndim > 2: + raise NotImplementedError( + "_check_label_or_level_ambiguity is not implemented for {type}" + .format(type=type(self))) + + def raise_warning(): - _shared_docs['_get_label_or_level_values'] = """ + # Build an informative and grammatical warning + level_article, level_type = (('an', 'index') + if axis == 0 else + ('a', 'column')) + + label_article, label_type = (('a', 'column') + if axis == 0 else + ('an', 'index')) + + warnings.warn( + ("'{key}' is both {level_article} {level_type} level and " + "{label_article} {label_type} label.\n" + "Defaulting to {label_type}, but this will raise an " + "ambiguity error in a future version" + ).format(key=key, + level_article=level_article, + level_type=level_type, + label_article=label_article, + label_type=label_type), FutureWarning) + + if (isinstance(key, compat.string_types) and + key in self.axes[axis].names and + any([key in self.axes[ax] for ax in other_axes])): + + raise_warning() + return True + else: + return False + def _get_label_or_level_values(self, key, axis=0): + """ Return a 1-D array of values associated with `key`, a label or level from the given `axis`. 
@@ -1177,14 +1229,37 @@ def _check_label_or_level_ambiguity(self, key, axis=0): KeyError if `key` matches neither a label nor a level ValueError - if `key` matches multiple labels""" + if `key` matches multiple labels + """ - @Appender(_shared_docs['_get_label_or_level_values']) - def _get_label_or_level_values(self, key, axis=0): - raise NotImplementedError() + axis = self._get_axis_number(axis) + other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis] - _shared_docs['_drop_labels_or_levels'] = """ + if self.ndim > 2: + raise NotImplementedError( + "_get_label_or_level_values is not implemented for {type}" + .format(type=type(self))) + + if self._is_label_reference(key, axis=axis): + self._check_label_or_level_ambiguity(key, axis=axis) + values = self.xs(key, axis=other_axes[0])._values + elif self._is_level_reference(key, axis=axis): + values = self.axes[axis].get_level_values(key)._values + else: + raise KeyError(key) + + # Check for duplicates + if values.ndim > 1: + label_axis_name = 'column' if axis == 0 else 'index' + raise ValueError(("The {label_axis_name} label '{key}' " + "is not unique") + .format(key=key, + label_axis_name=label_axis_name)) + + return values + def _drop_labels_or_levels(self, keys, axis=0): + """ Drop labels and/or levels for the given `axis`. 
For each key in `keys`: @@ -1207,11 +1282,63 @@ def _get_label_or_level_values(self, key, axis=0): Raises ------ ValueError - if any `keys` match neither a label nor a level""" + if any `keys` match neither a label nor a level + """ + + axis = self._get_axis_number(axis) + + if self.ndim > 2: + raise NotImplementedError( + "_drop_labels_or_levels is not implemented for {type}" + .format(type=type(self))) + + # Validate keys + keys = com._maybe_make_list(keys) + invalid_keys = [k for k in keys if not + self._is_label_or_level_reference(k, axis=axis)] + + if invalid_keys: + raise ValueError(("The following keys are not valid labels or " + "levels for axis {axis}: {invalid_keys}") + .format(axis=axis, + invalid_keys=invalid_keys)) + + # Compute levels and labels to drop + levels_to_drop = [k for k in keys + if self._is_level_reference(k, axis=axis)] + + labels_to_drop = [k for k in keys + if not self._is_level_reference(k, axis=axis)] + + # Perform copy upfront and then use inplace operations below. + # This ensures that we always perform exactly one copy. + # ``copy`` and/or ``inplace`` options could be added in the future. 
+ dropped = self.copy() + + if axis == 0: + # Handle dropping index levels + if levels_to_drop: + dropped.reset_index(levels_to_drop, drop=True, inplace=True) + + # Handle dropping columns labels + if labels_to_drop: + dropped.drop(labels_to_drop, axis=1, inplace=True) + else: + # Handle dropping column levels + if levels_to_drop: + if isinstance(dropped.columns, MultiIndex): + # Drop the specified levels from the MultiIndex + dropped.columns = dropped.columns.droplevel(levels_to_drop) + else: + # Drop the last level of Index by replacing with + # a RangeIndex + dropped.columns = RangeIndex(dropped.columns.size) + + # Handle dropping index labels + if labels_to_drop: + dropped.drop(labels_to_drop, axis=0, inplace=True) - @Appender(_shared_docs['_drop_labels_or_levels']) - def _drop_labels_or_levels(self, key, axis=0): - raise NotImplementedError() + return dropped # ---------------------------------------------------------------------- # Iteration diff --git a/pandas/core/series.py b/pandas/core/series.py index 32b5e0da0d1ec..1c92c4b8850ee 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -412,60 +412,6 @@ def get_values(self): """ same as values (but handles sparseness conversions); is a view """ return self._data.get_values() - # ------------------------------------------------------------------------- - # Label or Level Combination Helpers - - @Appender(generic._shared_docs['_is_level_reference']) - def _is_level_reference(self, key, axis=0): - axis = self._get_axis_number(axis) - return (isinstance(key, compat.string_types) and - key in self.index.names) - - @Appender(generic._shared_docs['_is_label_reference']) - def _is_label_reference(self, key, axis=0): - axis = self._get_axis_number(axis) - return False - - @Appender(generic._shared_docs['_get_label_or_level_values']) - def _get_label_or_level_values(self, key, axis=0): - axis = self._get_axis_number(axis) - if self._is_level_reference(key, axis=axis): - values = 
self.index.get_level_values(key)._values - else: - raise KeyError(key) - - return values - - @Appender(generic._shared_docs['_drop_labels_or_levels']) - def _drop_labels_or_levels(self, keys, axis=0): - axis = self._get_axis_number(axis) - keys = com._maybe_make_list(keys) - - # Validate keys - invalid_keys = [k for k in keys - if not self._is_level_reference(k, axis=axis)] - - if invalid_keys: - raise ValueError(("The following keys are not valid index levels: " - "{invalid_keys}") - .format(axis=axis, - invalid_keys=invalid_keys)) - - # Compute levels and labels to drop - levels_to_drop = [k for k in keys - if self._is_level_reference(k, axis=axis)] - - # Perform copy upfront and then use inplace operations below. - # This ensures that we always perform exactly one copy. - # ``copy`` and/or ``inplace`` options could be added in the future. - dropped = self.copy() - - # Handle dropping index levels - if levels_to_drop: - dropped.reset_index(levels_to_drop, drop=True, inplace=True) - - return dropped - @property def asobject(self): """ diff --git a/pandas/tests/generic/test_label_or_level_utils.py b/pandas/tests/generic/test_label_or_level_utils.py index 96b08c7e62ad3..61a6c6e06d15f 100644 --- a/pandas/tests/generic/test_label_or_level_utils.py +++ b/pandas/tests/generic/test_label_or_level_utils.py @@ -357,12 +357,12 @@ def test_drop_labels_or_levels_series(df): s = df.set_index('L1').L2 check_levels_dropped(s, ['L1'], axis=0) - with tm.assert_raises_regex(ValueError, "not valid index levels"): + with tm.assert_raises_regex(ValueError, "not valid labels or levels"): s._drop_labels_or_levels('L4', axis=0) # Make series with L1 and L2 as index s = df.set_index(['L1', 'L2']).L3 check_levels_dropped(s, ['L1', 'L2'], axis=0) - with tm.assert_raises_regex(ValueError, "not valid index levels"): + with tm.assert_raises_regex(ValueError, "not valid labels or levels"): s._drop_labels_or_levels('L4', axis=0) From 47ff8b8ad9822d81e960f5e025409c46ef2923e1 Mon Sep 17 00:00:00 
2001 From: "Jon M. Mease" Date: Mon, 6 Nov 2017 13:45:56 -0500 Subject: [PATCH 21/34] fix test comment typo --- pandas/tests/groupby/test_index_as_string.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_index_as_string.py b/pandas/tests/groupby/test_index_as_string.py index 3b6e15036cfe2..cee78eab3a636 100644 --- a/pandas/tests/groupby/test_index_as_string.py +++ b/pandas/tests/groupby/test_index_as_string.py @@ -108,7 +108,7 @@ def test_grouper_column_index_level_precedence(frame, assert_frame_equal(result, expected) - # Grouping with level Grouper should produce a difference result but + # Grouping with level Grouper should produce a different result but # still no warning with tm.assert_produces_warning(False): not_expected = frame.groupby(level_groupers).mean() From 59f2dce2be4bacaea1da641bf5de4c84a218d036 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Mon, 6 Nov 2017 13:46:18 -0500 Subject: [PATCH 22/34] PEP8ify --- pandas/core/frame.py | 2 +- pandas/core/generic.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f9bef27c08f22..9b489e117eab5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -200,7 +200,7 @@ ---- Support for specifying index levels as the ``on``, ``left_on``, and ``right_on`` parameters was added in version 0.22.0. 
- + Examples -------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e0affa0f19ecd..177c69cb200bf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1295,7 +1295,7 @@ def _drop_labels_or_levels(self, keys, axis=0): # Validate keys keys = com._maybe_make_list(keys) invalid_keys = [k for k in keys if not - self._is_label_or_level_reference(k, axis=axis)] + self._is_label_or_level_reference(k, axis=axis)] if invalid_keys: raise ValueError(("The following keys are not valid labels or " From 1d7e57074001848c55cc3d223bcfb22e6982f04b Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Mon, 6 Nov 2017 14:47:09 -0500 Subject: [PATCH 23/34] Moved column and index tests to new file --- pandas/tests/reshape/test_index_as_string.py | 193 +++++++++++++++++++ pandas/tests/reshape/test_merge.py | 187 ------------------ 2 files changed, 193 insertions(+), 187 deletions(-) create mode 100644 pandas/tests/reshape/test_index_as_string.py diff --git a/pandas/tests/reshape/test_index_as_string.py b/pandas/tests/reshape/test_index_as_string.py new file mode 100644 index 0000000000000..a2ee2356355a9 --- /dev/null +++ b/pandas/tests/reshape/test_index_as_string.py @@ -0,0 +1,193 @@ +import numpy as np +import pytest + +from pandas import DataFrame +from pandas.util import testing as tm +from pandas.util.testing import assert_frame_equal + + +class TestMergeColumnAndIndex(object): + # GH14355 + + def setup_method(self): + # Construct test DataFrames + self.df1 = DataFrame(dict( + outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], + inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2], + v1=np.linspace(0, 1, 11) + )) + + self.df2 = DataFrame(dict( + outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], + inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], + v2=np.linspace(10, 11, 12) + )) + + @pytest.fixture(params=[[], ['outer'], ['outer', 'inner']]) + def left_df(self, request): + """ Construct left test DataFrame with specified levels + (any of 'outer', 'inner', and 'v1')""" + levels = 
request.param + res = self.df1 + + if levels: + res = res.set_index(levels) + + return res + + @pytest.fixture(params=[[], ['outer'], ['outer', 'inner']]) + def right_df(self, request): + """ Construct right test DataFrame with specified levels + (any of 'outer', 'inner', and 'v2')""" + levels = request.param + res = self.df2 + + if levels: + res = res.set_index(levels) + + return res + + @pytest.fixture(params=['inner', 'left', 'right', 'outer']) + def how(self, request): + return request.param + + @staticmethod + def compute_expected(df_left, df_right, + on=None, left_on=None, right_on=None, how=None): + """ + Compute the expected merge result for the test case. + + This method computes the expected result of merging two DataFrames on + a combination of their columns and index levels. It does so by + explicitly dropping/resetting their named index levels, performing a + merge on their columns, and then finally restoring the appropriate + index in the result. + + Parameters + ---------- + df_left : DataFrame + The left DataFrame (may have zero or more named index levels) + df_right : DataFrame + The right DataFrame (may have zero or more named index levels) + on : list of str + The on parameter to the merge operation + left_on : list of str + The left_on parameter to the merge operation + right_on : list of str + The right_on parameter to the merge operation + how : str + The how parameter to the merge operation + + Returns + ------- + DataFrame + The expected merge result + """ + + # Handle on param if specified + if on is not None: + left_on, right_on = on, on + + # Compute input named index levels + left_levels = [n for n in df_left.index.names if n is not None] + right_levels = [n for n in df_right.index.names if n is not None] + + # Compute output named index levels + output_levels = [i for i in left_on + if i in right_levels and i in left_levels] + + # Drop index levels that aren't involved in the merge + drop_left = [n for n in left_levels if n not in 
left_on] + if drop_left: + df_left = df_left.reset_index(drop_left, drop=True) + + drop_right = [n for n in right_levels if n not in right_on] + if drop_right: + df_right = df_right.reset_index(drop_right, drop=True) + + # Convert remaining index levels to columns + reset_left = [n for n in left_levels if n in left_on] + if reset_left: + df_left = df_left.reset_index(level=reset_left) + + reset_right = [n for n in right_levels if n in right_on] + if reset_right: + df_right = df_right.reset_index(level=reset_right) + + # Perform merge + expected = df_left.merge(df_right, + left_on=left_on, + right_on=right_on, + how=how) + + # Restore index levels + if output_levels: + expected = expected.set_index(output_levels) + + return expected + + @pytest.mark.parametrize('on', + [['outer'], ['inner'], + ['outer', 'inner'], + ['inner', 'outer']]) + def test_merge_indexes_and_columns_on( + self, left_df, right_df, on, how): + + # Construct expected result + expected = self.compute_expected(left_df, right_df, on=on, how=how) + + # Perform merge + result = left_df.merge(right_df, on=on, how=how) + assert_frame_equal(result, expected, check_like=True) + + @pytest.mark.parametrize('left_on,right_on', + [(['outer'], ['outer']), (['inner'], ['inner']), + (['outer', 'inner'], ['outer', 'inner']), + (['inner', 'outer'], ['inner', 'outer'])]) + def test_merge_indexes_and_columns_lefton_righton( + self, left_df, right_df, left_on, right_on, how): + + # Construct expected result + expected = self.compute_expected(left_df, right_df, + left_on=left_on, + right_on=right_on, + how=how) + + # Perform merge + result = left_df.merge(right_df, + left_on=left_on, right_on=right_on, how=how) + assert_frame_equal(result, expected, check_like=True) + + def test_merge_index_column_precedence(self): + + # Construct left_df with both an index and a column named 'outer'. 
+ # We make this 'outer' column equal to the 'inner' column so that we + # can verify that the correct values are used by the merge operation + left_df = self.df1.set_index('outer') + left_df['outer'] = left_df['inner'] + + # Construct right_df with an index level named 'outer' + right_df = self.df2.set_index('outer') + + # Construct expected result. + # The 'outer' column from left_df is chosen and the resulting + # frame has no index levels + expected = (left_df.reset_index(level='outer', drop=True) + .merge(right_df.reset_index(), on=['outer', 'inner'])) + + # Merge left_df and right_df on 'outer' and 'inner' + # 'outer' for left_df should refer to the 'outer' column, not the + # 'outer' index level and a FutureWarning should be raised + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = left_df.merge(right_df, on=['outer', 'inner']) + + # Check results + assert_frame_equal(result, expected) + + # Perform the same using the left_on and right_on parameters + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = left_df.merge(right_df, + left_on=['outer', 'inner'], + right_on=['outer', 'inner']) + + assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index 6f1fe03971836..3fe285a5df8fb 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -1361,193 +1361,6 @@ def f(): pytest.raises(NotImplementedError, f) -class TestMergeColumnAndIndex(object): - # GH14355 - - def setup_method(self): - # Construct test DataFrames - self.df1 = DataFrame(dict( - outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], - inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2], - v1=np.linspace(0, 1, 11) - )) - - self.df2 = DataFrame(dict( - outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], - inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], - v2=np.linspace(10, 11, 12) - )) - - @pytest.fixture(params=[[], ['outer'], ['outer', 'inner']]) - def left_df(self, 
request): - """ Construct left test DataFrame with specified levels - (any of 'outer', 'inner', and 'v1')""" - levels = request.param - res = self.df1 - - if levels: - res = res.set_index(levels) - - return res - - @pytest.fixture(params=[[], ['outer'], ['outer', 'inner']]) - def right_df(self, request): - """ Construct right test DataFrame with specified levels - (any of 'outer', 'inner', and 'v2')""" - levels = request.param - res = self.df2 - - if levels: - res = res.set_index(levels) - - return res - - @pytest.fixture(params=['inner', 'left', 'right', 'outer']) - def how(self, request): - return request.param - - @staticmethod - def compute_expected(df_left, df_right, - on=None, left_on=None, right_on=None, how=None): - """ - Compute the expected merge result for the test case. - - This method computes the expected result of merging two DataFrames on - a combination of their columns and index levels. It does so by - explicitly dropping/resetting their named index levels, performing a - merge on their columns, and then finally restoring the appropriate - index in the result. 
- - Parameters - ---------- - df_left : DataFrame - The left DataFrame (may have zero or more named index levels) - df_right : DataFrame - The right DataFrame (may have zero or more named index levels) - on : list of str - The on parameter to the merge operation - left_on : list of str - The left_on parameter to the merge operation - right_on : list of str - The right_on parameter to the merge operation - how : str - The how parameter to the merge operation - - Returns - ------- - DataFrame - The expected merge result - """ - - # Handle on param if specified - if on is not None: - left_on, right_on = on, on - - # Compute input named index levels - left_levels = [n for n in df_left.index.names if n is not None] - right_levels = [n for n in df_right.index.names if n is not None] - - # Compute output named index levels - output_levels = [i for i in left_on - if i in right_levels and i in left_levels] - - # Drop index levels that aren't involved in the merge - drop_left = [n for n in left_levels if n not in left_on] - if drop_left: - df_left = df_left.reset_index(drop_left, drop=True) - - drop_right = [n for n in right_levels if n not in right_on] - if drop_right: - df_right = df_right.reset_index(drop_right, drop=True) - - # Convert remaining index levels to columns - reset_left = [n for n in left_levels if n in left_on] - if reset_left: - df_left = df_left.reset_index(level=reset_left) - - reset_right = [n for n in right_levels if n in right_on] - if reset_right: - df_right = df_right.reset_index(level=reset_right) - - # Perform merge - expected = df_left.merge(df_right, - left_on=left_on, - right_on=right_on, - how=how) - - # Restore index levels - if output_levels: - expected = expected.set_index(output_levels) - - return expected - - @pytest.mark.parametrize('on', - [['outer'], ['inner'], - ['outer', 'inner'], - ['inner', 'outer']]) - def test_merge_indexes_and_columns_on( - self, left_df, right_df, on, how): - - # Construct expected result - expected = 
self.compute_expected(left_df, right_df, on=on, how=how) - - # Perform merge - result = left_df.merge(right_df, on=on, how=how) - assert_frame_equal(result, expected, check_like=True) - - @pytest.mark.parametrize('left_on,right_on', - [(['outer'], ['outer']), (['inner'], ['inner']), - (['outer', 'inner'], ['outer', 'inner']), - (['inner', 'outer'], ['inner', 'outer'])]) - def test_merge_indexes_and_columns_lefton_righton( - self, left_df, right_df, left_on, right_on, how): - - # Construct expected result - expected = self.compute_expected(left_df, right_df, - left_on=left_on, - right_on=right_on, - how=how) - - # Perform merge - result = left_df.merge(right_df, - left_on=left_on, right_on=right_on, how=how) - assert_frame_equal(result, expected, check_like=True) - - def test_merge_index_column_precedence(self): - - # Construct left_df with both an index and a column named 'outer'. - # We make this 'outer' column equal to the 'inner' column so that we - # can verify that the correct values are used by the merge operation - left_df = self.df1.set_index('outer') - left_df['outer'] = left_df['inner'] - - # Construct right_df with an index level named 'outer' - right_df = self.df2.set_index('outer') - - # Construct expected result. 
- # The 'outer' column from left_df is chosen and the resulting - # frame has no index levels - expected = (left_df.reset_index(level='outer', drop=True) - .merge(right_df.reset_index(), on=['outer', 'inner'])) - - # Merge left_df and right_df on 'outer' and 'inner' - # 'outer' for left_df should refer to the 'outer' column, not the - # 'outer' index level and a FutureWarning should be raised - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = left_df.merge(right_df, on=['outer', 'inner']) - - # Check results - assert_frame_equal(result, expected) - - # Perform the same using the left_on and right_on parameters - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = left_df.merge(right_df, - left_on=['outer', 'inner'], - right_on=['outer', 'inner']) - - assert_frame_equal(result, expected) - - @pytest.fixture def df(): return DataFrame( From dd289a6789d17e5542af58e98186baa92229b497 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Mon, 6 Nov 2017 14:53:03 -0500 Subject: [PATCH 24/34] Remove test class and convert to using fixtures --- pandas/tests/reshape/test_index_as_string.py | 368 +++++++++---------- 1 file changed, 184 insertions(+), 184 deletions(-) diff --git a/pandas/tests/reshape/test_index_as_string.py b/pandas/tests/reshape/test_index_as_string.py index a2ee2356355a9..b6b0a59b806af 100644 --- a/pandas/tests/reshape/test_index_as_string.py +++ b/pandas/tests/reshape/test_index_as_string.py @@ -6,188 +6,188 @@ from pandas.util.testing import assert_frame_equal -class TestMergeColumnAndIndex(object): - # GH14355 - - def setup_method(self): - # Construct test DataFrames - self.df1 = DataFrame(dict( - outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], - inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2], - v1=np.linspace(0, 1, 11) - )) - - self.df2 = DataFrame(dict( - outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], - inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], - v2=np.linspace(10, 11, 12) - )) - - 
@pytest.fixture(params=[[], ['outer'], ['outer', 'inner']]) - def left_df(self, request): - """ Construct left test DataFrame with specified levels - (any of 'outer', 'inner', and 'v1')""" - levels = request.param - res = self.df1 - - if levels: - res = res.set_index(levels) - - return res - - @pytest.fixture(params=[[], ['outer'], ['outer', 'inner']]) - def right_df(self, request): - """ Construct right test DataFrame with specified levels - (any of 'outer', 'inner', and 'v2')""" - levels = request.param - res = self.df2 - - if levels: - res = res.set_index(levels) - - return res - - @pytest.fixture(params=['inner', 'left', 'right', 'outer']) - def how(self, request): - return request.param - - @staticmethod - def compute_expected(df_left, df_right, - on=None, left_on=None, right_on=None, how=None): - """ - Compute the expected merge result for the test case. - - This method computes the expected result of merging two DataFrames on - a combination of their columns and index levels. It does so by - explicitly dropping/resetting their named index levels, performing a - merge on their columns, and then finally restoring the appropriate - index in the result. 
- - Parameters - ---------- - df_left : DataFrame - The left DataFrame (may have zero or more named index levels) - df_right : DataFrame - The right DataFrame (may have zero or more named index levels) - on : list of str - The on parameter to the merge operation - left_on : list of str - The left_on parameter to the merge operation - right_on : list of str - The right_on parameter to the merge operation - how : str - The how parameter to the merge operation - - Returns - ------- - DataFrame - The expected merge result - """ - - # Handle on param if specified - if on is not None: - left_on, right_on = on, on - - # Compute input named index levels - left_levels = [n for n in df_left.index.names if n is not None] - right_levels = [n for n in df_right.index.names if n is not None] - - # Compute output named index levels - output_levels = [i for i in left_on - if i in right_levels and i in left_levels] - - # Drop index levels that aren't involved in the merge - drop_left = [n for n in left_levels if n not in left_on] - if drop_left: - df_left = df_left.reset_index(drop_left, drop=True) - - drop_right = [n for n in right_levels if n not in right_on] - if drop_right: - df_right = df_right.reset_index(drop_right, drop=True) - - # Convert remaining index levels to columns - reset_left = [n for n in left_levels if n in left_on] - if reset_left: - df_left = df_left.reset_index(level=reset_left) - - reset_right = [n for n in right_levels if n in right_on] - if reset_right: - df_right = df_right.reset_index(level=reset_right) - - # Perform merge - expected = df_left.merge(df_right, - left_on=left_on, - right_on=right_on, - how=how) - - # Restore index levels - if output_levels: - expected = expected.set_index(output_levels) - - return expected - - @pytest.mark.parametrize('on', - [['outer'], ['inner'], - ['outer', 'inner'], - ['inner', 'outer']]) - def test_merge_indexes_and_columns_on( - self, left_df, right_df, on, how): - - # Construct expected result - expected = 
self.compute_expected(left_df, right_df, on=on, how=how) - - # Perform merge - result = left_df.merge(right_df, on=on, how=how) - assert_frame_equal(result, expected, check_like=True) - - @pytest.mark.parametrize('left_on,right_on', - [(['outer'], ['outer']), (['inner'], ['inner']), - (['outer', 'inner'], ['outer', 'inner']), - (['inner', 'outer'], ['inner', 'outer'])]) - def test_merge_indexes_and_columns_lefton_righton( - self, left_df, right_df, left_on, right_on, how): - - # Construct expected result - expected = self.compute_expected(left_df, right_df, - left_on=left_on, - right_on=right_on, - how=how) - - # Perform merge +@pytest.fixture +def df1(): + return DataFrame(dict( + outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], + inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2], + v1=np.linspace(0, 1, 11))) + + +@pytest.fixture +def df2(): + return DataFrame(dict( + outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], + inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], + v2=np.linspace(10, 11, 12))) + + +@pytest.fixture(params=[[], ['outer'], ['outer', 'inner']]) +def left_df(request, df1): + """ Construct left test DataFrame with specified levels + (any of 'outer', 'inner', and 'v1')""" + levels = request.param + if levels: + df1 = df1.set_index(levels) + + return df1 + + +@pytest.fixture(params=[[], ['outer'], ['outer', 'inner']]) +def right_df(request, df2): + """ Construct right test DataFrame with specified levels + (any of 'outer', 'inner', and 'v2')""" + levels = request.param + + if levels: + df2 = df2.set_index(levels) + + return df2 + + +@pytest.fixture(params=['inner', 'left', 'right', 'outer']) +def how(request): + return request.param + + +def compute_expected(df_left, df_right, + on=None, left_on=None, right_on=None, how=None): + """ + Compute the expected merge result for the test case. + + This method computes the expected result of merging two DataFrames on + a combination of their columns and index levels. 
It does so by + explicitly dropping/resetting their named index levels, performing a + merge on their columns, and then finally restoring the appropriate + index in the result. + + Parameters + ---------- + df_left : DataFrame + The left DataFrame (may have zero or more named index levels) + df_right : DataFrame + The right DataFrame (may have zero or more named index levels) + on : list of str + The on parameter to the merge operation + left_on : list of str + The left_on parameter to the merge operation + right_on : list of str + The right_on parameter to the merge operation + how : str + The how parameter to the merge operation + + Returns + ------- + DataFrame + The expected merge result + """ + + # Handle on param if specified + if on is not None: + left_on, right_on = on, on + + # Compute input named index levels + left_levels = [n for n in df_left.index.names if n is not None] + right_levels = [n for n in df_right.index.names if n is not None] + + # Compute output named index levels + output_levels = [i for i in left_on + if i in right_levels and i in left_levels] + + # Drop index levels that aren't involved in the merge + drop_left = [n for n in left_levels if n not in left_on] + if drop_left: + df_left = df_left.reset_index(drop_left, drop=True) + + drop_right = [n for n in right_levels if n not in right_on] + if drop_right: + df_right = df_right.reset_index(drop_right, drop=True) + + # Convert remaining index levels to columns + reset_left = [n for n in left_levels if n in left_on] + if reset_left: + df_left = df_left.reset_index(level=reset_left) + + reset_right = [n for n in right_levels if n in right_on] + if reset_right: + df_right = df_right.reset_index(level=reset_right) + + # Perform merge + expected = df_left.merge(df_right, + left_on=left_on, + right_on=right_on, + how=how) + + # Restore index levels + if output_levels: + expected = expected.set_index(output_levels) + + return expected + + +@pytest.mark.parametrize('on', + [['outer'], ['inner'], 
+ ['outer', 'inner'], + ['inner', 'outer']]) +def test_merge_indexes_and_columns_on(left_df, right_df, on, how): + + # Construct expected result + expected = compute_expected(left_df, right_df, on=on, how=how) + + # Perform merge + result = left_df.merge(right_df, on=on, how=how) + assert_frame_equal(result, expected, check_like=True) + + +@pytest.mark.parametrize('left_on,right_on', + [(['outer'], ['outer']), (['inner'], ['inner']), + (['outer', 'inner'], ['outer', 'inner']), + (['inner', 'outer'], ['inner', 'outer'])]) +def test_merge_indexes_and_columns_lefton_righton( + left_df, right_df, left_on, right_on, how): + + # Construct expected result + expected = compute_expected(left_df, right_df, + left_on=left_on, + right_on=right_on, + how=how) + + # Perform merge + result = left_df.merge(right_df, + left_on=left_on, right_on=right_on, how=how) + assert_frame_equal(result, expected, check_like=True) + + +def test_merge_index_column_precedence(df1, df2): + + # Construct left_df with both an index and a column named 'outer'. + # We make this 'outer' column equal to the 'inner' column so that we + # can verify that the correct values are used by the merge operation + left_df = df1.set_index('outer') + left_df['outer'] = left_df['inner'] + + # Construct right_df with an index level named 'outer' + right_df = df2.set_index('outer') + + # Construct expected result. 
+ # The 'outer' column from left_df is chosen and the resulting + # frame has no index levels + expected = (left_df.reset_index(level='outer', drop=True) + .merge(right_df.reset_index(), on=['outer', 'inner'])) + + # Merge left_df and right_df on 'outer' and 'inner' + # 'outer' for left_df should refer to the 'outer' column, not the + # 'outer' index level and a FutureWarning should be raised + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = left_df.merge(right_df, on=['outer', 'inner']) + + # Check results + assert_frame_equal(result, expected) + + # Perform the same using the left_on and right_on parameters + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = left_df.merge(right_df, - left_on=left_on, right_on=right_on, how=how) - assert_frame_equal(result, expected, check_like=True) - - def test_merge_index_column_precedence(self): - - # Construct left_df with both an index and a column named 'outer'. - # We make this 'outer' column equal to the 'inner' column so that we - # can verify that the correct values are used by the merge operation - left_df = self.df1.set_index('outer') - left_df['outer'] = left_df['inner'] - - # Construct right_df with an index level named 'outer' - right_df = self.df2.set_index('outer') - - # Construct expected result. 
- # The 'outer' column from left_df is chosen and the resulting - # frame has no index levels - expected = (left_df.reset_index(level='outer', drop=True) - .merge(right_df.reset_index(), on=['outer', 'inner'])) - - # Merge left_df and right_df on 'outer' and 'inner' - # 'outer' for left_df should refer to the 'outer' column, not the - # 'outer' index level and a FutureWarning should be raised - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = left_df.merge(right_df, on=['outer', 'inner']) - - # Check results - assert_frame_equal(result, expected) - - # Perform the same using the left_on and right_on parameters - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = left_df.merge(right_df, - left_on=['outer', 'inner'], - right_on=['outer', 'inner']) - - assert_frame_equal(result, expected) + left_on=['outer', 'inner'], + right_on=['outer', 'inner']) + + assert_frame_equal(result, expected) From 313d2c36df9d4b6a3a9f1f6c74438ab639438f67 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Mon, 6 Nov 2017 14:53:34 -0500 Subject: [PATCH 25/34] Rename new test file --- .../{test_index_as_string.py => test_merge_index_as_string.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pandas/tests/reshape/{test_index_as_string.py => test_merge_index_as_string.py} (100%) diff --git a/pandas/tests/reshape/test_index_as_string.py b/pandas/tests/reshape/test_merge_index_as_string.py similarity index 100% rename from pandas/tests/reshape/test_index_as_string.py rename to pandas/tests/reshape/test_merge_index_as_string.py From 0b0397bf18f8c2ef184440b76a2f22e9426451e7 Mon Sep 17 00:00:00 2001 From: "Jon M. 
Mease" Date: Tue, 7 Nov 2017 09:17:45 -0500 Subject: [PATCH 26/34] Documentation and testing review updates --- doc/source/whatsnew/v0.22.0.txt | 3 +- pandas/core/generic.py | 5 + .../generic/test_label_or_level_utils.py | 117 ++++++++++++++---- 3 files changed, 97 insertions(+), 28 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 333727bc71308..a9c17395fd181 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -23,10 +23,11 @@ Merging on a combination of columns and index levels ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Strings passed to :meth:`DataFrame.merge` as the ``on``, ``left_on``, and ``right_on`` -parameters may now refer to either column names or index level names (:issue:`14355`). +parameters may now refer to either column names or index level names. This enables merging ``DataFrame`` instances on a combination of index levels and columns without resetting indexes. See the :ref:`Merge on columns and levels <merging.merge_on_columns_and_levels>` documentation section. +(:issue:`14355`) ..
ipython:: python diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 177c69cb200bf..84c32f96f3126 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1130,6 +1130,11 @@ def _is_label_or_level_reference(self, key, axis=0): is_label_or_level: bool """ + if self.ndim > 2: + raise NotImplementedError( + "_is_label_or_level_reference is not implemented for {type}" + .format(type=type(self))) + return (self._is_level_reference(key, axis=axis) or self._is_label_reference(key, axis=axis)) diff --git a/pandas/tests/generic/test_label_or_level_utils.py b/pandas/tests/generic/test_label_or_level_utils.py index 61a6c6e06d15f..456cb48020500 100644 --- a/pandas/tests/generic/test_label_or_level_utils.py +++ b/pandas/tests/generic/test_label_or_level_utils.py @@ -44,6 +44,13 @@ def df_duplabels(df): return df +@pytest.fixture +def panel(): + with tm.assert_produces_warning(DeprecationWarning, + check_stacklevel=False): + return pd.Panel() + + # Test is label/level reference # ============================= def get_labels_levels(df_levels): @@ -53,14 +60,14 @@ def get_labels_levels(df_levels): return expected_labels, expected_levels -def check_label_reference(frame, labels, axis): +def assert_label_reference(frame, labels, axis): for label in labels: assert frame._is_label_reference(label, axis=axis) assert not frame._is_level_reference(label, axis=axis) assert frame._is_label_or_level_reference(label, axis=axis) -def check_level_reference(frame, levels, axis): +def assert_level_reference(frame, levels, axis): for level in levels: assert frame._is_level_reference(level, axis=axis) assert not frame._is_label_reference(level, axis=axis) @@ -80,8 +87,8 @@ def test_is_level_or_label_reference_df_simple(df_levels, axis): df_levels = df_levels.T # Perform checks - check_level_reference(df_levels, expected_levels, axis=axis) - check_label_reference(df_levels, expected_labels, axis=axis) + assert_level_reference(df_levels, expected_levels, axis=axis) + 
assert_label_reference(df_levels, expected_labels, axis=axis) @pytest.mark.parametrize('axis', [0, 1]) @@ -93,14 +100,14 @@ def test_is_level_reference_df_ambig(df_ambig, axis): # df has both an on-axis level and off-axis label named L1 # Therefore L1 should reference the label, not the level - check_label_reference(df_ambig, ['L1'], axis=axis) + assert_label_reference(df_ambig, ['L1'], axis=axis) # df has an on-axis level named L2 and it is not ambiguous # Therefore L2 is an level reference - check_level_reference(df_ambig, ['L2'], axis=axis) + assert_level_reference(df_ambig, ['L2'], axis=axis) # df has a column named L3 and it not an level reference - check_label_reference(df_ambig, ['L3'], axis=axis) + assert_label_reference(df_ambig, ['L3'], axis=axis) # Series @@ -109,12 +116,12 @@ def test_is_level_reference_series_simple_axis0(df): # Make series with L1 as index s = df.set_index('L1').L2 - check_level_reference(s, ['L1'], axis=0) + assert_level_reference(s, ['L1'], axis=0) assert not s._is_level_reference('L2') # Make series with L1 and L2 as index s = df.set_index(['L1', 'L2']).L3 - check_level_reference(s, ['L1', 'L2'], axis=0) + assert_level_reference(s, ['L1', 'L2'], axis=0) assert not s._is_level_reference('L3') @@ -127,6 +134,32 @@ def test_is_level_reference_series_axis1_error(df): s._is_level_reference('L1', axis=1) +# Panel +# ----- +def test_is_level_reference_panel_error(panel): + msg = ("_is_level_reference is not implemented for {type}" + .format(type=type(panel))) + + with tm.assert_raises_regex(NotImplementedError, msg): + panel._is_level_reference('L1', axis=0) + + +def test_is_label_reference_panel_error(panel): + msg = ("_is_label_reference is not implemented for {type}" + .format(type=type(panel))) + + with tm.assert_raises_regex(NotImplementedError, msg): + panel._is_label_reference('L1', axis=0) + + +def test_is_label_or_level_reference_panel_error(panel): + msg = ("_is_label_or_level_reference is not implemented for {type}" + 
.format(type=type(panel))) + + with tm.assert_raises_regex(NotImplementedError, msg): + panel._is_label_or_level_reference('L1', axis=0) + + # Test _check_label_or_level_ambiguity_df # ======================================= @@ -193,9 +226,19 @@ def test_check_label_or_level_ambiguity_series_axis1_error(df): s._check_label_or_level_ambiguity('L1', axis=1) +# Panel +# ----- +def test_check_label_or_level_ambiguity_panel_error(panel): + msg = ("_check_label_or_level_ambiguity is not implemented for {type}" + .format(type=type(panel))) + + with tm.assert_raises_regex(NotImplementedError, msg): + panel._check_label_or_level_ambiguity('L1', axis=0) + + # Test _get_label_or_level_values # =============================== -def check_label_values(frame, labels, axis): +def assert_label_values(frame, labels, axis): for label in labels: if axis == 0: expected = frame[label]._values @@ -206,7 +249,7 @@ def check_label_values(frame, labels, axis): assert array_equivalent(expected, result) -def check_level_values(frame, levels, axis): +def assert_level_values(frame, levels, axis): for level in levels: if axis == 0: expected = frame.index.get_level_values(level=level)._values @@ -232,8 +275,8 @@ def test_get_label_or_level_values_df_simple(df_levels, axis): df_levels = df_levels.T # Perform checks - check_label_values(df_levels, expected_labels, axis=axis) - check_level_values(df_levels, expected_levels, axis=axis) + assert_label_values(df_levels, expected_labels, axis=axis) + assert_level_values(df_levels, expected_levels, axis=axis) @pytest.mark.parametrize('axis', [0, 1]) @@ -247,15 +290,15 @@ def test_get_label_or_level_values_df_ambig(df_ambig, axis): # Therefore L1 is ambiguous but will default to label with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - check_label_values(df_ambig, ['L1'], axis=axis) + assert_label_values(df_ambig, ['L1'], axis=axis) # df has an on-axis level named L2 and it is not ambiguous with tm.assert_produces_warning(None): - 
check_level_values(df_ambig, ['L2'], axis=axis) + assert_level_values(df_ambig, ['L2'], axis=axis) # df has an off-axis label named L3 and it is not ambiguous with tm.assert_produces_warning(None): - check_label_values(df_ambig, ['L3'], axis=axis) + assert_label_values(df_ambig, ['L3'], axis=axis) @pytest.mark.parametrize('axis', [0, 1]) @@ -266,10 +309,10 @@ def test_get_label_or_level_values_df_duplabels(df_duplabels, axis): df_duplabels = df_duplabels.T # df has unambiguous level 'L1' - check_level_values(df_duplabels, ['L1'], axis=axis) + assert_level_values(df_duplabels, ['L1'], axis=axis) # df has unique label 'L3' - check_label_values(df_duplabels, ['L3'], axis=axis) + assert_label_values(df_duplabels, ['L3'], axis=axis) # df has duplicate labels 'L2' if axis == 0: @@ -278,7 +321,7 @@ def test_get_label_or_level_values_df_duplabels(df_duplabels, axis): expected_msg = "The index label 'L2' is not unique" with tm.assert_raises_regex(ValueError, expected_msg): - check_label_values(df_duplabels, ['L2'], axis=axis) + assert_label_values(df_duplabels, ['L2'], axis=axis) # Series @@ -287,11 +330,11 @@ def test_get_label_or_level_values_series_axis0(df): # Make series with L1 as index s = df.set_index('L1').L2 - check_level_values(s, ['L1'], axis=0) + assert_level_values(s, ['L1'], axis=0) # Make series with L1 and L2 as index s = df.set_index(['L1', 'L2']).L3 - check_level_values(s, ['L1', 'L2'], axis=0) + assert_level_values(s, ['L1', 'L2'], axis=0) def test_get_label_or_level_values_series_axis1_error(df): @@ -303,9 +346,19 @@ def test_get_label_or_level_values_series_axis1_error(df): s._get_label_or_level_values('L1', axis=1) +# Panel +# ----- +def test_get_label_or_level_values_panel_error(panel): + msg = ("_get_label_or_level_values is not implemented for {type}" + .format(type=type(panel))) + + with tm.assert_raises_regex(NotImplementedError, msg): + panel._get_label_or_level_values('L1', axis=0) + + # Test _drop_labels_or_levels # =========================== 
-def check_labels_dropped(frame, labels, axis): +def assert_labels_dropped(frame, labels, axis): for label in labels: df_dropped = frame._drop_labels_or_levels(label, axis=axis) @@ -317,7 +370,7 @@ def check_labels_dropped(frame, labels, axis): assert label not in df_dropped.index -def check_levels_dropped(frame, levels, axis): +def assert_levels_dropped(frame, levels, axis): for level in levels: df_dropped = frame._drop_labels_or_levels(level, axis=axis) @@ -342,8 +395,8 @@ def test_drop_labels_or_levels_df(df_levels, axis): df_levels = df_levels.T # Perform checks - check_labels_dropped(df_levels, expected_labels, axis=axis) - check_levels_dropped(df_levels, expected_levels, axis=axis) + assert_labels_dropped(df_levels, expected_labels, axis=axis) + assert_levels_dropped(df_levels, expected_levels, axis=axis) with tm.assert_raises_regex(ValueError, "not valid labels or levels"): df_levels._drop_labels_or_levels('L4', axis=axis) @@ -355,14 +408,24 @@ def test_drop_labels_or_levels_series(df): # Make series with L1 as index s = df.set_index('L1').L2 - check_levels_dropped(s, ['L1'], axis=0) + assert_levels_dropped(s, ['L1'], axis=0) with tm.assert_raises_regex(ValueError, "not valid labels or levels"): s._drop_labels_or_levels('L4', axis=0) # Make series with L1 and L2 as index s = df.set_index(['L1', 'L2']).L3 - check_levels_dropped(s, ['L1', 'L2'], axis=0) + assert_levels_dropped(s, ['L1', 'L2'], axis=0) with tm.assert_raises_regex(ValueError, "not valid labels or levels"): s._drop_labels_or_levels('L4', axis=0) + + +# Panel +# ----- +def test_drop_labels_or_levels_panel_error(panel): + msg = ("_drop_labels_or_levels is not implemented for {type}" + .format(type=type(panel))) + + with tm.assert_raises_regex(NotImplementedError, msg): + panel._drop_labels_or_levels('L1', axis=0) From a49012c7e4b2996031fa42feb6f55e32ff551f41 Mon Sep 17 00:00:00 2001 From: "Jon M. 
Mease" Date: Thu, 23 Nov 2017 12:26:56 -0500 Subject: [PATCH 27/34] Fix generator/list lint issues --- pandas/core/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fa500aa84f417..a525505e4e04e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1107,7 +1107,7 @@ def _is_label_reference(self, key, axis=0): .format(type=type(self))) return (isinstance(key, compat.string_types) and - any([key in self.axes[ax] for ax in other_axes])) + any(key in self.axes[ax] for ax in other_axes)) def _is_label_or_level_reference(self, key, axis=0): """ @@ -1198,7 +1198,7 @@ def raise_warning(): if (isinstance(key, compat.string_types) and key in self.axes[axis].names and - any([key in self.axes[ax] for ax in other_axes])): + any(key in self.axes[ax] for ax in other_axes)): raise_warning() return True From 6fd9760174acf6266a9d0cc013cf3dbfe3184eb1 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Sat, 25 Nov 2017 21:08:18 -0500 Subject: [PATCH 28/34] Allow non-None hashable objects to reference index levels (not just strings) --- pandas/core/generic.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a525505e4e04e..b359c1db20ae8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -27,6 +27,7 @@ is_re_compilable, pandas_dtype) from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask +from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import isna, notna from pandas.core.dtypes.generic import ABCSeries, ABCPanel, ABCDataFrame from pandas.core.common import (_count_not_none, @@ -1074,7 +1075,8 @@ def _is_level_reference(self, key, axis=0): "_is_level_reference is not implemented for {type}" .format(type=type(self))) - return (isinstance(key, compat.string_types) and + return (key is not None and + is_hashable(key) and key in 
self.axes[axis].names and not self._is_label_reference(key, axis=axis)) @@ -1106,7 +1108,8 @@ def _is_label_reference(self, key, axis=0): "_is_label_reference is not implemented for {type}" .format(type=type(self))) - return (isinstance(key, compat.string_types) and + return (key is not None and + is_hashable(key) and any(key in self.axes[ax] for ax in other_axes)) def _is_label_or_level_reference(self, key, axis=0): @@ -1174,7 +1177,10 @@ def _check_label_or_level_ambiguity(self, key, axis=0): "_check_label_or_level_ambiguity is not implemented for {type}" .format(type=type(self))) - def raise_warning(): + if (key is not None and + is_hashable(key) and + key in self.axes[axis].names and + any(key in self.axes[ax] for ax in other_axes)): # Build an informative and grammatical warning level_article, level_type = (('an', 'index') From f7e04f56b02ba1f17b4f54770ae9fa1f4c4abfc6 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Sat, 25 Nov 2017 21:12:11 -0500 Subject: [PATCH 29/34] Reduce parameterized test case count by removing how fixture Now we parameterize the how parameter alongside on. Reduces test-case count by a factor of 4. 
--- .../reshape/test_merge_index_as_string.py | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/pandas/tests/reshape/test_merge_index_as_string.py b/pandas/tests/reshape/test_merge_index_as_string.py index b6b0a59b806af..afd21e96a0076 100644 --- a/pandas/tests/reshape/test_merge_index_as_string.py +++ b/pandas/tests/reshape/test_merge_index_as_string.py @@ -45,11 +45,6 @@ def right_df(request, df2): return df2 -@pytest.fixture(params=['inner', 'left', 'right', 'outer']) -def how(request): - return request.param - - def compute_expected(df_left, df_right, on=None, left_on=None, right_on=None, how=None): """ @@ -125,10 +120,11 @@ def compute_expected(df_left, df_right, return expected -@pytest.mark.parametrize('on', - [['outer'], ['inner'], - ['outer', 'inner'], - ['inner', 'outer']]) +@pytest.mark.parametrize('on,how', + [(['outer'], 'inner'), + (['inner'], 'left'), + (['outer', 'inner'], 'right'), + (['inner', 'outer'], 'outer')]) def test_merge_indexes_and_columns_on(left_df, right_df, on, how): # Construct expected result @@ -139,10 +135,11 @@ def test_merge_indexes_and_columns_on(left_df, right_df, on, how): assert_frame_equal(result, expected, check_like=True) -@pytest.mark.parametrize('left_on,right_on', - [(['outer'], ['outer']), (['inner'], ['inner']), - (['outer', 'inner'], ['outer', 'inner']), - (['inner', 'outer'], ['inner', 'outer'])]) +@pytest.mark.parametrize('left_on,right_on,how', + [(['outer'], ['outer'], 'inner'), + (['inner'], ['inner'], 'right'), + (['outer', 'inner'], ['outer', 'inner'], 'left'), + (['inner', 'outer'], ['inner', 'outer'], 'outer')]) def test_merge_indexes_and_columns_lefton_righton( left_df, right_df, left_on, right_on, how): From cf8e654b1c9532c2fd2a126aa9131a7aeda65594 Mon Sep 17 00:00:00 2001 From: "Jon M. 
Mease" Date: Sat, 25 Nov 2017 21:13:35 -0500 Subject: [PATCH 30/34] Refactor warning code and add stacklevel --- pandas/core/generic.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b359c1db20ae8..876349d856dea 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1191,22 +1191,17 @@ def _check_label_or_level_ambiguity(self, key, axis=0): if axis == 0 else ('an', 'index')) - warnings.warn( - ("'{key}' is both {level_article} {level_type} level and " - "{label_article} {label_type} label.\n" - "Defaulting to {label_type}, but this will raise an " - "ambiguity error in a future version" - ).format(key=key, - level_article=level_article, - level_type=level_type, - label_article=label_article, - label_type=label_type), FutureWarning) - - if (isinstance(key, compat.string_types) and - key in self.axes[axis].names and - any(key in self.axes[ax] for ax in other_axes)): + msg = ("'{key}' is both {level_article} {level_type} level and " + "{label_article} {label_type} label.\n" + "Defaulting to {label_type}, but this will raise an " + "ambiguity error in a future version" + ).format(key=key, + level_article=level_article, + level_type=level_type, + label_article=label_article, + label_type=label_type) - raise_warning() + warnings.warn(msg, FutureWarning, stacklevel=2) return True else: return False From e874f04d80b6fa72270ed7dd59059975d15539e4 Mon Sep 17 00:00:00 2001 From: "Jon M. 
Mease" Date: Sat, 25 Nov 2017 21:16:16 -0500 Subject: [PATCH 31/34] Use single backticks to reference method params in docstrings Rename vector -> array --- pandas/core/frame.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9adcaf56987f8..88bd6123affbb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -148,16 +148,16 @@ join; preserve the order of the left keys on : label or list Column or index level names to join on. These must be found in both - DataFrames. If on is None and not merging on indexes then this defaults to - the intersection of the columns in both DataFrames. + DataFrames. If `on` is None and not merging on indexes then this defaults + to the intersection of the columns in both DataFrames. left_on : label or list, or array-like Column or index level names to join on in the left DataFrame. Can also - be a vector or list of vectors of the length of the left DataFrame. - These vectors are treated as though they are columns. + be an array or list of arrays of the length of the left DataFrame. + These arrays are treated as if they are columns. right_on : label or list, or array-like Column or index level names to join on in the right DataFrame. Can also - be a vector or list of vectors of the length of the right DataFrame. - These vectors are treated as though they are columns. + be an array or list of arrays of the length of the right DataFrame. + These arrays are treated as if they are columns. left_index : boolean, default False Use the index from the left DataFrame as the join key(s). If it is a MultiIndex, the number of keys in the other DataFrame (either the index @@ -198,8 +198,8 @@ Note ---- -Support for specifying index levels as the ``on``, ``left_on``, and -``right_on`` parameters was added in version 0.22.0. 
+Support for specifying index levels as the `on`, `left_on`, and +`right_on` parameters was added in version 0.22.0 Examples -------- From 13ce87cf20ccc370ae043b354b2208ccae058725 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Sat, 25 Nov 2017 21:21:26 -0500 Subject: [PATCH 32/34] Add tests and docstring updates for using index levels as `on` param to df.join Fixed errors that cropped up when using join on a combination of columns and index levels --- pandas/core/frame.py | 15 ++++++----- pandas/core/reshape/merge.py | 16 +++++++++--- .../reshape/test_merge_index_as_string.py | 25 +++++++++++++++++++ 3 files changed, 47 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 88bd6123affbb..dce80f359ddc1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5202,12 +5202,12 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', Index should be similar to one of the columns in this one. If a Series is passed, its name attribute must be set, and that will be used as the column name in the resulting joined DataFrame - on : column name, tuple/list of column names, or array-like - Column(s) in the caller to join on the index in other, - otherwise joins index-on-index. If multiples - columns given, the passed DataFrame must have a MultiIndex. Can - pass an array as the join key if not already contained in the - calling DataFrame. Like an Excel VLOOKUP operation + on : name, tuple/list of names, or array-like + Column or index level name(s) in the caller to join on the index + in `other`, otherwise joins index-on-index. If multiple + values are given, the `other` DataFrame must have a MultiIndex. Can + pass an array as the join key if it is not already contained in + the calling DataFrame. Like an Excel VLOOKUP operation. how : {'left', 'right', 'outer', 'inner'}, default: 'left' How to handle the operation of the two objects.
@@ -5232,6 +5232,9 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', on, lsuffix, and rsuffix options are not supported when passing a list of DataFrame objects + Support for specifying index levels as the `on` parameter was added + in version 0.22.0 + Examples -------- >>> caller = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index b0798d4abbfc9..ec30e32f7f374 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -671,8 +671,9 @@ def _maybe_restore_index_levels(self, result): for name, left_key, right_key in zip(self.join_names, self.left_on, self.right_on): - if self.orig_left._is_level_reference(left_key) and \ - self.orig_right._is_level_reference(right_key): + if (self.orig_left._is_level_reference(left_key) and + self.orig_right._is_level_reference(right_key) and + name not in result.index.names): names_to_restore.append(name) @@ -748,8 +749,17 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): else: key_col = Index(lvals).where(~mask, rvals) - if name in result: + if result._is_label_reference(name): result[name] = key_col + elif result._is_level_reference(name): + if isinstance(result.index, MultiIndex): + idx_list = [result.index.get_level_values(level_name) + if level_name != name else key_col + for level_name in result.index.names] + + result.set_index(idx_list, inplace=True) + else: + result.index = Index(key_col, name=name) else: result.insert(i, name or 'key_{i}'.format(i=i), key_col) diff --git a/pandas/tests/reshape/test_merge_index_as_string.py b/pandas/tests/reshape/test_merge_index_as_string.py index afd21e96a0076..4c638f8e441fa 100644 --- a/pandas/tests/reshape/test_merge_index_as_string.py +++ b/pandas/tests/reshape/test_merge_index_as_string.py @@ -155,6 +155,31 @@ def test_merge_indexes_and_columns_lefton_righton( assert_frame_equal(result, expected, check_like=True) 
+@pytest.mark.parametrize('left_index', + ['inner', ['inner', 'outer']]) +@pytest.mark.parametrize('how', + ['inner', 'left', 'right', 'outer']) +def test_join_indexes_and_columns_on(df1, df2, left_index, how): + + # Construct left_df + left_df = df1.set_index(left_index) + + # Construct right_df + right_df = df2.set_index(['outer', 'inner']) + + # Result + expected = (left_df.reset_index() + .join(right_df, on=['outer', 'inner'], how=how, + lsuffix='_x', rsuffix='_y') + .set_index(left_index)) + + # Perform join + result = left_df.join(right_df, on=['outer', 'inner'], how=how, + lsuffix='_x', rsuffix='_y') + + assert_frame_equal(result, expected, check_like=True) + + def test_merge_index_column_precedence(df1, df2): # Construct left_df with both an index and a column named 'outer'. From b5cb4c1568eb994f44361c16f42df0b8529b4a23 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Sat, 25 Nov 2017 21:23:17 -0500 Subject: [PATCH 33/34] PEP8 --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index dce80f359ddc1..a726fb3eeecdb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -148,7 +148,7 @@ join; preserve the order of the left keys on : label or list Column or index level names to join on. These must be found in both - DataFrames. If `on` is None and not merging on indexes then this defaults + DataFrames. If `on` is None and not merging on indexes then this defaults to the intersection of the columns in both DataFrames. left_on : label or list, or array-like Column or index level names to join on in the left DataFrame. Can also From f3b95feba9a9b55febae333ebaec5239f6af63b4 Mon Sep 17 00:00:00 2001 From: "Jon M. 
Mease" Date: Thu, 30 Nov 2017 20:43:57 -0500 Subject: [PATCH 34/34] Fixed Note->Notes in docstring --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a726fb3eeecdb..e28d14aacdce3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -196,8 +196,8 @@ .. versionadded:: 0.21.0 -Note ----- +Notes +----- Support for specifying index levels as the `on`, `left_on`, and `right_on` parameters was added in version 0.22.0