diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index cc4bab8b9a923..85d087082171f 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -62,6 +62,8 @@ Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Removed the previously deprecated :meth:`Series.get_value`, :meth:`Series.set_value`, :meth:`DataFrame.get_value`, :meth:`DataFrame.set_value` (:issue:`17739`) - Changed the the default value of `inplace` in :meth:`DataFrame.set_index` and :meth:`Series.set_axis`. It now defaults to False (:issue:`27600`) +- :meth:`pandas.Series.str.cat` now defaults to aligning ``others``, using ``join='left'`` (:issue:`27611`) +- :meth:`pandas.Series.str.cat` does not accept list-likes *within* list-likes anymore (:issue:`27611`) - .. _whatsnew_1000.performance: diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 54882d039f135..a0e73172be1e4 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -21,7 +21,12 @@ is_scalar, is_string_like, ) -from pandas.core.dtypes.generic import ABCIndexClass, ABCMultiIndex, ABCSeries +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCIndexClass, + ABCMultiIndex, + ABCSeries, +) from pandas.core.dtypes.missing import isna from pandas.core.algorithms import take_1d @@ -2058,7 +2063,7 @@ def cons_row(x): cons = self._orig._constructor return cons(result, name=name, index=index) - def _get_series_list(self, others, ignore_index=False): + def _get_series_list(self, others): """ Auxiliary function for :meth:`str.cat`. 
Turn potentially mixed input into a list of Series (elements without an index must match the length @@ -2066,122 +2071,56 @@ def _get_series_list(self, others, ignore_index=False): Parameters ---------- - others : Series, Index, DataFrame, np.ndarray, list-like or list-like - of objects that are Series, Index or np.ndarray (1-dim) - ignore_index : boolean, default False - Determines whether to forcefully align others with index of caller + others : Series, Index, DataFrame, np.ndarray, list-like or list-like + of objects that are either Series, Index or np.ndarray (1-dim) Returns ------- - tuple : (others transformed into list of Series, - boolean whether FutureWarning should be raised) + list : others transformed into list of Series """ - - # Once str.cat defaults to alignment, this function can be simplified; - # will not need `ignore_index` and the second boolean output anymore - from pandas import Series, DataFrame # self._orig is either Series or Index idx = self._orig if isinstance(self._orig, ABCIndexClass) else self._orig.index - err_msg = ( - "others must be Series, Index, DataFrame, np.ndarray or " - "list-like (either containing only strings or containing " - "only objects of type Series/Index/list-like/np.ndarray)" - ) - # Generally speaking, all objects without an index inherit the index # `idx` of the calling Series/Index - i.e. must have matching length. - # Objects with an index (i.e. Series/Index/DataFrame) keep their own - # index, *unless* ignore_index is set to True. + # Objects with an index (i.e. Series/Index/DataFrame) keep their own. 
if isinstance(others, ABCSeries): - warn = not others.index.equals(idx) - # only reconstruct Series when absolutely necessary - los = [ - Series(others.values, index=idx) if ignore_index and warn else others - ] - return (los, warn) + return [others] elif isinstance(others, ABCIndexClass): - warn = not others.equals(idx) - los = [Series(others.values, index=(idx if ignore_index else others))] - return (los, warn) - elif isinstance(others, DataFrame): - warn = not others.index.equals(idx) - if ignore_index and warn: - # without copy, this could change "others" - # that was passed to str.cat - others = others.copy() - others.index = idx - return ([others[x] for x in others], warn) + return [Series(others.values, index=others)] + elif isinstance(others, ABCDataFrame): + return [others[x] for x in others] elif isinstance(others, np.ndarray) and others.ndim == 2: others = DataFrame(others, index=idx) - return ([others[x] for x in others], False) + return [others[x] for x in others] elif is_list_like(others, allow_sets=False): others = list(others) # ensure iterators do not get read twice etc # in case of list-like `others`, all elements must be - # either one-dimensional list-likes or scalars - if all(is_list_like(x, allow_sets=False) for x in others): + # either Series/Index/np.ndarray (1-dim)... + if all( + isinstance(x, (ABCSeries, ABCIndexClass)) + or (isinstance(x, np.ndarray) and x.ndim == 1) + for x in others + ): los = [] - join_warn = False - depr_warn = False - # iterate through list and append list of series for each - # element (which we check to be one-dimensional and non-nested) - while others: - nxt = others.pop(0) # nxt is guaranteed list-like by above - - # GH 21950 - DeprecationWarning - # only allowing Series/Index/np.ndarray[1-dim] will greatly - # simply this function post-deprecation. 
- if not ( - isinstance(nxt, (Series, ABCIndexClass)) - or (isinstance(nxt, np.ndarray) and nxt.ndim == 1) - ): - depr_warn = True - - if not isinstance( - nxt, (DataFrame, Series, ABCIndexClass, np.ndarray) - ): - # safety for non-persistent list-likes (e.g. iterators) - # do not map indexed/typed objects; info needed below - nxt = list(nxt) - - # known types for which we can avoid deep inspection - no_deep = ( - isinstance(nxt, np.ndarray) and nxt.ndim == 1 - ) or isinstance(nxt, (Series, ABCIndexClass)) - # nested list-likes are forbidden: - # -> elements of nxt must not be list-like - is_legal = (no_deep and nxt.dtype == object) or all( - not is_list_like(x) for x in nxt - ) - - # DataFrame is false positive of is_legal - # because "x in df" returns column names - if not is_legal or isinstance(nxt, DataFrame): - raise TypeError(err_msg) - - nxt, wnx = self._get_series_list(nxt, ignore_index=ignore_index) - los = los + nxt - join_warn = join_warn or wnx - - if depr_warn: - warnings.warn( - "list-likes other than Series, Index, or " - "np.ndarray WITHIN another list-like are " - "deprecated and will be removed in a future " - "version.", - FutureWarning, - stacklevel=4, - ) - return (los, join_warn) + while others: # iterate through list and append each element + los = los + self._get_series_list(others.pop(0)) + return los + # ... or just strings elif all(not is_list_like(x) for x in others): - return ([Series(others, index=idx)], False) - raise TypeError(err_msg) + return [Series(others, index=idx)] + raise TypeError( + "others must be Series, Index, DataFrame, np.ndarray " + "or list-like (either containing only strings or " + "containing only objects of type Series/Index/" + "np.ndarray[1-dim])" + ) @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"]) - def cat(self, others=None, sep=None, na_rep=None, join=None): + def cat(self, others=None, sep=None, na_rep=None, join="left"): """ Concatenate strings in the Series/Index with given separator. 
@@ -2215,16 +2154,15 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): - If `na_rep` is None, and `others` is not None, a row containing a missing value in any of the columns (before concatenation) will have a missing value in the result. - join : {'left', 'right', 'outer', 'inner'}, default None + join : {'left', 'right', 'outer', 'inner'}, default 'left' Determines the join-style between the calling Series/Index and any Series/Index/DataFrame in `others` (objects without an index need - to match the length of the calling Series/Index). If None, - alignment is disabled, but this option will be removed in a future - version of pandas and replaced with a default of `'left'`. To - disable alignment, use `.values` on any Series/Index/DataFrame in - `others`. + to match the length of the calling Series/Index). To disable + alignment, use `.values` on any Series/Index/DataFrame in `others`. .. versionadded:: 0.23.0 + .. versionchanged:: 1.0.0 + Changed default of `join` from None to `'left'`. Returns ------- @@ -2340,39 +2278,14 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): try: # turn anything in "others" into lists of Series - others, warn = self._get_series_list(others, ignore_index=(join is None)) + others = self._get_series_list(others) except ValueError: # do not catch TypeError raised by _get_series_list - if join is None: - raise ValueError( - "All arrays must be same length, except " - "those having an index if `join` is not None" - ) - else: - raise ValueError( - "If `others` contains arrays or lists (or " - "other list-likes without an index), these " - "must all be of the same length as the " - "calling Series/Index." - ) - - if join is None and warn: - warnings.warn( - "A future version of pandas will perform index " - "alignment when `others` is a Series/Index/" - "DataFrame (or a list-like containing one). 
To " - "disable alignment (the behavior before v.0.23) and " - "silence this warning, use `.values` on any Series/" - "Index/DataFrame in `others`. To enable alignment " - "and silence this warning, pass `join='left'|" - "'outer'|'inner'|'right'`. The future default will " - "be `join='left'`.", - FutureWarning, - stacklevel=3, + raise ValueError( + "If `others` contains arrays or lists (or other " + "list-likes without an index), these must all be " + "of the same length as the calling Series/Index." ) - # if join is None, _get_series_list already force-aligned indexes - join = "left" if join is None else join - # align if required if any(not data.index.equals(x.index) for x in others): # Need to add keys for uniqueness in case of duplicate columns diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index bc848a528f2fd..bc8dc7272a83a 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -384,7 +384,7 @@ def test_str_cat_name(self, box, other): other = other(values) else: other = values - result = box(values, name="name").str.cat(other, sep=",", join="left") + result = box(values, name="name").str.cat(other, sep=",") assert result.name == "name" @pytest.mark.parametrize("box", [Series, Index]) @@ -418,12 +418,9 @@ def test_str_cat(self, box): assert_series_or_index_equal(result, expected) # errors for incorrect lengths - rgx = "All arrays must be same length, except those having an index.*" + rgx = r"If `others` contains arrays or lists \(or other list-likes.*" z = Series(["1", "2", "3"]) - with pytest.raises(ValueError, match=rgx): - s.str.cat(z) - with pytest.raises(ValueError, match=rgx): s.str.cat(z.values) @@ -452,14 +449,12 @@ def test_str_cat_categorical(self, box, dtype_caller, dtype_target, sep): expected = Index(["ab", "aa", "bb", "ac"]) expected = expected if box == Index else Series(expected, index=s) - # Series/Index with unaligned Index - with tm.assert_produces_warning(expected_warning=FutureWarning): - 
# FutureWarning to switch to alignment by default - result = s.str.cat(t, sep=sep) - assert_series_or_index_equal(result, expected) + # Series/Index with unaligned Index -> t.values + result = s.str.cat(t.values, sep=sep) + assert_series_or_index_equal(result, expected) # Series/Index with Series having matching Index - t = Series(t, index=s) + t = Series(t.values, index=s) result = s.str.cat(t, sep=sep) assert_series_or_index_equal(result, expected) @@ -468,11 +463,14 @@ def test_str_cat_categorical(self, box, dtype_caller, dtype_target, sep): assert_series_or_index_equal(result, expected) # Series/Index with Series having different Index - t = Series(t.values, index=t) - with tm.assert_produces_warning(expected_warning=FutureWarning): - # FutureWarning to switch to alignment by default - result = s.str.cat(t, sep=sep) - assert_series_or_index_equal(result, expected) + t = Series(t.values, index=t.values) + expected = Index(["aa", "aa", "aa", "bb", "bb"]) + expected = ( + expected if box == Index else Series(expected, index=expected.str[:1]) + ) + + result = s.str.cat(t, sep=sep) + assert_series_or_index_equal(result, expected) # test integer/float dtypes (inferred by constructor) and mixed @pytest.mark.parametrize( @@ -523,55 +521,33 @@ def test_str_cat_mixed_inputs(self, box): result = s.str.cat([t, s.values]) assert_series_or_index_equal(result, expected) - # Series/Index with list of list-likes - with tm.assert_produces_warning(expected_warning=FutureWarning): - # nested list-likes will be deprecated - result = s.str.cat([t.values, list(s)]) - assert_series_or_index_equal(result, expected) - # Series/Index with list of Series; different indexes t.index = ["b", "c", "d", "a"] - with tm.assert_produces_warning(expected_warning=FutureWarning): - # FutureWarning to switch to alignment by default - result = s.str.cat([t, s]) - assert_series_or_index_equal(result, expected) + expected = box(["aDa", "bAb", "cBc", "dCd"]) + expected = expected if box == Index else 
Series(expected.values, index=s.values) + result = s.str.cat([t, s]) + assert_series_or_index_equal(result, expected) - # Series/Index with mixed list; different indexes - with tm.assert_produces_warning(expected_warning=FutureWarning): - # FutureWarning to switch to alignment by default - result = s.str.cat([t, s.values]) - assert_series_or_index_equal(result, expected) + # Series/Index with mixed list; different index + result = s.str.cat([t, s.values]) + assert_series_or_index_equal(result, expected) # Series/Index with DataFrame; different indexes d.index = ["b", "c", "d", "a"] - with tm.assert_produces_warning(expected_warning=FutureWarning): - # FutureWarning to switch to alignment by default - result = s.str.cat(d) - assert_series_or_index_equal(result, expected) - - # Series/Index with iterator of list-likes - with tm.assert_produces_warning(expected_warning=FutureWarning): - # nested list-likes will be deprecated - result = s.str.cat(iter([t.values, list(s)])) - assert_series_or_index_equal(result, expected) + expected = box(["aDd", "bAa", "cBb", "dCc"]) + expected = expected if box == Index else Series(expected.values, index=s.values) + result = s.str.cat(d) + assert_series_or_index_equal(result, expected) # errors for incorrect lengths - rgx = "All arrays must be same length, except those having an index.*" + rgx = r"If `others` contains arrays or lists \(or other list-likes.*" z = Series(["1", "2", "3"]) e = concat([z, z], axis=1) - # DataFrame - with pytest.raises(ValueError, match=rgx): - s.str.cat(e) - # two-dimensional ndarray with pytest.raises(ValueError, match=rgx): s.str.cat(e.values) - # list of Series - with pytest.raises(ValueError, match=rgx): - s.str.cat([z, s]) - # list of list-likes with pytest.raises(ValueError, match=rgx): s.str.cat([z.values, s.values]) @@ -615,6 +591,10 @@ def test_str_cat_mixed_inputs(self, box): with pytest.raises(TypeError, match=rgx): s.str.cat(1) + # nested list-likes + with pytest.raises(TypeError, match=rgx): + 
s.str.cat(iter([t.values, list(s)])) + @pytest.mark.parametrize("join", ["left", "outer", "inner", "right"]) @pytest.mark.parametrize("box", [Series, Index]) def test_str_cat_align_indexed(self, box, join): @@ -660,10 +640,9 @@ def test_str_cat_align_mixed_inputs(self, join): result = s.str.cat([t, u], join=join, na_rep="-") tm.assert_series_equal(result, expected) - with tm.assert_produces_warning(expected_warning=FutureWarning): - # nested list-likes will be deprecated - result = s.str.cat([t, list(u)], join=join, na_rep="-") - tm.assert_series_equal(result, expected) + with pytest.raises(TypeError, match="others must be Series,.*"): + # nested lists are forbidden + s.str.cat([t, list(u)], join=join) # errors for incorrect lengths rgx = r"If `others` contains arrays or lists \(or other list-likes.*"