diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index bea0f42f6849c9..4f57a7c2825cf8 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -63,6 +63,29 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created s s.astype("string") + +.. versionchanged:: 1.1.0 + +You can also use ``string`` dtype on non-string data and it will be converted to +``string`` dtype: + +.. ipython:: python + + s = pd.Series(['a', 2, np.nan], dtype="string") + s + type(s[1]) + +or convert from existing pandas data: + +.. ipython:: python + + s1 = pd.Series([1,2, np.nan], dtype="Int64") + s1 + s2 = s1.astype("string") + s2 + type(s2[0]) + + .. _text.differences: Behavior differences diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 2f4e961ff433f3..d9f826ff85a676 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -13,6 +13,32 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +.. _whatsnew_110.astype_string: + +All dtypes can now be converted to ``StringDtype`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, declaring or converting to :class:`StringDtype` was in general only possible if the data was already only ``str`` or nan-like. +For example: + +.. code-block:: ipython + + In [1]: pd.Series([1, "abc", np.nan], dtype="string") + Out[1]: ValueError: StringArray requires a sequence of strings or pandas.NA + In [2]: pd.Series([1,2, np.nan], dtype="Int64").astype("string") + Out[2]: ValueError: StringArray requires a sequence of strings or pandas.NA + +This meant that in order to convert arbitrary data to :class:`StringDtype`, you would often have to use ``.astype(str).astype('string')``, which was not intuitive. +:class:`StringDtype` now works in all situations where ``astype(str)`` or ``dtype=str`` work: + +..
ipython:: python + + ser = pd.Series([1, "abc", np.nan], dtype="string") + ser + ser[0] + pd.Series([1,2, np.nan], dtype="Int64").astype("string") + + .. _whatsnew_110.period_index_partial_string_slicing: Nonmonotonic PeriodIndex Partial String Slicing @@ -89,6 +115,7 @@ Other enhancements - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) - :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the object inside are unsortable, pass `sort=False` to suppress this warning (:issue:`33015`) - :class:`Series.dt` and :class:`DatatimeIndex` now have an `isocalendar` accessor that returns a :class:`DataFrame` with year, week, and day calculated according to the ISO 8601 calendar (:issue:`33206`). +- :meth:`Series.combine` has gained a ``dtype`` argument. If supplied, the combined series will get that dtype (:issue:`33465`) - The :meth:`DataFrame.to_feather` method now supports additional keyword arguments (e.g. to set the compression) that are added in pyarrow 0.17 (:issue:`33422`). diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 7447d593a7ff0e..1debf40d018411 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -20,7 +20,7 @@ from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.cast import maybe_cast_to_extension_array -from pandas.core.dtypes.common import is_array_like, is_list_like +from pandas.core.dtypes.common import is_array_like, is_list_like, pandas_dtype from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna @@ -176,7 +176,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): ---------- scalars : Sequence Each element will be an instance of the scalar type for this - array, ``cls.dtype.type``. + array, ``cls.dtype.type`` or be converted into this type in this method. 
dtype : dtype, optional Construct for this particular dtype. This should be a Dtype compatible with the ExtensionArray. @@ -431,6 +431,11 @@ def astype(self, dtype, copy=True): array : ndarray NumPy ndarray with 'dtype' for its dtype. """ + from pandas.core.arrays.string_ import StringDtype + + dtype = pandas_dtype(dtype) + if isinstance(dtype, StringDtype): + return dtype.construct_array_type()._from_sequence(self, copy=False) return np.array(self, dtype=dtype, copy=copy) def isna(self) -> ArrayLike: diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 5605b3fbc5dfaa..a6095649903d10 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -1,5 +1,5 @@ import numbers -from typing import TYPE_CHECKING, Tuple, Type, Union +from typing import TYPE_CHECKING, Dict, Tuple, Type, Union import warnings import numpy as np @@ -449,17 +449,20 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: if incompatible type with an IntegerDtype, equivalent of same_kind casting """ - from pandas.core.arrays.boolean import BooleanArray, BooleanDtype + from pandas.core.arrays.boolean import BooleanDtype + from pandas.core.arrays.string_ import StringDtype dtype = pandas_dtype(dtype) # if we are astyping to an existing IntegerDtype we can fastpath if isinstance(dtype, _IntegerDtype): result = self._data.astype(dtype.numpy_dtype, copy=False) - return type(self)(result, mask=self._mask, copy=False) + return dtype.construct_array_type()(result, mask=self._mask, copy=False) elif isinstance(dtype, BooleanDtype): result = self._data.astype("bool", copy=False) - return BooleanArray(result, mask=self._mask, copy=False) + return dtype.construct_array_type()(result, mask=self._mask, copy=False) + elif isinstance(dtype, StringDtype): + return dtype.construct_array_type()._from_sequence(self, copy=False) # coerce if is_float_dtype(dtype): @@ -748,7 +751,7 @@ class UInt64Dtype(_IntegerDtype): __doc__ = _dtype_docstring.format(dtype="uint64") 
-_dtypes = { +_dtypes: Dict[str, _IntegerDtype] = { "int8": Int8Dtype(), "int16": Int16Dtype(), "int32": Int32Dtype(), diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 220b70ff71b289..6397eb7431493d 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -678,6 +678,8 @@ def astype(self, dtype, copy=True): array : ExtensionArray or ndarray ExtensionArray or NumPy ndarray with 'dtype' for its dtype. """ + from pandas.core.arrays.string_ import StringDtype + dtype = pandas_dtype(dtype) if is_interval_dtype(dtype): if dtype == self.dtype: @@ -695,6 +697,9 @@ def astype(self, dtype, copy=True): return self._shallow_copy(new_left, new_right) elif is_categorical_dtype(dtype): return Categorical(np.asarray(self)) + elif isinstance(dtype, StringDtype): + return dtype.construct_array_type()._from_sequence(self, copy=False) + # TODO: This try/except will be repeated. try: return np.asarray(self).astype(dtype, copy=copy) diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py index afa11586fda043..a30cfd02928302 100644 --- a/pandas/core/arrays/sparse/dtype.py +++ b/pandas/core/arrays/sparse/dtype.py @@ -320,7 +320,7 @@ def update_dtype(self, dtype): dtype = pandas_dtype(dtype) if not isinstance(dtype, cls): - fill_value = astype_nansafe(np.array(self.fill_value), dtype).item() + fill_value = astype_nansafe(np.array([self.fill_value]), dtype)[0] dtype = cls(dtype, fill_value=fill_value) return dtype diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index dbca8e74f5e1b2..39b08e2639fc45 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -203,10 +203,14 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): # TODO: it would be nice to do this in _validate / lib.is_string_array # We are already doing a scan over the values there. 
na_values = isna(result) - if na_values.any(): - if result is scalars: - # force a copy now, if we haven't already - result = result.copy() + has_nans = na_values.any() + if has_nans and result is scalars: + # force a copy now, if we haven't already + result = result.copy() + # convert to str, then to object to avoid dtype like ' "Series": + def combine(self, other, func, fill_value=None, dtype=None) -> "Series": """ Combine the Series with a Series or scalar according to `func`. @@ -2682,6 +2682,11 @@ def combine(self, other, func, fill_value=None) -> "Series": The value to assume when an index is missing from one Series or the other. The default specifies to use the appropriate NaN value for the underlying dtype of the Series. + dtype : str, numpy.dtype, or ExtensionDtype, optional + Data type for the output Series. If not specified, this will be + inferred from the combined data. + + .. versionadded:: 1.1.0 Returns ------- @@ -2752,6 +2757,10 @@ def combine(self, other, func, fill_value=None) -> "Series": new_values = [func(lv, other) for lv in self._values] new_name = self.name + if dtype is not None: + return self._constructor( + new_values, index=new_index, name=new_name, dtype=dtype + ) if is_categorical_dtype(self.dtype): pass elif is_extension_array_dtype(self.dtype): diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index cb3a70e934dcb1..9ad4011cba43b9 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -529,7 +529,6 @@ def test_astype_all(self, any_real_dtype): np.array([0, 1], dtype="datetime64[ns]"), dtype=SparseDtype("datetime64[ns]", pd.Timestamp("1970")), ), - marks=[pytest.mark.xfail(reason="NumPy-7619")], ), ( SparseArray([0, 1, 10]), diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index f33f960e8e3419..f7dae9f460d777 100644 --- a/pandas/tests/extension/base/casting.py +++ 
b/pandas/tests/extension/base/casting.py @@ -36,6 +36,11 @@ def test_astype_str(self, data): expected = pd.Series(data[:5].astype(str)) self.assert_series_equal(result, expected) + def test_astype_string(self, data): + result = pd.Series(data[:5]).astype("string") + expected = pd.Series(data[:5].astype("string")) + self.assert_series_equal(result, expected) + def test_to_numpy(self, data): expected = np.asarray(data) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 22e53dbc89f015..3d3cf13bda7528 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -172,15 +172,16 @@ def test_combine_le(self, data_repeated): orig_data1, orig_data2 = data_repeated(2) s1 = pd.Series(orig_data1) s2 = pd.Series(orig_data2) - result = s1.combine(s2, lambda x1, x2: x1 <= x2) + result = s1.combine(s2, lambda x1, x2: x1 <= x2, dtype="boolean") expected = pd.Series( - [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))] + [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))], + dtype="boolean", ) self.assert_series_equal(result, expected) val = s1.iloc[0] - result = s1.combine(val, lambda x1, x2: x1 <= x2) - expected = pd.Series([a <= val for a in list(orig_data1)]) + result = s1.combine(val, lambda x1, x2: x1 <= x2, dtype="boolean") + expected = pd.Series([a <= val for a in list(orig_data1)], dtype="boolean") self.assert_series_equal(result, expected) def test_combine_add(self, data_repeated): diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 85d8ad6ec6e380..fb55c5ae039252 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -7,6 +7,7 @@ import numpy as np from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.common import pandas_dtype import pandas as pd from pandas.api.extensions import no_default, register_extension_dtype @@ -130,8 +131,13 @@ def 
copy(self): return type(self)(self._data.copy()) def astype(self, dtype, copy=True): + from pandas.core.arrays.string_ import StringDtype + + dtype = pandas_dtype(dtype) if isinstance(dtype, type(self.dtype)): return type(self)(self._data, context=dtype.context) + elif isinstance(dtype, StringDtype): + return dtype.construct_array_type()._from_sequence(self, copy=False) return np.asarray(self, dtype=dtype) def __setitem__(self, key, value): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 1f026e405dc173..4c8f69e28779f5 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -21,6 +21,8 @@ import numpy as np +from pandas.core.dtypes.common import pandas_dtype + import pandas as pd from pandas.api.extensions import ExtensionArray, ExtensionDtype @@ -154,12 +156,18 @@ def astype(self, dtype, copy=True): # NumPy has issues when all the dicts are the same length. # np.array([UserDict(...), UserDict(...)]) fails, # but np.array([{...}, {...}]) works, so cast. 
+ from pandas.core.arrays.string_ import StringDtype + dtype = pandas_dtype(dtype) # needed to add this check for the Series constructor if isinstance(dtype, type(self.dtype)) and dtype == self.dtype: if copy: return self.copy() return self + elif isinstance(dtype, StringDtype): + value = self.astype(str) # numpy doesn't like nested dicts + return dtype.construct_array_type()._from_sequence(value, copy=False) + return np.array([dict(x) for x in self], dtype=dtype, copy=copy) def unique(self): diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index aa5a99282131ab..29a9b48fba3464 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -139,6 +139,11 @@ def test_astype_str(self, data): # ValueError: setting an array element with a sequence super().test_astype_str(data) + @skip_nested + def test_astype_string(self, data): + # ValueError: setting an array element with a sequence + super().test_astype_string(data) + class TestConstructors(BaseNumPyTests, base.BaseConstructorsTests): @pytest.mark.skip(reason="We don't register our dtype")