diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index bea0f42f6849c..3408b98b3179d 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -63,6 +63,29 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created s s.astype("string") + +.. versionchanged:: 1.1.0 + +You can also use :class:`StringDtype`/``"string"`` as the dtype on non-string data and +it will be converted to ``string`` dtype: + +.. ipython:: python + + s = pd.Series(['a', 2, np.nan], dtype="string") + s + type(s[1]) + +or convert from existing pandas data: + +.. ipython:: python + + s1 = pd.Series([1, 2, np.nan], dtype="Int64") + s1 + s2 = s1.astype("string") + s2 + type(s2[0]) + + .. _text.differences: Behavior differences diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 20e2cce1a3dfa..64fdf5b2244cb 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -13,6 +13,24 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +.. _whatsnew_110.astype_string: + +All dtypes can now be converted to ``StringDtype`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, declaring or converting to :class:`StringDtype` was in general only possible if the data was already only ``str`` or nan-like (:issue:`31204`). +:class:`StringDtype` now works in all situations where ``astype(str)`` or ``dtype=str`` work: + +For example, the below now works: + +.. ipython:: python + + ser = pd.Series([1, "abc", np.nan], dtype="string") + ser + ser[0] + pd.Series([1, 2, np.nan], dtype="Int64").astype("string") + + .. 
_whatsnew_110.period_index_partial_string_slicing: Nonmonotonic PeriodIndex Partial String Slicing diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index fb9e2f6732018..b5e917bafca7e 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -20,7 +20,7 @@ from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.cast import maybe_cast_to_extension_array -from pandas.core.dtypes.common import is_array_like, is_list_like +from pandas.core.dtypes.common import is_array_like, is_list_like, pandas_dtype from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna @@ -178,7 +178,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): ---------- scalars : Sequence Each element will be an instance of the scalar type for this - array, ``cls.dtype.type``. + array, ``cls.dtype.type`` or be converted into this type in this method. dtype : dtype, optional Construct for this particular dtype. This should be a Dtype compatible with the ExtensionArray. @@ -451,6 +451,12 @@ def astype(self, dtype, copy=True): array : ndarray NumPy ndarray with 'dtype' for its dtype. 
""" + from pandas.core.arrays.string_ import StringDtype + + dtype = pandas_dtype(dtype) + if isinstance(dtype, StringDtype): # allow conversion to StringArrays + return dtype.construct_array_type()._from_sequence(self, copy=False) + return np.array(self, dtype=dtype, copy=copy) def isna(self) -> ArrayLike: diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 3c1602344c314..cf3cde155a3bb 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -27,6 +27,7 @@ is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_dtype_equal, + is_extension_array_dtype, is_float_dtype, is_integer_dtype, is_list_like, @@ -619,7 +620,11 @@ def astype(self, dtype, copy=True): if is_object_dtype(dtype): return self._box_values(self.asi8.ravel()).reshape(self.shape) elif is_string_dtype(dtype) and not is_categorical_dtype(dtype): - return self._format_native_types() + if is_extension_array_dtype(dtype): + arr_cls = dtype.construct_array_type() + return arr_cls._from_sequence(self, dtype=dtype) + else: + return self._format_native_types() elif is_integer_dtype(dtype): # we deliberately ignore int32 vs. int64 here. # See https://github.com/pandas-dev/pandas/issues/24381 for more. 
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 59954f548fd33..d4137f9666946 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -1,5 +1,5 @@ import numbers -from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type, Union import warnings import numpy as np @@ -442,17 +442,20 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: if incompatible type with an IntegerDtype, equivalent of same_kind casting """ - from pandas.core.arrays.boolean import BooleanArray, BooleanDtype + from pandas.core.arrays.boolean import BooleanDtype + from pandas.core.arrays.string_ import StringDtype dtype = pandas_dtype(dtype) # if we are astyping to an existing IntegerDtype we can fastpath if isinstance(dtype, _IntegerDtype): result = self._data.astype(dtype.numpy_dtype, copy=False) - return type(self)(result, mask=self._mask, copy=False) + return dtype.construct_array_type()(result, mask=self._mask, copy=False) elif isinstance(dtype, BooleanDtype): result = self._data.astype("bool", copy=False) - return BooleanArray(result, mask=self._mask, copy=False) + return dtype.construct_array_type()(result, mask=self._mask, copy=False) + elif isinstance(dtype, StringDtype): + return dtype.construct_array_type()._from_sequence(self, copy=False) # coerce if is_float_dtype(dtype): @@ -722,7 +725,7 @@ class UInt64Dtype(_IntegerDtype): __doc__ = _dtype_docstring.format(dtype="uint64") -_dtypes = { +_dtypes: Dict[str, _IntegerDtype] = { "int8": Int8Dtype(), "int16": Int16Dtype(), "int32": Int32Dtype(), diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index c5366884fbdfe..c861d25afd13f 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -680,8 +680,11 @@ def astype(self, dtype, copy=True): array : ExtensionArray or ndarray ExtensionArray or NumPy ndarray with 'dtype' for its dtype. 
""" + from pandas.core.arrays.string_ import StringDtype + if dtype is not None: dtype = pandas_dtype(dtype) + if is_interval_dtype(dtype): if dtype == self.dtype: return self.copy() if copy else self @@ -698,6 +701,9 @@ def astype(self, dtype, copy=True): return self._shallow_copy(new_left, new_right) elif is_categorical_dtype(dtype): return Categorical(np.asarray(self)) + elif isinstance(dtype, StringDtype): + return dtype.construct_array_type()._from_sequence(self, copy=False) + # TODO: This try/except will be repeated. try: return np.asarray(self).astype(dtype, copy=copy) diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py index a9090570e64a9..8d17ed412f6b4 100644 --- a/pandas/core/arrays/sparse/dtype.py +++ b/pandas/core/arrays/sparse/dtype.py @@ -13,6 +13,7 @@ from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( is_bool_dtype, + is_extension_array_dtype, is_object_dtype, is_scalar, is_string_dtype, @@ -322,6 +323,9 @@ def update_dtype(self, dtype): dtype = pandas_dtype(dtype) if not isinstance(dtype, cls): + if is_extension_array_dtype(dtype): + raise TypeError("sparse arrays of extension dtypes not supported") + fill_value = astype_nansafe(np.array(self.fill_value), dtype).item() dtype = cls(dtype, fill_value=fill_value) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 537b1cf3dd439..ac501a8afbe09 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -152,15 +152,21 @@ class StringArray(PandasArray): ['This is', 'some text', , 'data.'] Length: 4, dtype: string - Unlike ``object`` dtype arrays, ``StringArray`` doesn't allow non-string - values. + Unlike arrays instantiated with ``dtype="object"``, ``StringArray`` + will convert the values to strings. + >>> pd.array(['1', 1], dtype="object") + + ['1', 1] + Length: 2, dtype: object >>> pd.array(['1', 1], dtype="string") - Traceback (most recent call last): - ... 
- ValueError: StringArray requires an object-dtype ndarray of strings. + <StringArray> + ['1', '1'] + Length: 2, dtype: string + + However, instantiating StringArrays directly with non-strings will raise an error. - For comparison methods, this returns a :class:`pandas.BooleanArray` + For comparison methods, ``StringArray`` returns a :class:`pandas.BooleanArray`: >>> pd.array(["a", None, "c"], dtype="string") == "a" @@ -203,10 +209,15 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): # TODO: it would be nice to do this in _validate / lib.is_string_array # We are already doing a scan over the values there. na_values = isna(result) - if na_values.any(): - if result is scalars: - # force a copy now, if we haven't already - result = result.copy() + has_nans = na_values.any() + if has_nans and result is scalars: + # force a copy now, if we haven't already + result = result.copy() + + # convert to str, then to object to avoid dtype like '