pandas-dev · jreback · May 26, 2020 · Apr 14, 2020 · Apr 14, 2020 · May 22, 2020
diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
@@ -63,6 +63,29 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created
    s
    s.astype("string")
 
+
+.. versionchanged:: 1.1.0
+
+You can also use :class:`StringDtype`/``"string"`` as the dtype on non-string data and
+it will be converted to ``string`` dtype:
+
+.. ipython:: python
+
+   s = pd.Series(['a', 2, np.nan], dtype="string")
+   s
+   type(s[1])
+
+or convert from existing pandas data:
+
+.. ipython:: python
+
+   s1 = pd.Series([1, 2, np.nan], dtype="Int64")
+   s1
+   s2 = s1.astype("string")
+   s2
+   type(s2[0])
+
+
 .. _text.differences:
 
 Behavior differences

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -13,6 +13,24 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
+.. _whatsnew_110.astype_string:
+
+All dtypes can now be converted to ``StringDtype``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Previously, declaring or converting to :class:`StringDtype` was in general only possible if the data was already only ``str`` or nan-like (:issue:`31204`).
+:class:`StringDtype` now works in all situations where ``astype(str)`` or ``dtype=str`` work.
+
+For example, the below now works:
+
+.. ipython:: python
+
+   ser = pd.Series([1, "abc", np.nan], dtype="string")
+   ser
+   ser[0]
+   pd.Series([1, 2, np.nan], dtype="Int64").astype("string")
+
+
 .. _whatsnew_110.period_index_partial_string_slicing:
 
 Nonmonotonic PeriodIndex Partial String Slicing

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -20,7 +20,7 @@
 from pandas.util._validators import validate_fillna_kwargs
 
 from pandas.core.dtypes.cast import maybe_cast_to_extension_array
-from pandas.core.dtypes.common import is_array_like, is_list_like
+from pandas.core.dtypes.common import is_array_like, is_list_like, pandas_dtype
 from pandas.core.dtypes.dtypes import ExtensionDtype
 from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
 from pandas.core.dtypes.missing import isna
@@ -178,7 +178,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
         ----------
         scalars : Sequence
             Each element will be an instance of the scalar type for this
-            array, ``cls.dtype.type``.
+            array, ``cls.dtype.type`` or be converted into this type in this method.
         dtype : dtype, optional
             Construct for this particular dtype. This should be a Dtype
             compatible with the ExtensionArray.
@@ -451,6 +451,12 @@ def astype(self, dtype, copy=True):
         array : ndarray
             NumPy ndarray with 'dtype' for its dtype.
         """
+        from pandas.core.arrays.string_ import StringDtype
+
+        dtype = pandas_dtype(dtype)
+        if isinstance(dtype, StringDtype):  # allow conversion to StringArrays
+            return dtype.construct_array_type()._from_sequence(self, copy=False)
+
         return np.array(self, dtype=dtype, copy=copy)
 
     def isna(self) -> ArrayLike:

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -27,6 +27,7 @@
     is_datetime64tz_dtype,
     is_datetime_or_timedelta_dtype,
     is_dtype_equal,
+    is_extension_array_dtype,
     is_float_dtype,
     is_integer_dtype,
     is_list_like,
@@ -619,7 +620,11 @@ def astype(self, dtype, copy=True):
         if is_object_dtype(dtype):
             return self._box_values(self.asi8.ravel()).reshape(self.shape)
         elif is_string_dtype(dtype) and not is_categorical_dtype(dtype):
-            return self._format_native_types()
+            if is_extension_array_dtype(dtype):
+                arr_cls = dtype.construct_array_type()
+                return arr_cls._from_sequence(self, dtype=dtype)
+            else:
+                return self._format_native_types()
         elif is_integer_dtype(dtype):
             # we deliberately ignore int32 vs. int64 here.
             # See https://github.com/pandas-dev/pandas/issues/24381 for more.

diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -1,5 +1,5 @@
 import numbers
-from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type, Union
 import warnings
 
 import numpy as np
@@ -442,17 +442,20 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike:
             if incompatible type with an IntegerDtype, equivalent of same_kind
             casting
         """
-        from pandas.core.arrays.boolean import BooleanArray, BooleanDtype
+        from pandas.core.arrays.boolean import BooleanDtype
+        from pandas.core.arrays.string_ import StringDtype
 
         dtype = pandas_dtype(dtype)
 
         # if we are astyping to an existing IntegerDtype we can fastpath
         if isinstance(dtype, _IntegerDtype):
             result = self._data.astype(dtype.numpy_dtype, copy=False)
-            return type(self)(result, mask=self._mask, copy=False)
+            return dtype.construct_array_type()(result, mask=self._mask, copy=False)
         elif isinstance(dtype, BooleanDtype):
             result = self._data.astype("bool", copy=False)
-            return BooleanArray(result, mask=self._mask, copy=False)
+            return dtype.construct_array_type()(result, mask=self._mask, copy=False)
+        elif isinstance(dtype, StringDtype):
+            return dtype.construct_array_type()._from_sequence(self, copy=False)
 
         # coerce
         if is_float_dtype(dtype):
@@ -722,7 +725,7 @@ class UInt64Dtype(_IntegerDtype):
     __doc__ = _dtype_docstring.format(dtype="uint64")
 
 
-_dtypes = {
+_dtypes: Dict[str, _IntegerDtype] = {
     "int8": Int8Dtype(),
     "int16": Int16Dtype(),
     "int32": Int32Dtype(),

diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
@@ -680,8 +680,11 @@ def astype(self, dtype, copy=True):
         array : ExtensionArray or ndarray
             ExtensionArray or NumPy ndarray with 'dtype' for its dtype.
         """
+        from pandas.core.arrays.string_ import StringDtype
+
         if dtype is not None:
             dtype = pandas_dtype(dtype)
+
         if is_interval_dtype(dtype):
             if dtype == self.dtype:
                 return self.copy() if copy else self
@@ -698,6 +701,9 @@ def astype(self, dtype, copy=True):
             return self._shallow_copy(new_left, new_right)
         elif is_categorical_dtype(dtype):
             return Categorical(np.asarray(self))
+        elif isinstance(dtype, StringDtype):
+            return dtype.construct_array_type()._from_sequence(self, copy=False)
+
         # TODO: This try/except will be repeated.
         try:
             return np.asarray(self).astype(dtype, copy=copy)

diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py
@@ -13,6 +13,7 @@
 from pandas.core.dtypes.cast import astype_nansafe
 from pandas.core.dtypes.common import (
     is_bool_dtype,
+    is_extension_array_dtype,
     is_object_dtype,
     is_scalar,
     is_string_dtype,
@@ -322,6 +323,9 @@ def update_dtype(self, dtype):
         dtype = pandas_dtype(dtype)
 
         if not isinstance(dtype, cls):
+            if is_extension_array_dtype(dtype):
+                raise TypeError("sparse arrays of extension dtypes not supported")
+
             fill_value = astype_nansafe(np.array(self.fill_value), dtype).item()
             dtype = cls(dtype, fill_value=fill_value)
 

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -152,15 +152,21 @@ class StringArray(PandasArray):
     ['This is', 'some text', <NA>, 'data.']
     Length: 4, dtype: string
 
-    Unlike ``object`` dtype arrays, ``StringArray`` doesn't allow non-string
-    values.
+    Unlike arrays instantiated with ``dtype="object"``, ``StringArray``
+    will convert the values to strings.
 
+    >>> pd.array(['1', 1], dtype="object")
+    <PandasArray>
+    ['1', 1]
+    Length: 2, dtype: object
     >>> pd.array(['1', 1], dtype="string")
-    Traceback (most recent call last):
-    ...
-    ValueError: StringArray requires an object-dtype ndarray of strings.
+    <StringArray>
+    ['1', '1']
+    Length: 2, dtype: string
+
+    However, instantiating StringArrays directly with non-strings will raise an error.
 
-    For comparison methods, this returns a :class:`pandas.BooleanArray`
+    For comparison methods, ``StringArray`` returns a :class:`pandas.BooleanArray`:
 
     >>> pd.array(["a", None, "c"], dtype="string") == "a"
     <BooleanArray>
@@ -203,10 +209,15 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
         # TODO: it would be nice to do this in _validate / lib.is_string_array
         # We are already doing a scan over the values there.
         na_values = isna(result)
-        if na_values.any():
-            if result is scalars:
-                # force a copy now, if we haven't already
-                result = result.copy()
+        has_nans = na_values.any()
+        if has_nans and result is scalars:
+            # force a copy now, if we haven't already
+            result = result.copy()
+
+        # convert to str, then to object to avoid dtype like '<U3', then insert na_value
+        result = np.asarray(result, dtype=str)
+        result = np.asarray(result, dtype="object")
+        if has_nans:
             result[na_values] = StringDtype.na_value
 
         return cls(result)

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -337,9 +337,16 @@ def maybe_cast_to_extension_array(cls: Type["ExtensionArray"], obj, dtype=None):
     -------
     ExtensionArray or obj
     """
+    from pandas.core.arrays.string_ import StringArray
+
     assert isinstance(cls, type), f"must pass a type: {cls}"
     assertion_msg = f"must pass a subclass of ExtensionArray: {cls}"
     assert issubclass(cls, ABCExtensionArray), assertion_msg
+
+    # Everything can be converted to StringArrays, but we may not want to convert
+    if issubclass(cls, StringArray) and lib.infer_dtype(obj) != "string":
+        return obj
+
     try:
         result = cls._from_sequence(obj, dtype=dtype)
     except Exception:

diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py
@@ -33,7 +33,13 @@ def test_tolist(self, data):
 
     def test_astype_str(self, data):
         result = pd.Series(data[:5]).astype(str)
-        expected = pd.Series(data[:5].astype(str))
+        expected = pd.Series([str(x) for x in data[:5]], dtype=str)
+        self.assert_series_equal(result, expected)
+
+    def test_astype_string(self, data):
+        # GH-33465
+        result = pd.Series(data[:5]).astype("string")
+        expected = pd.Series([str(x) for x in data[:5]], dtype="string")
         self.assert_series_equal(result, expected)
 
     def test_to_numpy(self, data):

diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py
@@ -7,6 +7,7 @@
 import numpy as np
 
 from pandas.core.dtypes.base import ExtensionDtype
+from pandas.core.dtypes.common import pandas_dtype
 
 import pandas as pd
 from pandas.api.extensions import no_default, register_extension_dtype
@@ -130,9 +131,11 @@ def copy(self):
         return type(self)(self._data.copy())
 
     def astype(self, dtype, copy=True):
+        dtype = pandas_dtype(dtype)
         if isinstance(dtype, type(self.dtype)):
             return type(self)(self._data, context=dtype.context)
-        return np.asarray(self, dtype=dtype)
+
+        return super().astype(dtype, copy=copy)
 
     def __setitem__(self, key, value):
         if pd.api.types.is_list_like(value):

diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py
@@ -21,6 +21,8 @@
 
 import numpy as np
 
+from pandas.core.dtypes.common import pandas_dtype
+
 import pandas as pd
 from pandas.api.extensions import ExtensionArray, ExtensionDtype
 
@@ -160,12 +162,18 @@ def astype(self, dtype, copy=True):
         # NumPy has issues when all the dicts are the same length.
         # np.array([UserDict(...), UserDict(...)]) fails,
         # but np.array([{...}, {...}]) works, so cast.
+        from pandas.core.arrays.string_ import StringDtype
 
+        dtype = pandas_dtype(dtype)
         # needed to add this check for the Series constructor
         if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
             if copy:
                 return self.copy()
             return self
+        elif isinstance(dtype, StringDtype):
+            value = self.astype(str)  # numpy doesn't like nested dicts
+            return dtype.construct_array_type()._from_sequence(value, copy=False)
+
         return np.array([dict(x) for x in self], dtype=dtype, copy=copy)
 
     def unique(self):

diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py
@@ -139,6 +139,12 @@ def test_astype_str(self, data):
         # ValueError: setting an array element with a sequence
         super().test_astype_str(data)
 
+    @skip_nested
+    def test_astype_string(self, data):
+        # GH-33465
+        # ValueError: setting an array element with a sequence
+        super().test_astype_string(data)
+
 
 class TestConstructors(BaseNumPyTests, base.BaseConstructorsTests):
     @pytest.mark.skip(reason="We don't register our dtype")

diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py
@@ -343,6 +343,16 @@ def test_astype_object_frame(self, all_data):
         # comp = result.dtypes.equals(df.dtypes)
         # assert not comp.any()
 
+    def test_astype_str(self, data):
+        result = pd.Series(data[:5]).astype(str)
+        expected_dtype = pd.SparseDtype(str, str(data.fill_value))
+        expected = pd.Series([str(x) for x in data[:5]], dtype=expected_dtype)
+        self.assert_series_equal(result, expected)
+
+    @pytest.mark.xfail(raises=TypeError, reason="no sparse StringDtype")
+    def test_astype_string(self, data):
+        super().test_astype_string(data)
+
 
 class TestArithmeticOps(BaseSparseTests, base.BaseArithmeticOpsTests):
     series_scalar_exc = None