Merge branch 'astype_string' of https://github.com/topper-123/pandas …

…into astype_string
pandas-dev · Apr 14, 2020 · 2a835e4 · 2a835e4
2 parents 556e1c2 + 59e9c3b
commit 2a835e4
Show file tree

Hide file tree

Showing 14 changed files with 116 additions and 18 deletions.
diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
@@ -63,6 +63,27 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created
    s
    s.astype("string")
 
+
+.. versionchanged:: 1.1.0
+
+You can also use ``string`` dtype on non-string data and it will be converted to
+``string`` dtype:
+
+.. ipython:: python
+
+   s = pd.Series(['a', 2, np.nan], dtype="string")
+   s
+   type(s[1])
+
+or convert from existing pandas data:
+
+.. ipython:: python
+
+   s1 = pd.Series([1,2, np.nan], dtype="Int64")
+   s1
+   s2 = s1.astype("string")
+   s2
+   type(s2[0])
+
+
 .. _text.differences:
 
 Behavior differences

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -13,6 +13,32 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
+.. _whatsnew_110.astype_string:
+
+All dtypes can now be converted to ``StringDtype``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Previously, declaring or converting to :class:`StringDtype` was generally only possible if the data already consisted solely of ``str`` or nan-like values.
+For example:
+
+.. code-block:: ipython
+
+    In [1]: pd.Series([1, "abc", np.nan], dtype="string")
+    Out[1]: ValueError: StringArray requires a sequence of strings or pandas.NA
+    In [2]: pd.Series([1,2, np.nan], dtype="Int64").astype("string")
+    Out[2]: ValueError: StringArray requires a sequence of strings or pandas.NA
+
+This meant that in order to convert arbitrary data to :class:`StringDtype`, you would often have to use ``.astype(str).astype('string')``, which was not intuitive.
+:class:`StringDtype` now works in all situations where ``astype(str)`` or ``dtype=str`` work:
+
+.. ipython:: python
+
+   ser = pd.Series([1, "abc", np.nan], dtype="string")
+   ser
+   ser[0]
+   pd.Series([1,2, np.nan], dtype="Int64").astype("string")
+
+
 .. _whatsnew_110.period_index_partial_string_slicing:
 
 Nonmonotonic PeriodIndex Partial String Slicing
@@ -89,6 +115,7 @@ Other enhancements
 - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`)
 - :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the objects inside are unsortable; pass `sort=False` to suppress this warning (:issue:`33015`)
 - :class:`Series.dt` and :class:`DatetimeIndex` now have an `isocalendar` accessor that returns a :class:`DataFrame` with year, week, and day calculated according to the ISO 8601 calendar (:issue:`33206`).
+- :meth:`Series.combine` has gained a ``dtype`` argument. If supplied, the combined series will get that dtype (:issue:`33465`)
 - The :meth:`DataFrame.to_feather` method now supports additional keyword
   arguments (e.g. to set the compression) that are added in pyarrow 0.17
   (:issue:`33422`).

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -20,7 +20,7 @@
 from pandas.util._validators import validate_fillna_kwargs
 
 from pandas.core.dtypes.cast import maybe_cast_to_extension_array
-from pandas.core.dtypes.common import is_array_like, is_list_like
+from pandas.core.dtypes.common import is_array_like, is_list_like, pandas_dtype
 from pandas.core.dtypes.dtypes import ExtensionDtype
 from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
 from pandas.core.dtypes.missing import isna
@@ -176,7 +176,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
         ----------
         scalars : Sequence
             Each element will be an instance of the scalar type for this
-            array, ``cls.dtype.type``.
+            array, ``cls.dtype.type`` or be converted into this type in this method.
         dtype : dtype, optional
             Construct for this particular dtype. This should be a Dtype
             compatible with the ExtensionArray.
@@ -431,6 +431,11 @@ def astype(self, dtype, copy=True):
         array : ndarray
             NumPy ndarray with 'dtype' for its dtype.
         """
+        from pandas.core.arrays.string_ import StringDtype
+
+        dtype = pandas_dtype(dtype)
+        if isinstance(dtype, StringDtype):
+            return dtype.construct_array_type()._from_sequence(self, copy=False)
         return np.array(self, dtype=dtype, copy=copy)
 
     def isna(self) -> ArrayLike:

diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -1,5 +1,5 @@
 import numbers
-from typing import TYPE_CHECKING, Tuple, Type, Union
+from typing import TYPE_CHECKING, Dict, Tuple, Type, Union
 import warnings
 
 import numpy as np
@@ -449,17 +449,20 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike:
             if incompatible type with an IntegerDtype, equivalent of same_kind
             casting
         """
-        from pandas.core.arrays.boolean import BooleanArray, BooleanDtype
+        from pandas.core.arrays.boolean import BooleanDtype
+        from pandas.core.arrays.string_ import StringDtype
 
         dtype = pandas_dtype(dtype)
 
         # if we are astyping to an existing IntegerDtype we can fastpath
         if isinstance(dtype, _IntegerDtype):
             result = self._data.astype(dtype.numpy_dtype, copy=False)
-            return type(self)(result, mask=self._mask, copy=False)
+            return dtype.construct_array_type()(result, mask=self._mask, copy=False)
         elif isinstance(dtype, BooleanDtype):
             result = self._data.astype("bool", copy=False)
-            return BooleanArray(result, mask=self._mask, copy=False)
+            return dtype.construct_array_type()(result, mask=self._mask, copy=False)
+        elif isinstance(dtype, StringDtype):
+            return dtype.construct_array_type()._from_sequence(self, copy=False)
 
         # coerce
         if is_float_dtype(dtype):
@@ -748,7 +751,7 @@ class UInt64Dtype(_IntegerDtype):
     __doc__ = _dtype_docstring.format(dtype="uint64")
 
 
-_dtypes = {
+_dtypes: Dict[str, _IntegerDtype] = {
     "int8": Int8Dtype(),
     "int16": Int16Dtype(),
     "int32": Int32Dtype(),

diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
@@ -678,6 +678,8 @@ def astype(self, dtype, copy=True):
         array : ExtensionArray or ndarray
             ExtensionArray or NumPy ndarray with 'dtype' for its dtype.
         """
+        from pandas.core.arrays.string_ import StringDtype
+
         dtype = pandas_dtype(dtype)
         if is_interval_dtype(dtype):
             if dtype == self.dtype:
@@ -695,6 +697,9 @@ def astype(self, dtype, copy=True):
             return self._shallow_copy(new_left, new_right)
         elif is_categorical_dtype(dtype):
             return Categorical(np.asarray(self))
+        elif isinstance(dtype, StringDtype):
+            return dtype.construct_array_type()._from_sequence(self, copy=False)
+
         # TODO: This try/except will be repeated.
         try:
             return np.asarray(self).astype(dtype, copy=copy)

diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py
@@ -320,7 +320,7 @@ def update_dtype(self, dtype):
         dtype = pandas_dtype(dtype)
 
         if not isinstance(dtype, cls):
-            fill_value = astype_nansafe(np.array(self.fill_value), dtype).item()
+            fill_value = astype_nansafe(np.array([self.fill_value]), dtype)[0]
             dtype = cls(dtype, fill_value=fill_value)
 
         return dtype

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -203,10 +203,14 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
         # TODO: it would be nice to do this in _validate / lib.is_string_array
         # We are already doing a scan over the values there.
         na_values = isna(result)
-        if na_values.any():
-            if result is scalars:
-                # force a copy now, if we haven't already
-                result = result.copy()
+        has_nans = na_values.any()
+        if has_nans and result is scalars:
+            # force a copy now, if we haven't already
+            result = result.copy()
+        # convert to str, then to object to avoid dtype like '<U3', then insert na_value
+        result = np.asarray(result, dtype=str)
+        result = np.asarray(result, dtype="object")
+        if has_nans:
             result[na_values] = StringDtype.na_value
 
         return cls(result)

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -2663,7 +2663,7 @@ def _construct_result(
         out.name = name
         return out
 
-    def combine(self, other, func, fill_value=None) -> "Series":
+    def combine(self, other, func, fill_value=None, dtype=None) -> "Series":
         """
         Combine the Series with a Series or scalar according to `func`.
 
@@ -2682,6 +2682,11 @@ def combine(self, other, func, fill_value=None) -> "Series":
             The value to assume when an index is missing from
             one Series or the other. The default specifies to use the
             appropriate NaN value for the underlying dtype of the Series.
+        dtype : str, numpy.dtype, or ExtensionDtype, optional
+            Data type for the output Series. If not specified, this will be
+            inferred from the combined data.
+
+            .. versionadded:: 1.1.0
 
         Returns
         -------
@@ -2752,6 +2757,10 @@ def combine(self, other, func, fill_value=None) -> "Series":
                 new_values = [func(lv, other) for lv in self._values]
             new_name = self.name
 
+        if dtype is not None:
+            return self._constructor(
+                new_values, index=new_index, name=new_name, dtype=dtype
+            )
         if is_categorical_dtype(self.dtype):
             pass
         elif is_extension_array_dtype(self.dtype):

diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py
@@ -529,7 +529,6 @@ def test_astype_all(self, any_real_dtype):
                     np.array([0, 1], dtype="datetime64[ns]"),
                     dtype=SparseDtype("datetime64[ns]", pd.Timestamp("1970")),
                 ),
-                marks=[pytest.mark.xfail(reason="NumPy-7619")],
             ),
             (
                 SparseArray([0, 1, 10]),

diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py
@@ -36,6 +36,11 @@ def test_astype_str(self, data):
         expected = pd.Series(data[:5].astype(str))
         self.assert_series_equal(result, expected)
 
+    def test_astype_string(self, data):
+        result = pd.Series(data[:5]).astype("string")
+        expected = pd.Series(data[:5].astype("string"))
+        self.assert_series_equal(result, expected)
+
     def test_to_numpy(self, data):
         expected = np.asarray(data)
 

diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
@@ -172,15 +172,16 @@ def test_combine_le(self, data_repeated):
         orig_data1, orig_data2 = data_repeated(2)
         s1 = pd.Series(orig_data1)
         s2 = pd.Series(orig_data2)
-        result = s1.combine(s2, lambda x1, x2: x1 <= x2)
+        result = s1.combine(s2, lambda x1, x2: x1 <= x2, dtype="boolean")
         expected = pd.Series(
-            [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))]
+            [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))],
+            dtype="boolean",
         )
         self.assert_series_equal(result, expected)
 
         val = s1.iloc[0]
-        result = s1.combine(val, lambda x1, x2: x1 <= x2)
-        expected = pd.Series([a <= val for a in list(orig_data1)])
+        result = s1.combine(val, lambda x1, x2: x1 <= x2, dtype="boolean")
+        expected = pd.Series([a <= val for a in list(orig_data1)], dtype="boolean")
         self.assert_series_equal(result, expected)
 
     def test_combine_add(self, data_repeated):

diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py
@@ -7,6 +7,7 @@
 import numpy as np
 
 from pandas.core.dtypes.base import ExtensionDtype
+from pandas.core.dtypes.common import pandas_dtype
 
 import pandas as pd
 from pandas.api.extensions import no_default, register_extension_dtype
@@ -130,8 +131,13 @@ def copy(self):
         return type(self)(self._data.copy())
 
     def astype(self, dtype, copy=True):
+        from pandas.core.arrays.string_ import StringDtype
+
+        dtype = pandas_dtype(dtype)
         if isinstance(dtype, type(self.dtype)):
             return type(self)(self._data, context=dtype.context)
+        elif isinstance(dtype, StringDtype):
+            return dtype.construct_array_type()._from_sequence(self, copy=False)
         return np.asarray(self, dtype=dtype)
 
     def __setitem__(self, key, value):

diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py
@@ -21,6 +21,8 @@
 
 import numpy as np
 
+from pandas.core.dtypes.common import pandas_dtype
+
 import pandas as pd
 from pandas.api.extensions import ExtensionArray, ExtensionDtype
 
@@ -154,12 +156,18 @@ def astype(self, dtype, copy=True):
         # NumPy has issues when all the dicts are the same length.
         # np.array([UserDict(...), UserDict(...)]) fails,
         # but np.array([{...}, {...}]) works, so cast.
+        from pandas.core.arrays.string_ import StringDtype
 
+        dtype = pandas_dtype(dtype)
         # needed to add this check for the Series constructor
         if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
             if copy:
                 return self.copy()
             return self
+        elif isinstance(dtype, StringDtype):
+            value = self.astype(str)  # numpy doesn't like nested dicts
+            return dtype.construct_array_type()._from_sequence(value, copy=False)
+
         return np.array([dict(x) for x in self], dtype=dtype, copy=copy)
 
     def unique(self):

diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py
@@ -139,6 +139,11 @@ def test_astype_str(self, data):
         # ValueError: setting an array element with a sequence
         super().test_astype_str(data)
 
+    @skip_nested
+    def test_astype_string(self, data):
+        # ValueError: setting an array element with a sequence
+        super().test_astype_string(data)
+
 
 class TestConstructors(BaseNumPyTests, base.BaseConstructorsTests):
     @pytest.mark.skip(reason="We don't register our dtype")