Commit
use _from_sequence + add tests
topper-123 committed Apr 12, 2020
1 parent cf501bc commit 59e9c3b
Showing 17 changed files with 123 additions and 64 deletions.
21 changes: 21 additions & 0 deletions doc/source/user_guide/text.rst
@@ -63,6 +63,27 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created
s
s.astype("string")
.. versionchanged:: 1.1.0

You can also use ``string`` dtype on non-string data and it will be converted to
``string`` dtype:

.. ipython:: python

    s = pd.Series(['a', 2, np.nan], dtype="string")
    s
    type(s[1])

Or convert from existing pandas data:

.. ipython:: python

    s1 = pd.Series([1, 2, np.nan], dtype="Int64")
    s1
    s2 = s1.astype("string")
    s2
    type(s2[0])


.. _text.differences:

Behavior differences
27 changes: 27 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
@@ -13,6 +13,32 @@ including other versions of pandas.
Enhancements
~~~~~~~~~~~~

.. _whatsnew_110.astype_string:

All dtypes can now be converted to ``StringDtype``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Previously, declaring or converting to :class:`StringDtype` was in general only possible if the data was already only ``str`` or nan-like.
For example:

.. code-block:: ipython

    In [1]: pd.Series([1, "abc", np.nan], dtype="string")
    Out[1]: ValueError: StringArray requires a sequence of strings or pandas.NA

    In [2]: pd.Series([1, 2, np.nan], dtype="Int64").astype("string")
    Out[2]: ValueError: StringArray requires a sequence of strings or pandas.NA

This meant that in order to convert arbitrary data to :class:`StringDtype`, you would often have to use ``.astype(str).astype('string')``, which was not intuitive.
:class:`StringDtype` now works in all situations where ``astype(str)`` or ``dtype=str`` work:

.. ipython:: python

    ser = pd.Series([1, "abc", np.nan], dtype="string")
    ser
    ser[0]
    pd.Series([1, 2, np.nan], dtype="Int64").astype("string")

.. _whatsnew_110.period_index_partial_string_slicing:

Nonmonotonic PeriodIndex Partial String Slicing
@@ -88,6 +114,7 @@ Other enhancements
- :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`).
- :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`)
- :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the objects inside are unsortable; pass `sort=False` to suppress this warning (:issue:`33015`)
- :meth:`Series.combine` has gained a ``dtype`` argument. If supplied, the combined series will get that dtype (:issue:`33465`)
- The :meth:`DataFrame.to_feather` method now supports additional keyword
arguments (e.g. to set the compression) that are added in pyarrow 0.17
(:issue:`33422`).
32 changes: 7 additions & 25 deletions pandas/core/arrays/base.py
@@ -20,7 +20,7 @@
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.cast import maybe_cast_to_extension_array
from pandas.core.dtypes.common import is_array_like, is_list_like
from pandas.core.dtypes.common import is_array_like, is_list_like, pandas_dtype
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna
@@ -176,7 +176,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
----------
scalars : Sequence
Each element will be an instance of the scalar type for this
array, ``cls.dtype.type``.
array, ``cls.dtype.type``, or be converted into that type by this method.
dtype : dtype, optional
Construct for this particular dtype. This should be a Dtype
compatible with the ExtensionArray.
@@ -213,29 +213,6 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
"""
raise AbstractMethodError(cls)

@classmethod
def _from_sequence_of_any_type(cls, scalars, dtype=None, copy=False):
"""
Construct a new ExtensionArray from a sequence of unknown types of scalars.
.. versionadded:: 1.1.0
Parameters
----------
scalars : Sequence
Each element can be an instance of unknown scalar types.
dtype : dtype, optional
Construct for this particular dtype. This should be a Dtype
compatible with the ExtensionArray.
copy : bool, default False
If True, copy the underlying data.
Returns
-------
ExtensionArray
"""
return cls._from_sequence(scalars, dtype=dtype, copy=copy)

@classmethod
def _from_factorized(cls, values, original):
"""
@@ -454,6 +431,11 @@ def astype(self, dtype, copy=True):
array : ndarray
NumPy ndarray with 'dtype' for its dtype.
"""
from pandas.core.arrays.string_ import StringDtype

dtype = pandas_dtype(dtype)
if isinstance(dtype, StringDtype):
return dtype.construct_array_type()._from_sequence(self, copy=False)
return np.array(self, dtype=dtype, copy=copy)

def isna(self) -> ArrayLike:
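For orientation, here is a minimal runnable sketch (assumptions: pandas >= 1.1 with this commit; the helper name ``default_astype`` is illustrative, not pandas API) of the dispatch the default ``ExtensionArray.astype`` now performs: ``StringDtype`` targets are built via ``StringArray._from_sequence``, everything else still falls back to ``np.array``:

import numpy as np
import pandas as pd
from pandas.api.types import pandas_dtype

def default_astype(ea, dtype, copy=True):
    # Mirror of the new base-class logic: route StringDtype through _from_sequence.
    dtype = pandas_dtype(dtype)
    if isinstance(dtype, pd.StringDtype):
        return dtype.construct_array_type()._from_sequence(ea, copy=False)
    return np.array(ea, dtype=dtype, copy=copy)

default_astype(pd.array(["a", None], dtype="object"), "string")
# -> <StringArray> ['a', <NA>]

Any third-party ExtensionArray that relies on the inherited ``astype`` should pick up ``.astype("string")`` support from this change without further work.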
9 changes: 6 additions & 3 deletions pandas/core/arrays/integer.py
@@ -450,17 +450,20 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike:
if incompatible type with an IntegerDtype, equivalent of same_kind
casting
"""
from pandas.core.arrays.boolean import BooleanArray, BooleanDtype
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.arrays.string_ import StringDtype

dtype = pandas_dtype(dtype)

# if we are astyping to an existing IntegerDtype we can fastpath
if isinstance(dtype, _IntegerDtype):
result = self._data.astype(dtype.numpy_dtype, copy=False)
return type(self)(result, mask=self._mask, copy=False)
return dtype.construct_array_type()(result, mask=self._mask, copy=False)
elif isinstance(dtype, BooleanDtype):
result = self._data.astype("bool", copy=False)
return BooleanArray(result, mask=self._mask, copy=False)
return dtype.construct_array_type()(result, mask=self._mask, copy=False)
elif isinstance(dtype, StringDtype):
return dtype.construct_array_type()._from_sequence(self, copy=False)

# coerce
if is_float_dtype(dtype):
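A quick, hedged illustration of the new ``IntegerArray.astype`` paths (assuming pandas >= 1.1 with this change):

import pandas as pd

arr = pd.array([1, 0, None], dtype="Int64")

arr.astype("boolean")   # masked BooleanArray: [True, False, <NA>]
arr.astype("string")    # StringArray: ['1', '0', <NA>]
arr.astype("Int32")     # fast path: stays a masked integer array, mask preserved

Using ``dtype.construct_array_type()`` instead of hard-coded array classes keeps these fast paths working for any registered integer or boolean dtype.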
5 changes: 5 additions & 0 deletions pandas/core/arrays/interval.py
@@ -678,6 +678,8 @@ def astype(self, dtype, copy=True):
array : ExtensionArray or ndarray
ExtensionArray or NumPy ndarray with 'dtype' for its dtype.
"""
from pandas.core.arrays.string_ import StringDtype

dtype = pandas_dtype(dtype)
if is_interval_dtype(dtype):
if dtype == self.dtype:
@@ -695,6 +697,9 @@ def astype(self, dtype, copy=True):
return self._shallow_copy(new_left, new_right)
elif is_categorical_dtype(dtype):
return Categorical(np.asarray(self))
elif isinstance(dtype, StringDtype):
return dtype.construct_array_type()._from_sequence(self, copy=False)

# TODO: This try/except will be repeated.
try:
return np.asarray(self).astype(dtype, copy=copy)
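A hedged sketch of the user-visible effect (assuming pandas >= 1.1): ``IntervalArray`` now also routes ``astype("string")`` through ``StringArray._from_sequence``:

import pandas as pd

intervals = pd.arrays.IntervalArray.from_breaks([0, 1, 2])
intervals.astype("string")   # StringArray: ['(0, 1]', '(1, 2]']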
2 changes: 1 addition & 1 deletion pandas/core/arrays/sparse/dtype.py
@@ -320,7 +320,7 @@ def update_dtype(self, dtype):
dtype = pandas_dtype(dtype)

if not isinstance(dtype, cls):
fill_value = astype_nansafe(np.array(self.fill_value), dtype).item()
fill_value = astype_nansafe(np.array([self.fill_value]), dtype)[0]
dtype = cls(dtype, fill_value=fill_value)

return dtype
27 changes: 8 additions & 19 deletions pandas/core/arrays/string_.py
@@ -203,10 +203,14 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
# TODO: it would be nice to do this in _validate / lib.is_string_array
# We are already doing a scan over the values there.
na_values = isna(result)
if na_values.any():
if result is scalars:
# force a copy now, if we haven't already
result = result.copy()
has_nans = na_values.any()
if has_nans and result is scalars:
# force a copy now, if we haven't already
result = result.copy()
# convert to str, then to object to avoid dtype like '<U3', then insert na_value
result = np.asarray(result, dtype=str)
result = np.asarray(result, dtype="object")
if has_nans:
result[na_values] = StringDtype.na_value

return cls(result)
@@ -215,21 +219,6 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
return cls._from_sequence(strings, dtype=dtype, copy=copy)

@classmethod
def _from_sequence_of_any_type(cls, scalars, dtype=None, copy=False):
values = np.asarray(scalars, dtype="object")
na_values = isna(values)
has_nans = na_values.any()
if has_nans and values is scalars:
# force a copy now, if we haven't already
values = values.copy()
# convert to str, then to object to avoid dtype like '<U3', then insert na_value
values = np.asarray(values, dtype=str)
values = np.asarray(values, dtype="object")
if has_nans:
values[na_values] = dtype.na_value
return cls._from_sequence(values, dtype=dtype, copy=copy)

def __arrow_array__(self, type=None):
"""
Convert myself into a pyarrow Array.
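The relaxed ``_from_sequence`` now coerces arbitrary scalars via ``str`` instead of rejecting them, while keeping missing values as ``pd.NA``. A hedged example (``_from_sequence`` is private API; assuming pandas >= 1.1):

import numpy as np
import pandas as pd

pd.arrays.StringArray._from_sequence([1, "abc", np.nan])
# -> <StringArray> ['1', 'abc', <NA>]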
4 changes: 2 additions & 2 deletions pandas/core/construction.py
@@ -283,7 +283,7 @@ def array(

if is_extension_array_dtype(dtype):
cls = cast(ExtensionDtype, dtype).construct_array_type()
return cls._from_sequence_of_any_type(data, dtype=dtype, copy=copy)
return cls._from_sequence(data, dtype=dtype, copy=copy)

if dtype is None:
inferred_dtype = lib.infer_dtype(data, skipna=True)
@@ -562,7 +562,7 @@ def _try_cast(
elif is_extension_array_dtype(dtype):
# create an extension array from its dtype
dtype = cast(ExtensionDtype, dtype)
array_type = dtype.construct_array_type()._from_sequence_of_any_type
array_type = dtype.construct_array_type()._from_sequence
subarr = array_type(arr, dtype=dtype, copy=copy)
elif dtype is not None and raise_cast_failure:
raise
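With the ``_from_sequence_of_any_type`` wrapper gone, both construction entry points call ``_from_sequence`` directly. A hedged example of what that enables at the public API level (assuming pandas >= 1.1):

import numpy as np
import pandas as pd

pd.array([1, 2, np.nan], dtype="string")      # StringArray: ['1', '2', <NA>]
pd.Series([1, "abc", None], dtype="string")   # string-dtyped Series via _try_cast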
3 changes: 1 addition & 2 deletions pandas/core/dtypes/cast.py
@@ -924,8 +924,7 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False):
"""
# dispatch on extension dtype if needed
if is_extension_array_dtype(dtype):
arr_type = dtype.construct_array_type()._from_sequence_of_any_type
return arr_type(arr, dtype=dtype, copy=copy)
return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy)

if not isinstance(dtype, np.dtype):
dtype = pandas_dtype(dtype)
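A hedged illustration of the internal dispatch (``astype_nansafe`` is internal API; assuming pandas >= 1.1): for an extension target dtype, the result is now built with ``_from_sequence``:

import numpy as np
import pandas as pd
from pandas.core.dtypes.cast import astype_nansafe

astype_nansafe(np.array([1.5, np.nan]), pd.StringDtype())
# -> <StringArray> ['1.5', <NA>]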
12 changes: 6 additions & 6 deletions pandas/core/generic.py
@@ -10478,7 +10478,7 @@ def _doc_parms(cls):
True
>>> pd.Series([True, False]).all()
False
>>> pd.Series([], dtype=object).all()
>>> pd.Series([]).all()
True
>>> pd.Series([np.nan]).all()
True
@@ -10846,7 +10846,7 @@ def _doc_parms(cls):
False
>>> pd.Series([True, False]).any()
True
>>> pd.Series([], dtype=object).any()
>>> pd.Series([]).any()
False
>>> pd.Series([np.nan]).any()
False
@@ -10948,13 +10948,13 @@ def _doc_parms(cls):
By default, the sum of an empty or all-NA Series is ``0``.
>>> pd.Series([], dtype=float).sum() # min_count=0 is the default
>>> pd.Series([]).sum() # min_count=0 is the default
0.0
This can be controlled with the ``min_count`` parameter. For example, if
you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
>>> pd.Series([], dtype=float).sum(min_count=1)
>>> pd.Series([]).sum(min_count=1)
nan
Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
@@ -10995,12 +10995,12 @@ def _doc_parms(cls):
--------
By default, the product of an empty or all-NA Series is ``1``
>>> pd.Series([], dtype=float).prod()
>>> pd.Series([]).prod()
1.0
This can be controlled with the ``min_count`` parameter
>>> pd.Series([], dtype=float).prod(min_count=1)
>>> pd.Series([]).prod(min_count=1)
nan
Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
11 changes: 10 additions & 1 deletion pandas/core/series.py
@@ -2660,7 +2660,7 @@ def _construct_result(
out.name = name
return out

def combine(self, other, func, fill_value=None) -> "Series":
def combine(self, other, func, fill_value=None, dtype=None) -> "Series":
"""
Combine the Series with a Series or scalar according to `func`.
@@ -2679,6 +2679,11 @@ def combine(self, other, func, fill_value=None) -> "Series":
The value to assume when an index is missing from
one Series or the other. The default specifies to use the
appropriate NaN value for the underlying dtype of the Series.
dtype : str, numpy.dtype, or ExtensionDtype, optional
Data type for the output Series. If not specified, this will be
inferred from the combined data.
.. versionadded:: 1.1.0
Returns
-------
@@ -2749,6 +2754,10 @@ def combine(self, other, func, fill_value=None) -> "Series":
new_values = [func(lv, other) for lv in self._values]
new_name = self.name

if dtype is not None:
return self._constructor(
new_values, index=new_index, name=new_name, dtype=dtype
)
if is_categorical_dtype(self.dtype):
pass
elif is_extension_array_dtype(self.dtype):
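A hedged example of the new ``dtype`` argument (assuming pandas >= 1.1): force the combined result onto a nullable dtype instead of letting object inference decide, mirroring how the extension tests below now call it:

import pandas as pd

s1 = pd.Series([1, 2, None], dtype="Int64")
s2 = pd.Series([3, 1, 4], dtype="Int64")

s1.combine(s2, lambda a, b: a <= b, dtype="boolean")
# 0     True
# 1    False
# 2     <NA>
# dtype: boolean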
1 change: 0 additions & 1 deletion pandas/tests/arrays/sparse/test_array.py
@@ -529,7 +529,6 @@ def test_astype_all(self, any_real_dtype):
np.array([0, 1], dtype="datetime64[ns]"),
dtype=SparseDtype("datetime64[ns]", pd.Timestamp("1970")),
),
marks=[pytest.mark.xfail(reason="NumPy-7619")],
),
(
SparseArray([0, 1, 10]),
5 changes: 5 additions & 0 deletions pandas/tests/extension/base/casting.py
@@ -36,6 +36,11 @@ def test_astype_str(self, data):
expected = pd.Series(data[:5].astype(str))
self.assert_series_equal(result, expected)

def test_astype_string(self, data):
result = pd.Series(data[:5]).astype("string")
expected = pd.Series(data[:5].astype("string"))
self.assert_series_equal(result, expected)

def test_to_numpy(self, data):
expected = np.asarray(data)

9 changes: 5 additions & 4 deletions pandas/tests/extension/base/methods.py
@@ -172,15 +172,16 @@ def test_combine_le(self, data_repeated):
orig_data1, orig_data2 = data_repeated(2)
s1 = pd.Series(orig_data1)
s2 = pd.Series(orig_data2)
result = s1.combine(s2, lambda x1, x2: x1 <= x2)
result = s1.combine(s2, lambda x1, x2: x1 <= x2, dtype="boolean")
expected = pd.Series(
[a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))]
[a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))],
dtype="boolean",
)
self.assert_series_equal(result, expected)

val = s1.iloc[0]
result = s1.combine(val, lambda x1, x2: x1 <= x2)
expected = pd.Series([a <= val for a in list(orig_data1)])
result = s1.combine(val, lambda x1, x2: x1 <= x2, dtype="boolean")
expected = pd.Series([a <= val for a in list(orig_data1)], dtype="boolean")
self.assert_series_equal(result, expected)

def test_combine_add(self, data_repeated):
6 changes: 6 additions & 0 deletions pandas/tests/extension/decimal/array.py
@@ -7,6 +7,7 @@
import numpy as np

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.common import pandas_dtype

import pandas as pd
from pandas.api.extensions import no_default, register_extension_dtype
@@ -130,8 +131,13 @@ def copy(self):
return type(self)(self._data.copy())

def astype(self, dtype, copy=True):
from pandas.core.arrays.string_ import StringDtype

dtype = pandas_dtype(dtype)
if isinstance(dtype, type(self.dtype)):
return type(self)(self._data, context=dtype.context)
elif isinstance(dtype, StringDtype):
return dtype.construct_array_type()._from_sequence(self, copy=False)
return np.asarray(self, dtype=dtype)

def __setitem__(self, key, value):
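A hedged check against the updated test array (the import path is the pandas test suite, so it is only available when the tests ship with the install):

from decimal import Decimal

from pandas.tests.extension.decimal.array import DecimalArray

arr = DecimalArray([Decimal("1.5"), Decimal("2.5")])
arr.astype("string")   # StringArray: ['1.5', '2.5']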
