Commit
use _from_sequence + add tests
topper-123 committed Apr 12, 2020
1 parent cf501bc commit 59e9c3b
Showing 17 changed files with 123 additions and 64 deletions.
21 changes: 21 additions & 0 deletions doc/source/user_guide/text.rst
@@ -63,6 +63,27 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created
s
s.astype("string")
.. versionchanged:: 1.1.0

You can also use ``string`` dtype on non-string data and it will be converted to
``string`` dtype:

.. ipython:: python

    s = pd.Series(['a', 2, np.nan], dtype="string")
    s
    type(s[1])

Or convert from existing pandas data:

.. ipython:: python

    s1 = pd.Series([1, 2, np.nan], dtype="Int64")
    s1
    s2 = s1.astype("string")
    s2
    type(s2[0])


.. _text.differences:

Behavior differences
27 changes: 27 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
@@ -13,6 +13,32 @@ including other versions of pandas.
Enhancements
~~~~~~~~~~~~

.. _whatsnew_110.astype_string:

All dtypes can now be converted to ``StringDtype``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Previously, declaring or converting to :class:`StringDtype` was in general only possible if the data was already only ``str`` or nan-like.
For example:

.. code-block:: ipython

    In [1]: pd.Series([1, "abc", np.nan], dtype="string")
    Out[1]: ValueError: StringArray requires a sequence of strings or pandas.NA

    In [2]: pd.Series([1, 2, np.nan], dtype="Int64").astype("string")
    Out[2]: ValueError: StringArray requires a sequence of strings or pandas.NA

This meant that in order to convert arbitrary data to :class:`StringDtype`, you would often have to use ``.astype(str).astype('string')``, which was not intuitive.
:class:`StringDtype` now works in all situations where ``astype(str)`` or ``dtype=str`` work:

.. ipython:: python

    ser = pd.Series([1, "abc", np.nan], dtype="string")
    ser
    ser[0]
    pd.Series([1, 2, np.nan], dtype="Int64").astype("string")

.. _whatsnew_110.period_index_partial_string_slicing:

Nonmonotonic PeriodIndex Partial String Slicing
@@ -88,6 +114,7 @@ Other enhancements
- :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`).
- :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`)
- :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the objects inside are unsortable; pass `sort=False` to suppress this warning (:issue:`33015`)
- :meth:`Series.combine` has gained a ``dtype`` argument. If supplied, the combined series will get that dtype (:issue:`33465`)
- The :meth:`DataFrame.to_feather` method now supports additional keyword
arguments (e.g. to set the compression) that are added in pyarrow 0.17
(:issue:`33422`).
32 changes: 7 additions & 25 deletions pandas/core/arrays/base.py
@@ -20,7 +20,7 @@
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.cast import maybe_cast_to_extension_array
from pandas.core.dtypes.common import is_array_like, is_list_like
from pandas.core.dtypes.common import is_array_like, is_list_like, pandas_dtype
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna
@@ -176,7 +176,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
----------
scalars : Sequence
Each element will be an instance of the scalar type for this
array, ``cls.dtype.type``.
array, ``cls.dtype.type``, or be converted into that type by this method.
dtype : dtype, optional
Construct for this particular dtype. This should be a Dtype
compatible with the ExtensionArray.
@@ -213,29 +213,6 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
"""
raise AbstractMethodError(cls)

@classmethod
def _from_sequence_of_any_type(cls, scalars, dtype=None, copy=False):
"""
Construct a new ExtensionArray from a sequence of unknown types of scalars.
.. versionadded:: 1.1.0
Parameters
----------
scalars : Sequence
Each element can be an instance of unknown scalar types.
dtype : dtype, optional
Construct for this particular dtype. This should be a Dtype
compatible with the ExtensionArray.
copy : bool, default False
If True, copy the underlying data.
Returns
-------
ExtensionArray
"""
return cls._from_sequence(scalars, dtype=dtype, copy=copy)

@classmethod
def _from_factorized(cls, values, original):
"""
@@ -454,6 +431,11 @@ def astype(self, dtype, copy=True):
array : ndarray
NumPy ndarray with 'dtype' for its dtype.
"""
from pandas.core.arrays.string_ import StringDtype

dtype = pandas_dtype(dtype)
if isinstance(dtype, StringDtype):
return dtype.construct_array_type()._from_sequence(self, copy=False)
return np.array(self, dtype=dtype, copy=copy)

def isna(self) -> ArrayLike:
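For orientation, here is a minimal runnable sketch (assumptions: pandas >= 1.1 with this commit; the helper name ``default_astype`` is illustrative, not pandas API) of the dispatch the default ``ExtensionArray.astype`` now performs: ``StringDtype`` targets are built via ``StringArray._from_sequence``, everything else still falls back to ``np.array``:

import numpy as np
import pandas as pd
from pandas.api.types import pandas_dtype

def default_astype(ea, dtype, copy=True):
    # Mirror of the new base-class logic: route StringDtype through _from_sequence.
    dtype = pandas_dtype(dtype)
    if isinstance(dtype, pd.StringDtype):
        return dtype.construct_array_type()._from_sequence(ea, copy=False)
    return np.array(ea, dtype=dtype, copy=copy)

default_astype(pd.array(["a", None], dtype="object"), "string")
# -> <StringArray> ['a', <NA>]

Any third-party ExtensionArray that relies on the inherited ``astype`` should pick up ``.astype("string")`` support from this change without further work.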
9 changes: 6 additions & 3 deletions pandas/core/arrays/integer.py
@@ -450,17 +450,20 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike:
if incompatible type with an IntegerDtype, equivalent of same_kind
casting
"""
from pandas.core.arrays.boolean import BooleanArray, BooleanDtype
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.arrays.string_ import StringDtype

dtype = pandas_dtype(dtype)

# if we are astyping to an existing IntegerDtype we can fastpath
if isinstance(dtype, _IntegerDtype):
result = self._data.astype(dtype.numpy_dtype, copy=False)
return type(self)(result, mask=self._mask, copy=False)
return dtype.construct_array_type()(result, mask=self._mask, copy=False)
elif isinstance(dtype, BooleanDtype):
result = self._data.astype("bool", copy=False)
return BooleanArray(result, mask=self._mask, copy=False)
return dtype.construct_array_type()(result, mask=self._mask, copy=False)
elif isinstance(dtype, StringDtype):
return dtype.construct_array_type()._from_sequence(self, copy=False)

# coerce
if is_float_dtype(dtype):
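A quick, hedged illustration of the new ``IntegerArray.astype`` paths (assuming pandas >= 1.1 with this change):

import pandas as pd

arr = pd.array([1, 0, None], dtype="Int64")

arr.astype("boolean")   # masked BooleanArray: [True, False, <NA>]
arr.astype("string")    # StringArray: ['1', '0', <NA>]
arr.astype("Int32")     # fast path: stays a masked integer array, mask preserved

Using ``dtype.construct_array_type()`` instead of hard-coded array classes keeps these fast paths working for any registered integer or boolean dtype.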
5 changes: 5 additions & 0 deletions pandas/core/arrays/interval.py
@@ -678,6 +678,8 @@ def astype(self, dtype, copy=True):
array : ExtensionArray or ndarray
ExtensionArray or NumPy ndarray with 'dtype' for its dtype.
"""
from pandas.core.arrays.string_ import StringDtype

dtype = pandas_dtype(dtype)
if is_interval_dtype(dtype):
if dtype == self.dtype:
@@ -695,6 +697,9 @@ def astype(self, dtype, copy=True):
return self._shallow_copy(new_left, new_right)
elif is_categorical_dtype(dtype):
return Categorical(np.asarray(self))
elif isinstance(dtype, StringDtype):
return dtype.construct_array_type()._from_sequence(self, copy=False)

# TODO: This try/except will be repeated.
try:
return np.asarray(self).astype(dtype, copy=copy)
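A hedged sketch of the user-visible effect (assuming pandas >= 1.1): ``IntervalArray`` now also routes ``astype("string")`` through ``StringArray._from_sequence``:

import pandas as pd

intervals = pd.arrays.IntervalArray.from_breaks([0, 1, 2])
intervals.astype("string")   # StringArray: ['(0, 1]', '(1, 2]']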
2 changes: 1 addition & 1 deletion pandas/core/arrays/sparse/dtype.py
@@ -320,7 +320,7 @@ def update_dtype(self, dtype):
dtype = pandas_dtype(dtype)

if not isinstance(dtype, cls):
fill_value = astype_nansafe(np.array(self.fill_value), dtype).item()
fill_value = astype_nansafe(np.array([self.fill_value]), dtype)[0]
dtype = cls(dtype, fill_value=fill_value)

return dtype
27 changes: 8 additions & 19 deletions pandas/core/arrays/string_.py
@@ -203,10 +203,14 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
# TODO: it would be nice to do this in _validate / lib.is_string_array
# We are already doing a scan over the values there.
na_values = isna(result)
if na_values.any():
if result is scalars:
# force a copy now, if we haven't already
result = result.copy()
has_nans = na_values.any()
if has_nans and result is scalars:
# force a copy now, if we haven't already
result = result.copy()
# convert to str, then to object to avoid dtype like '<U3', then insert na_value
result = np.asarray(result, dtype=str)
result = np.asarray(result, dtype="object")
if has_nans:
result[na_values] = StringDtype.na_value

return cls(result)
@@ -215,21 +219,6 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
return cls._from_sequence(strings, dtype=dtype, copy=copy)

@classmethod
def _from_sequence_of_any_type(cls, scalars, dtype=None, copy=False):
values = np.asarray(scalars, dtype="object")
na_values = isna(values)
has_nans = na_values.any()
if has_nans and values is scalars:
# force a copy now, if we haven't already
values = values.copy()
# convert to str, then to object to avoid dtype like '<U3', then insert na_value
values = np.asarray(values, dtype=str)
values = np.asarray(values, dtype="object")
if has_nans:
values[na_values] = dtype.na_value
return cls._from_sequence(values, dtype=dtype, copy=copy)

def __arrow_array__(self, type=None):
"""
Convert myself into a pyarrow Array.
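The relaxed ``_from_sequence`` now coerces arbitrary scalars via ``str`` instead of rejecting them, while keeping missing values as ``pd.NA``. A hedged example (``_from_sequence`` is private API; assuming pandas >= 1.1):

import numpy as np
import pandas as pd

pd.arrays.StringArray._from_sequence([1, "abc", np.nan])
# -> <StringArray> ['1', 'abc', <NA>]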
4 changes: 2 additions & 2 deletions pandas/core/construction.py
@@ -283,7 +283,7 @@ def array(

if is_extension_array_dtype(dtype):
cls = cast(ExtensionDtype, dtype).construct_array_type()
return cls._from_sequence_of_any_type(data, dtype=dtype, copy=copy)
return cls._from_sequence(data, dtype=dtype, copy=copy)

if dtype is None:
inferred_dtype = lib.infer_dtype(data, skipna=True)
@@ -562,7 +562,7 @@ def _try_cast(
elif is_extension_array_dtype(dtype):
# create an extension array from its dtype
dtype = cast(ExtensionDtype, dtype)
array_type = dtype.construct_array_type()._from_sequence_of_any_type
array_type = dtype.construct_array_type()._from_sequence
subarr = array_type(arr, dtype=dtype, copy=copy)
elif dtype is not None and raise_cast_failure:
raise
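With the ``_from_sequence_of_any_type`` wrapper gone, both construction entry points call ``_from_sequence`` directly. A hedged example of what that enables at the public API level (assuming pandas >= 1.1):

import numpy as np
import pandas as pd

pd.array([1, 2, np.nan], dtype="string")      # StringArray: ['1', '2', <NA>]
pd.Series([1, "abc", None], dtype="string")   # string-dtyped Series via _try_cast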
3 changes: 1 addition & 2 deletions pandas/core/dtypes/cast.py
@@ -924,8 +924,7 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False):
"""
# dispatch on extension dtype if needed
if is_extension_array_dtype(dtype):
arr_type = dtype.construct_array_type()._from_sequence_of_any_type
return arr_type(arr, dtype=dtype, copy=copy)
return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy)

if not isinstance(dtype, np.dtype):
dtype = pandas_dtype(dtype)
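A hedged illustration of the internal dispatch (``astype_nansafe`` is internal API; assuming pandas >= 1.1): for an extension target dtype, the result is now built with ``_from_sequence``:

import numpy as np
import pandas as pd
from pandas.core.dtypes.cast import astype_nansafe

astype_nansafe(np.array([1.5, np.nan]), pd.StringDtype())
# -> <StringArray> ['1.5', <NA>]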
12 changes: 6 additions & 6 deletions pandas/core/generic.py
@@ -10478,7 +10478,7 @@ def _doc_parms(cls):
True
>>> pd.Series([True, False]).all()
False
>>> pd.Series([], dtype=object).all()
>>> pd.Series([]).all()
True
>>> pd.Series([np.nan]).all()
True
@@ -10846,7 +10846,7 @@ def _doc_parms(cls):
False
>>> pd.Series([True, False]).any()
True
>>> pd.Series([], dtype=object).any()
>>> pd.Series([]).any()
False
>>> pd.Series([np.nan]).any()
False
@@ -10948,13 +10948,13 @@ def _doc_parms(cls):
By default, the sum of an empty or all-NA Series is ``0``.
>>> pd.Series([], dtype=float).sum() # min_count=0 is the default
>>> pd.Series([]).sum() # min_count=0 is the default
0.0
This can be controlled with the ``min_count`` parameter. For example, if
you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
>>> pd.Series([], dtype=float).sum(min_count=1)
>>> pd.Series([]).sum(min_count=1)
nan
Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
@@ -10995,12 +10995,12 @@ def _doc_parms(cls):
--------
By default, the product of an empty or all-NA Series is ``1``
>>> pd.Series([], dtype=float).prod()
>>> pd.Series([]).prod()
1.0
This can be controlled with the ``min_count`` parameter
>>> pd.Series([], dtype=float).prod(min_count=1)
>>> pd.Series([]).prod(min_count=1)
nan
Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
11 changes: 10 additions & 1 deletion pandas/core/series.py
@@ -2660,7 +2660,7 @@ def _construct_result(
out.name = name
return out

def combine(self, other, func, fill_value=None) -> "Series":
def combine(self, other, func, fill_value=None, dtype=None) -> "Series":
"""
Combine the Series with a Series or scalar according to `func`.
@@ -2679,6 +2679,11 @@ def combine(self, other, func, fill_value=None) -> "Series":
The value to assume when an index is missing from
one Series or the other. The default specifies to use the
appropriate NaN value for the underlying dtype of the Series.
dtype : str, numpy.dtype, or ExtensionDtype, optional
Data type for the output Series. If not specified, this will be
inferred from the combined data.
.. versionadded:: 1.1.0
Returns
-------
@@ -2749,6 +2754,10 @@ def combine(self, other, func, fill_value=None) -> "Series":
new_values = [func(lv, other) for lv in self._values]
new_name = self.name

if dtype is not None:
return self._constructor(
new_values, index=new_index, name=new_name, dtype=dtype
)
if is_categorical_dtype(self.dtype):
pass
elif is_extension_array_dtype(self.dtype):
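A hedged example of the new ``dtype`` argument (assuming pandas >= 1.1): force the combined result onto a nullable dtype instead of letting object inference decide, mirroring how the extension tests below now call it:

import pandas as pd

s1 = pd.Series([1, 2, None], dtype="Int64")
s2 = pd.Series([3, 1, 4], dtype="Int64")

s1.combine(s2, lambda a, b: a <= b, dtype="boolean")
# 0     True
# 1    False
# 2     <NA>
# dtype: boolean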
1 change: 0 additions & 1 deletion pandas/tests/arrays/sparse/test_array.py
@@ -529,7 +529,6 @@ def test_astype_all(self, any_real_dtype):
np.array([0, 1], dtype="datetime64[ns]"),
dtype=SparseDtype("datetime64[ns]", pd.Timestamp("1970")),
),
marks=[pytest.mark.xfail(reason="NumPy-7619")],
),
(
SparseArray([0, 1, 10]),
5 changes: 5 additions & 0 deletions pandas/tests/extension/base/casting.py
@@ -36,6 +36,11 @@ def test_astype_str(self, data):
expected = pd.Series(data[:5].astype(str))
self.assert_series_equal(result, expected)

def test_astype_string(self, data):
result = pd.Series(data[:5]).astype("string")
expected = pd.Series(data[:5].astype("string"))
self.assert_series_equal(result, expected)

def test_to_numpy(self, data):
expected = np.asarray(data)

9 changes: 5 additions & 4 deletions pandas/tests/extension/base/methods.py
@@ -172,15 +172,16 @@ def test_combine_le(self, data_repeated):
orig_data1, orig_data2 = data_repeated(2)
s1 = pd.Series(orig_data1)
s2 = pd.Series(orig_data2)
result = s1.combine(s2, lambda x1, x2: x1 <= x2)
result = s1.combine(s2, lambda x1, x2: x1 <= x2, dtype="boolean")
expected = pd.Series(
[a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))]
[a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))],
dtype="boolean",
)
self.assert_series_equal(result, expected)

val = s1.iloc[0]
result = s1.combine(val, lambda x1, x2: x1 <= x2)
expected = pd.Series([a <= val for a in list(orig_data1)])
result = s1.combine(val, lambda x1, x2: x1 <= x2, dtype="boolean")
expected = pd.Series([a <= val for a in list(orig_data1)], dtype="boolean")
self.assert_series_equal(result, expected)

def test_combine_add(self, data_repeated):
6 changes: 6 additions & 0 deletions pandas/tests/extension/decimal/array.py
@@ -7,6 +7,7 @@
import numpy as np

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.common import pandas_dtype

import pandas as pd
from pandas.api.extensions import no_default, register_extension_dtype
@@ -130,8 +131,13 @@ def copy(self):
return type(self)(self._data.copy())

def astype(self, dtype, copy=True):
from pandas.core.arrays.string_ import StringDtype

dtype = pandas_dtype(dtype)
if isinstance(dtype, type(self.dtype)):
return type(self)(self._data, context=dtype.context)
elif isinstance(dtype, StringDtype):
return dtype.construct_array_type()._from_sequence(self, copy=False)
return np.asarray(self, dtype=dtype)

def __setitem__(self, key, value):
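A hedged check against the updated test array (the import path is the pandas test suite, so it is only available when the tests ship with the install):

from decimal import Decimal

from pandas.tests.extension.decimal.array import DecimalArray

arr = DecimalArray([Decimal("1.5"), Decimal("2.5")])
arr.astype("string")   # StringArray: ['1.5', '2.5']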
