Skip to content

Commit

Permalink
Merge branch 'astype_string' of https://github.com/topper-123/pandas
Browse files Browse the repository at this point in the history
…into astype_string
  • Loading branch information
topper-123 committed Apr 14, 2020
2 parents 556e1c2 + 59e9c3b commit 2a835e4
Show file tree
Hide file tree
Showing 14 changed files with 116 additions and 18 deletions.
21 changes: 21 additions & 0 deletions doc/source/user_guide/text.rst
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,27 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created
s
s.astype("string")
.. versionchanged:: 1.1.0

You can also use ``string`` dtype on non-string data and it will be converted to
``string`` dtype:

.. ipython:: python
s = pd.Series(['a', 2, np.nan], dtype="string")
s
type(s[1])
or convert from existing pandas data:

.. ipython:: python

s1 = pd.Series([1,2, np.nan], dtype="Int64")
s1
s2 = s1.astype("string")
s2
type(s2[0])


.. _text.differences:

Behavior differences
Expand Down
27 changes: 27 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,32 @@ including other versions of pandas.
Enhancements
~~~~~~~~~~~~

.. _whatsnew_110.astype_string:

All dtypes can now be converted to ``StringDtype``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Previously, declaring or converting to :class:`StringDtype` was generally only possible if the data already consisted solely of ``str`` or nan-like values.
For example:

.. code-block:: ipython
In [1]: pd.Series([1, "abc", np.nan], dtype="string")
Out[1]: ValueError: StringArray requires a sequence of strings or pandas.NA
In [2]: pd.Series([1,2, np.nan], dtype="Int64").astype("string")
Out[2]: ValueError: StringArray requires a sequence of strings or pandas.NA
This meant that in order to convert arbitrary data to :class:`StringDtype`, you would often have to use ``.astype(str).astype('string')``, which was not intuitive.
:class:`StringDtype` now works in all situations where ``astype(str)`` or ``dtype=str`` work:

.. ipython:: python
ser = pd.Series([1, "abc", np.nan], dtype="string")
ser
ser[0]
pd.Series([1,2, np.nan], dtype="Int64").astype("string")
.. _whatsnew_110.period_index_partial_string_slicing:

Nonmonotonic PeriodIndex Partial String Slicing
Expand Down Expand Up @@ -89,6 +115,7 @@ Other enhancements
- :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`)
- :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the objects inside are unsortable. Pass `sort=False` to suppress this warning (:issue:`33015`)
- :class:`Series.dt` and :class:`DatetimeIndex` now have an `isocalendar` accessor that returns a :class:`DataFrame` with year, week, and day calculated according to the ISO 8601 calendar (:issue:`33206`).
- :meth:`Series.combine` has gained a ``dtype`` argument. If supplied, the combined series will get that dtype (:issue:`33465`)
- The :meth:`DataFrame.to_feather` method now supports additional keyword
arguments (e.g. to set the compression) that are added in pyarrow 0.17
(:issue:`33422`).
Expand Down
9 changes: 7 additions & 2 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.cast import maybe_cast_to_extension_array
from pandas.core.dtypes.common import is_array_like, is_list_like
from pandas.core.dtypes.common import is_array_like, is_list_like, pandas_dtype
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna
Expand Down Expand Up @@ -176,7 +176,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
----------
scalars : Sequence
Each element will be an instance of the scalar type for this
array, ``cls.dtype.type``.
array, ``cls.dtype.type`` or be converted into this type in this method.
dtype : dtype, optional
Construct for this particular dtype. This should be a Dtype
compatible with the ExtensionArray.
Expand Down Expand Up @@ -431,6 +431,11 @@ def astype(self, dtype, copy=True):
array : ndarray
NumPy ndarray with 'dtype' for its dtype.
"""
from pandas.core.arrays.string_ import StringDtype

dtype = pandas_dtype(dtype)
if isinstance(dtype, StringDtype):
return dtype.construct_array_type()._from_sequence(self, copy=False)
return np.array(self, dtype=dtype, copy=copy)

def isna(self) -> ArrayLike:
Expand Down
13 changes: 8 additions & 5 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import numbers
from typing import TYPE_CHECKING, Tuple, Type, Union
from typing import TYPE_CHECKING, Dict, Tuple, Type, Union
import warnings

import numpy as np
Expand Down Expand Up @@ -449,17 +449,20 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike:
if incompatible type with an IntegerDtype, equivalent of same_kind
casting
"""
from pandas.core.arrays.boolean import BooleanArray, BooleanDtype
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.arrays.string_ import StringDtype

dtype = pandas_dtype(dtype)

# if we are astyping to an existing IntegerDtype we can fastpath
if isinstance(dtype, _IntegerDtype):
result = self._data.astype(dtype.numpy_dtype, copy=False)
return type(self)(result, mask=self._mask, copy=False)
return dtype.construct_array_type()(result, mask=self._mask, copy=False)
elif isinstance(dtype, BooleanDtype):
result = self._data.astype("bool", copy=False)
return BooleanArray(result, mask=self._mask, copy=False)
return dtype.construct_array_type()(result, mask=self._mask, copy=False)
elif isinstance(dtype, StringDtype):
return dtype.construct_array_type()._from_sequence(self, copy=False)

# coerce
if is_float_dtype(dtype):
Expand Down Expand Up @@ -748,7 +751,7 @@ class UInt64Dtype(_IntegerDtype):
__doc__ = _dtype_docstring.format(dtype="uint64")


_dtypes = {
_dtypes: Dict[str, _IntegerDtype] = {
"int8": Int8Dtype(),
"int16": Int16Dtype(),
"int32": Int32Dtype(),
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/arrays/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,8 @@ def astype(self, dtype, copy=True):
array : ExtensionArray or ndarray
ExtensionArray or NumPy ndarray with 'dtype' for its dtype.
"""
from pandas.core.arrays.string_ import StringDtype

dtype = pandas_dtype(dtype)
if is_interval_dtype(dtype):
if dtype == self.dtype:
Expand All @@ -695,6 +697,9 @@ def astype(self, dtype, copy=True):
return self._shallow_copy(new_left, new_right)
elif is_categorical_dtype(dtype):
return Categorical(np.asarray(self))
elif isinstance(dtype, StringDtype):
return dtype.construct_array_type()._from_sequence(self, copy=False)

# TODO: This try/except will be repeated.
try:
return np.asarray(self).astype(dtype, copy=copy)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/sparse/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,7 @@ def update_dtype(self, dtype):
dtype = pandas_dtype(dtype)

if not isinstance(dtype, cls):
fill_value = astype_nansafe(np.array(self.fill_value), dtype).item()
fill_value = astype_nansafe(np.array([self.fill_value]), dtype)[0]
dtype = cls(dtype, fill_value=fill_value)

return dtype
Expand Down
12 changes: 8 additions & 4 deletions pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,10 +203,14 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
# TODO: it would be nice to do this in _validate / lib.is_string_array
# We are already doing a scan over the values there.
na_values = isna(result)
if na_values.any():
if result is scalars:
# force a copy now, if we haven't already
result = result.copy()
has_nans = na_values.any()
if has_nans and result is scalars:
# force a copy now, if we haven't already
result = result.copy()
# convert to str, then to object to avoid dtype like '<U3', then insert na_value
result = np.asarray(result, dtype=str)
result = np.asarray(result, dtype="object")
if has_nans:
result[na_values] = StringDtype.na_value

return cls(result)
Expand Down
11 changes: 10 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2663,7 +2663,7 @@ def _construct_result(
out.name = name
return out

def combine(self, other, func, fill_value=None) -> "Series":
def combine(self, other, func, fill_value=None, dtype=None) -> "Series":
"""
Combine the Series with a Series or scalar according to `func`.
Expand All @@ -2682,6 +2682,11 @@ def combine(self, other, func, fill_value=None) -> "Series":
The value to assume when an index is missing from
one Series or the other. The default specifies to use the
appropriate NaN value for the underlying dtype of the Series.
dtype : str, numpy.dtype, or ExtensionDtype, optional
Data type for the output Series. If not specified, this will be
inferred from the combined data.
.. versionadded:: 1.1.0
Returns
-------
Expand Down Expand Up @@ -2752,6 +2757,10 @@ def combine(self, other, func, fill_value=None) -> "Series":
new_values = [func(lv, other) for lv in self._values]
new_name = self.name

if dtype is not None:
return self._constructor(
new_values, index=new_index, name=new_name, dtype=dtype
)
if is_categorical_dtype(self.dtype):
pass
elif is_extension_array_dtype(self.dtype):
Expand Down
1 change: 0 additions & 1 deletion pandas/tests/arrays/sparse/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,7 +529,6 @@ def test_astype_all(self, any_real_dtype):
np.array([0, 1], dtype="datetime64[ns]"),
dtype=SparseDtype("datetime64[ns]", pd.Timestamp("1970")),
),
marks=[pytest.mark.xfail(reason="NumPy-7619")],
),
(
SparseArray([0, 1, 10]),
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/extension/base/casting.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ def test_astype_str(self, data):
expected = pd.Series(data[:5].astype(str))
self.assert_series_equal(result, expected)

def test_astype_string(self, data):
result = pd.Series(data[:5]).astype("string")
expected = pd.Series(data[:5].astype("string"))
self.assert_series_equal(result, expected)

def test_to_numpy(self, data):
expected = np.asarray(data)

Expand Down
9 changes: 5 additions & 4 deletions pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,15 +172,16 @@ def test_combine_le(self, data_repeated):
orig_data1, orig_data2 = data_repeated(2)
s1 = pd.Series(orig_data1)
s2 = pd.Series(orig_data2)
result = s1.combine(s2, lambda x1, x2: x1 <= x2)
result = s1.combine(s2, lambda x1, x2: x1 <= x2, dtype="boolean")
expected = pd.Series(
[a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))]
[a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))],
dtype="boolean",
)
self.assert_series_equal(result, expected)

val = s1.iloc[0]
result = s1.combine(val, lambda x1, x2: x1 <= x2)
expected = pd.Series([a <= val for a in list(orig_data1)])
result = s1.combine(val, lambda x1, x2: x1 <= x2, dtype="boolean")
expected = pd.Series([a <= val for a in list(orig_data1)], dtype="boolean")
self.assert_series_equal(result, expected)

def test_combine_add(self, data_repeated):
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/extension/decimal/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import numpy as np

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.common import pandas_dtype

import pandas as pd
from pandas.api.extensions import no_default, register_extension_dtype
Expand Down Expand Up @@ -130,8 +131,13 @@ def copy(self):
return type(self)(self._data.copy())

def astype(self, dtype, copy=True):
from pandas.core.arrays.string_ import StringDtype

dtype = pandas_dtype(dtype)
if isinstance(dtype, type(self.dtype)):
return type(self)(self._data, context=dtype.context)
elif isinstance(dtype, StringDtype):
return dtype.construct_array_type()._from_sequence(self, copy=False)
return np.asarray(self, dtype=dtype)

def __setitem__(self, key, value):
Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/extension/json/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@

import numpy as np

from pandas.core.dtypes.common import pandas_dtype

import pandas as pd
from pandas.api.extensions import ExtensionArray, ExtensionDtype

Expand Down Expand Up @@ -154,12 +156,18 @@ def astype(self, dtype, copy=True):
# NumPy has issues when all the dicts are the same length.
# np.array([UserDict(...), UserDict(...)]) fails,
# but np.array([{...}, {...}]) works, so cast.
from pandas.core.arrays.string_ import StringDtype

dtype = pandas_dtype(dtype)
# needed to add this check for the Series constructor
if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
if copy:
return self.copy()
return self
elif isinstance(dtype, StringDtype):
value = self.astype(str) # numpy doesn't like nested dicts
return dtype.construct_array_type()._from_sequence(value, copy=False)

return np.array([dict(x) for x in self], dtype=dtype, copy=copy)

def unique(self):
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/extension/test_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,11 @@ def test_astype_str(self, data):
# ValueError: setting an array element with a sequence
super().test_astype_str(data)

@skip_nested
def test_astype_string(self, data):
# ValueError: setting an array element with a sequence
super().test_astype_string(data)


class TestConstructors(BaseNumPyTests, base.BaseConstructorsTests):
@pytest.mark.skip(reason="We don't register our dtype")
Expand Down

0 comments on commit 2a835e4

Please sign in to comment.