Skip to content

Commit

Permalink
API: add infer_objects for soft conversions (pandas-dev#16915)
Browse files Browse the repository at this point in the history
* API: add infer_objects for soft conversions

* doc fixups

* fixups

* doc
  • Loading branch information
chris-b1 authored Jul 18, 2017
1 parent fcb0263 commit 9e7666d
Show file tree
Hide file tree
Showing 6 changed files with 153 additions and 4 deletions.
2 changes: 2 additions & 0 deletions doc/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ Conversion
:toctree: generated/

Series.astype
Series.infer_objects
Series.copy
Series.isnull
Series.notnull
Expand Down Expand Up @@ -777,6 +778,7 @@ Conversion

DataFrame.astype
DataFrame.convert_objects
DataFrame.infer_objects
DataFrame.copy
DataFrame.isnull
DataFrame.notnull
Expand Down
23 changes: 22 additions & 1 deletion doc/source/basics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2024,7 +2024,28 @@ object conversion
~~~~~~~~~~~~~~~~~

pandas offers various functions to try to force conversion of types from the ``object`` dtype to other types.
The following functions are available for one dimensional object arrays or scalars:
In cases where the data is already of the correct type, but stored in an ``object`` array, the
:meth:`~DataFrame.infer_objects` and :meth:`~Series.infer_objects` methods can be used to soft convert
to the correct type.

.. ipython:: python
df = pd.DataFrame([[1, 2],
['a', 'b'],
[datetime.datetime(2016, 3, 2), datetime.datetime(2016, 3, 2)]])
df = df.T
df
df.dtypes
Because the data was transposed, the original inference stored all columns as object, which
``infer_objects`` will correct.

.. ipython:: python
df.infer_objects().dtypes
The following functions are available for one dimensional object arrays or scalars to perform
hard conversion of objects to a specified type:

- :meth:`~pandas.to_numeric` (conversion to numeric dtypes)

Expand Down
32 changes: 32 additions & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,38 @@ New features
- Added ``__fspath__`` method to :class:`~pandas.HDFStore`, :class:`~pandas.ExcelFile`,
and :class:`~pandas.ExcelWriter` to work properly with the file system path protocol (:issue:`13823`)


.. _whatsnew_0210.enhancements.infer_objects:

``infer_objects`` type conversion
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The :meth:`~DataFrame.infer_objects` and :meth:`~Series.infer_objects`
methods have been added to perform dtype inference on object columns, replacing
some of the functionality of the deprecated ``convert_objects``
method. See the documentation :ref:`here <basics.object_conversion>`
for more details. (:issue:`11221`)

This function only performs soft conversions on object columns, converting Python objects
to native types, but not any coercive conversions. For example:

.. ipython:: python

df = pd.DataFrame({'A': [1, 2, 3],
'B': np.array([1, 2, 3], dtype='object'),
'C': ['1', '2', '3']})
df.dtypes
df.infer_objects().dtypes

Note that column ``'C'`` was not converted - only scalar numeric types
will be inferred to a new type. Other types of conversion should be accomplished
using the :func:`to_numeric` function (or :func:`to_datetime`, :func:`to_timedelta`).

.. ipython:: python

df = df.infer_objects()
df['C'] = pd.to_numeric(df['C'], errors='coerce')
df.dtypes

.. _whatsnew_0210.enhancements.other:

Other Enhancements
Expand Down
56 changes: 53 additions & 3 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3671,16 +3671,66 @@ def convert_objects(self, convert_dates=True, convert_numeric=False,
converted : same as input object
"""
from warnings import warn
warn("convert_objects is deprecated. Use the data-type specific "
"converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.",
FutureWarning, stacklevel=2)
msg = ("convert_objects is deprecated. To re-infer data dtypes for "
"object columns, use {klass}.infer_objects()\nFor all "
"other conversions use the data-type specific converters "
"pd.to_datetime, pd.to_timedelta and pd.to_numeric."
).format(klass=self.__class__.__name__)
warn(msg, FutureWarning, stacklevel=2)

return self._constructor(
self._data.convert(convert_dates=convert_dates,
convert_numeric=convert_numeric,
convert_timedeltas=convert_timedeltas,
copy=copy)).__finalize__(self)

def infer_objects(self):
    """
    Attempt to infer better dtypes for object columns.

    Attempts soft conversion of object-dtyped
    columns, leaving non-object and unconvertible
    columns unchanged. The inference rules are the
    same as during normal Series/DataFrame construction.

    .. versionadded:: 0.21.0

    Returns
    -------
    converted : same type as input object

    See Also
    --------
    pandas.to_datetime : Convert argument to datetime.
    pandas.to_timedelta : Convert argument to timedelta.
    pandas.to_numeric : Convert argument to numeric type.

    Examples
    --------
    >>> df = pd.DataFrame({"A": ["a", 1, 2, 3]})
    >>> df = df.iloc[1:]
    >>> df
       A
    1  1
    2  2
    3  3

    >>> df.dtypes
    A    object
    dtype: object

    >>> df.infer_objects().dtypes
    A    int64
    dtype: object
    """
    # numeric=False necessary to only soft convert;
    # python objects will still be converted to
    # native numpy numeric types
    converted = self._data.convert(datetime=True, numeric=False,
                                   timedelta=True, coerce=False,
                                   copy=True)
    # rebuild an object of the caller's own type and let __finalize__
    # process the result against the source object
    return self._constructor(converted).__finalize__(self)

# ----------------------------------------------------------------------
# Filling NA's

Expand Down
26 changes: 26 additions & 0 deletions pandas/tests/frame/test_block_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,32 @@ def test_convert_objects_no_conversion(self):
mixed2 = mixed1._convert(datetime=True)
assert_frame_equal(mixed1, mixed2)

def test_infer_objects(self):
# GH 11221
df = DataFrame({'a': ['a', 1, 2, 3],
'b': ['b', 2.0, 3.0, 4.1],
'c': ['c', datetime(2016, 1, 1),
datetime(2016, 1, 2),
datetime(2016, 1, 3)],
'd': [1, 2, 3, 'd']},
columns=['a', 'b', 'c', 'd'])
df = df.iloc[1:].infer_objects()

assert df['a'].dtype == 'int64'
assert df['b'].dtype == 'float64'
assert df['c'].dtype == 'M8[ns]'
assert df['d'].dtype == 'object'

expected = DataFrame({'a': [1, 2, 3],
'b': [2.0, 3.0, 4.1],
'c': [datetime(2016, 1, 1),
datetime(2016, 1, 2),
datetime(2016, 1, 3)],
'd': [2, 3, 'd']},
columns=['a', 'b', 'c', 'd'])
# reconstruct frame to verify inference is same
tm.assert_frame_equal(df.reset_index(drop=True), expected)

def test_stale_cached_series_bug_473(self):

# this is chained, but ok
Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/series/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,3 +268,21 @@ def test_series_to_categorical(self):
expected = Series(['a', 'b', 'c'], dtype='category')

tm.assert_series_equal(result, expected)

def test_infer_objects_series(self):
# GH 11221
actual = Series(np.array([1, 2, 3], dtype='O')).infer_objects()
expected = Series([1, 2, 3])
tm.assert_series_equal(actual, expected)

actual = Series(np.array([1, 2, 3, None], dtype='O')).infer_objects()
expected = Series([1., 2., 3., np.nan])
tm.assert_series_equal(actual, expected)

# only soft conversions, uncovertable pass thru unchanged
actual = (Series(np.array([1, 2, 3, None, 'a'], dtype='O'))
.infer_objects())
expected = Series([1, 2, 3, None, 'a'])

assert actual.dtype == 'object'
tm.assert_series_equal(actual, expected)

0 comments on commit 9e7666d

Please sign in to comment.