Skip to content

Commit

Permalink
BUG: Don't parse NaN as 'nan' in Data IO
Browse files Browse the repository at this point in the history
Closes gh-20377.
  • Loading branch information
gfyoung committed Oct 17, 2018
1 parent 913f71f commit 524dbb3
Show file tree
Hide file tree
Showing 6 changed files with 144 additions and 15 deletions.
39 changes: 37 additions & 2 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -440,15 +440,15 @@ In addition to these API breaking changes, many :ref:`performance improvements a
Raise ValueError in ``DataFrame.to_dict(orient='index')``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with
Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with
``orient='index'`` and a non-unique index instead of losing data (:issue:`22801`)

.. ipython:: python
:okexcept:

df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A'])
df

df.to_dict(orient='index')

.. _whatsnew_0240.api.datetimelike.normalize:
Expand Down Expand Up @@ -923,6 +923,41 @@ MultiIndex
I/O
^^^

.. _whatsnew_0240.bug_fixes.nan_with_str_dtype:

Proper handling of ``np.NaN`` in a string data-typed column with the Python engine
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

There was a bug in :func:`read_excel` and :func:`read_csv` with the Python
engine, where missing values turned to ``'nan'`` with ``dtype=str`` and
``na_filter=True``. Now, these missing values are converted to the string
missing indicator, ``np.nan``. (:issue:`20377`)

.. ipython:: python
:suppress:

from pandas.compat import StringIO

Previous Behavior:

.. code-block:: ipython

In [5]: data = 'a,b,c\n1,,3\n4,5,6'
In [6]: df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True)
In [7]: df.loc[0, 'b']
Out[7]:
'nan'

Current Behavior:

.. ipython:: python

data = 'a,b,c\n1,,3\n4,5,6'
df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True)
df.loc[0, 'b']

Notice how we now output ``np.nan`` itself instead of a stringified form of it.

- :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`)
- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
- :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)
Expand Down
54 changes: 50 additions & 4 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -494,24 +494,70 @@ def astype_intsafe(ndarray[object] arr, new_dtype):
return result


def astype_unicode(arr: ndarray) -> ndarray[object]:
def astype_unicode(arr: ndarray,
skipna: bool=False) -> ndarray[object]:
"""
Convert all elements in an array to unicode.

Parameters
----------
arr : ndarray
The array whose elements we are casting.
skipna : bool, default False
Whether or not to leave nulls as-is instead of coercing them to
their stringified form (e.g. with skipna=False, NaN becomes 'nan').

Returns
-------
casted_arr : ndarray
A new array with the input array's elements casted.
"""
cdef:
object arr_i
Py_ssize_t i, n = arr.size
ndarray[object] result = np.empty(n, dtype=object)

for i in range(n):
result[i] = unicode(arr[i])
arr_i = arr[i]

if not (skipna and checknull(arr_i)):
arr_i = unicode(arr_i)

result[i] = arr_i

return result


def astype_str(arr: ndarray) -> ndarray[object]:
def astype_str(arr: ndarray,
skipna: bool=False) -> ndarray[object]:
"""
Convert all elements in an array to string.

Parameters
----------
arr : ndarray
The array whose elements we are casting.
skipna : bool, default False
Whether or not to leave nulls as-is instead of coercing them to
their stringified form (e.g. with skipna=False, NaN becomes 'nan').

Returns
-------
casted_arr : ndarray
A new array with the input array's elements casted.
"""
cdef:
object arr_i
Py_ssize_t i, n = arr.size
ndarray[object] result = np.empty(n, dtype=object)

for i in range(n):
result[i] = str(arr[i])
arr_i = arr[i]

if not (skipna and checknull(arr_i)):
arr_i = str(arr_i)

result[i] = arr_i

return result

Expand Down
16 changes: 10 additions & 6 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,17 +645,19 @@ def conv(r, dtype):
return [conv(r, dtype) for r, dtype in zip(result, dtypes)]


def astype_nansafe(arr, dtype, copy=True):
""" return a view if copy is False, but
need to be very careful as the result shape could change!
def astype_nansafe(arr, dtype, copy=True, skipna=False):
"""
Cast the elements of an array to a given dtype in a nan-safe manner.
Parameters
----------
arr : ndarray
dtype : np.dtype
copy : bool, default True
If False, a view will be attempted but may fail, if
e.g. the itemsizes don't align.
e.g. the item sizes don't align.
skipna : bool, default False
Whether or not we should skip NaN (leave it as-is) when casting as a string-type.
"""

# dispatch on extension dtype if needed
Expand All @@ -668,10 +670,12 @@ def astype_nansafe(arr, dtype, copy=True):

if issubclass(dtype.type, text_type):
# in Py3 that's str, in Py2 that's unicode
return lib.astype_unicode(arr.ravel()).reshape(arr.shape)
return lib.astype_unicode(arr.ravel(),
skipna=skipna).reshape(arr.shape)

elif issubclass(dtype.type, string_types):
return lib.astype_str(arr.ravel()).reshape(arr.shape)
return lib.astype_str(arr.ravel(),
skipna=skipna).reshape(arr.shape)

elif is_datetime64_dtype(arr):
if is_object_dtype(dtype):
Expand Down
3 changes: 2 additions & 1 deletion pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1685,7 +1685,8 @@ def _cast_types(self, values, cast_type, column):

else:
try:
values = astype_nansafe(values, cast_type, copy=True)
values = astype_nansafe(values, cast_type,
copy=True, skipna=True)
except ValueError:
raise ValueError("Unable to convert column %s to "
"type %s" % (column, cast_type))
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/io/parser/na_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
parsing for all of the parsers defined in parsers.py
"""

import pytest
import numpy as np
from numpy import nan

Expand Down Expand Up @@ -380,3 +381,18 @@ def test_inf_na_values_with_int_index(self):
expected = DataFrame({"col1": [3, np.nan], "col2": [4, np.nan]},
index=Index([1, 2], name="idx"))
tm.assert_frame_equal(out, expected)

@pytest.mark.parametrize("na_filter", [True, False])
def test_na_values_with_dtype_str_and_na_filter(self, na_filter):
# see gh-20377
data = "a,b,c\n1,,3\n4,5,6"

# na_filter=True --> missing value becomes NaN.
# na_filter=False --> missing value remains empty string.
empty = np.nan if na_filter else ""
expected = DataFrame({"a": ["1", "4"],
"b": [empty, "5"],
"c": ["3", "6"]})

result = self.read_csv(StringIO(data), na_filter=na_filter, dtype=str)
tm.assert_frame_equal(result, expected)
31 changes: 29 additions & 2 deletions pandas/tests/io/test_excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import pandas as pd
import pandas.util.testing as tm
import pandas.util._test_decorators as td
from pandas import DataFrame, Index, MultiIndex
from pandas import DataFrame, Index, MultiIndex, Series
from pandas.compat import u, range, map, BytesIO, iteritems, PY36
from pandas.core.config import set_option, get_option
from pandas.io.common import URLError
Expand Down Expand Up @@ -371,7 +371,34 @@ def test_reader_dtype(self, ext):
tm.assert_frame_equal(actual, expected)

with pytest.raises(ValueError):
actual = self.get_exceldf(basename, ext, dtype={'d': 'int64'})
self.get_exceldf(basename, ext, dtype={'d': 'int64'})

@pytest.mark.parametrize("dtype,expected", [
(None,
DataFrame({
"a": [1, 2, 3, 4],
"b": [2.5, 3.5, 4.5, 5.5],
"c": [1, 2, 3, 4],
"d": [1.0, 2.0, np.nan, 4.0]
})),
({"a": "float64",
"b": "float32",
"c": str,
"d": str
},
DataFrame({
"a": Series([1, 2, 3, 4], dtype="float64"),
"b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"),
"c": ["001", "002", "003", "004"],
"d": ["1", "2", np.nan, "4"]
})),
])
def test_reader_dtype_str(self, ext, dtype, expected):
# see gh-20377
basename = "testdtype"

actual = self.get_exceldf(basename, ext, dtype=dtype)
tm.assert_frame_equal(actual, expected)

def test_reading_all_sheets(self, ext):
# Test reading all sheetnames by setting sheetname to None,
Expand Down

0 comments on commit 524dbb3

Please sign in to comment.