From 524dbb3316bf3984606eac75243d4452290aee49 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 14 Oct 2018 22:08:06 -0700 Subject: [PATCH] BUG: Don't parse NaN as 'nan' in Data IO Closes gh-20377. --- doc/source/whatsnew/v0.24.0.txt | 39 +++++++++++++++++++-- pandas/_libs/lib.pyx | 54 ++++++++++++++++++++++++++--- pandas/core/dtypes/cast.py | 16 +++++---- pandas/io/parsers.py | 3 +- pandas/tests/io/parser/na_values.py | 16 +++++++++ pandas/tests/io/test_excel.py | 31 +++++++++++++++-- 6 files changed, 144 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 30536257215605..b84e63e354027c 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -440,7 +440,7 @@ In addition to these API breaking changes, many :ref:`performance improvements a Raise ValueError in ``DataFrame.to_dict(orient='index')`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with +Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with ``orient='index'`` and a non-unique index instead of losing data (:issue:`22801`) .. ipython:: python @@ -448,7 +448,7 @@ Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A']) df - + df.to_dict(orient='index') .. _whatsnew_0240.api.datetimelike.normalize: @@ -923,6 +923,41 @@ MultiIndex I/O ^^^ +.. _whatsnew_0240.bug_fixes.nan_with_str_dtype: + +Proper handling of `np.NaN` in a string data-typed column with the Python engine +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +There was a bug in :func:`read_excel` and :func:`read_csv` with the Python +engine, where missing values turned to ``'nan'`` with ``dtype=str`` and +``na_filter=True``. Now, these missing values are converted to the string +missing indicator, ``np.nan``. (:issue:`20377`) + +.. 
ipython:: python + :suppress: + + from pandas.compat import StringIO + +Previous Behavior: + +.. code-block:: ipython + + In [5]: data = 'a,b,c\n1,,3\n4,5,6' + In [6]: df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True) + In [7]: df.loc[0, 'b'] + Out[7]: + 'nan' + +Current Behavior: + +.. ipython:: python + + data = 'a,b,c\n1,,3\n4,5,6' + df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True) + df.loc[0, 'b'] + +Notice how we now output ``np.nan`` itself instead of a stringified form of it. + - :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`) - :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`) - :func:`read_csv()` and func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 0b9793a6ef97af..c5d5a431e81397 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -494,24 +494,70 @@ def astype_intsafe(ndarray[object] arr, new_dtype): return result -def astype_unicode(arr: ndarray) -> ndarray[object]: +def astype_unicode(arr: ndarray, + skipna: bool=False) -> ndarray[object]: + """ + Convert all elements in an array to unicode. + + Parameters + ---------- + arr : ndarray + The array whose elements we are casting. + skipna : bool, default False + Whether or not to leave nulls as-is instead of coercing them to + their stringified form (e.g. NaN becomes 'nan'). + + Returns + ------- + casted_arr : ndarray + A new array with the input array's elements casted. 
+ """ cdef: + object arr_i Py_ssize_t i, n = arr.size ndarray[object] result = np.empty(n, dtype=object) for i in range(n): - result[i] = unicode(arr[i]) + arr_i = arr[i] + + if not (skipna and checknull(arr_i)): + arr_i = unicode(arr_i) + + result[i] = arr_i return result -def astype_str(arr: ndarray) -> ndarray[object]: +def astype_str(arr: ndarray, + skipna: bool=False) -> ndarray[object]: + """ + Convert all elements in an array to string. + + Parameters + ---------- + arr : ndarray + The array whose elements we are casting. + skipna : bool, default False + Whether or not to leave nulls as-is instead of coercing them to + their stringified form (e.g. NaN becomes 'nan'). + + Returns + ------- + casted_arr : ndarray + A new array with the input array's elements casted. + """ cdef: + object arr_i Py_ssize_t i, n = arr.size ndarray[object] result = np.empty(n, dtype=object) for i in range(n): - result[i] = str(arr[i]) + arr_i = arr[i] + + if not (skipna and checknull(arr_i)): + arr_i = str(arr_i) + + result[i] = arr_i return result diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index a95a45d5f9ae4e..56bf394729773f 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -645,9 +645,9 @@ def conv(r, dtype): return [conv(r, dtype) for r, dtype in zip(result, dtypes)] -def astype_nansafe(arr, dtype, copy=True): - """ return a view if copy is False, but - need to be very careful as the result shape could change! +def astype_nansafe(arr, dtype, copy=True, skipna=False): + """ + Cast the elements of an array to a given dtype in a nan-safe manner. Parameters ---------- @@ -655,7 +655,9 @@ def astype_nansafe(arr, dtype, copy=True): dtype : np.dtype copy : bool, default True If False, a view will be attempted but may fail, if - e.g. the itemsizes don't align. + e.g. the item sizes don't align. + skipna : bool, default False + Whether or not we should skip NaN when casting as a string-type. 
""" # dispatch on extension dtype if needed @@ -668,10 +670,12 @@ def astype_nansafe(arr, dtype, copy=True): if issubclass(dtype.type, text_type): # in Py3 that's str, in Py2 that's unicode - return lib.astype_unicode(arr.ravel()).reshape(arr.shape) + return lib.astype_unicode(arr.ravel(), + skipna=skipna).reshape(arr.shape) elif issubclass(dtype.type, string_types): - return lib.astype_str(arr.ravel()).reshape(arr.shape) + return lib.astype_str(arr.ravel(), + skipna=skipna).reshape(arr.shape) elif is_datetime64_dtype(arr): if is_object_dtype(dtype): diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 1edc6f6e14442c..eeba30ed8a44f8 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1685,7 +1685,8 @@ def _cast_types(self, values, cast_type, column): else: try: - values = astype_nansafe(values, cast_type, copy=True) + values = astype_nansafe(values, cast_type, + copy=True, skipna=True) except ValueError: raise ValueError("Unable to convert column %s to " "type %s" % (column, cast_type)) diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py index 880ab707cfd07f..29aed63e657fb1 100644 --- a/pandas/tests/io/parser/na_values.py +++ b/pandas/tests/io/parser/na_values.py @@ -5,6 +5,7 @@ parsing for all of the parsers defined in parsers.py """ +import pytest import numpy as np from numpy import nan @@ -380,3 +381,18 @@ def test_inf_na_values_with_int_index(self): expected = DataFrame({"col1": [3, np.nan], "col2": [4, np.nan]}, index=Index([1, 2], name="idx")) tm.assert_frame_equal(out, expected) + + @pytest.mark.parametrize("na_filter", [True, False]) + def test_na_values_with_dtype_str_and_na_filter(self, na_filter): + # see gh-20377 + data = "a,b,c\n1,,3\n4,5,6" + + # na_filter=True --> missing value becomes NaN. + # na_filter=False --> missing value remains empty string. 
+ empty = np.nan if na_filter else "" + expected = DataFrame({"a": ["1", "4"], + "b": [empty, "5"], + "c": ["3", "6"]}) + + result = self.read_csv(StringIO(data), na_filter=na_filter, dtype=str) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index a639556eb07d6d..1bd2fb5887e384 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -14,7 +14,7 @@ import pandas as pd import pandas.util.testing as tm import pandas.util._test_decorators as td -from pandas import DataFrame, Index, MultiIndex +from pandas import DataFrame, Index, MultiIndex, Series from pandas.compat import u, range, map, BytesIO, iteritems, PY36 from pandas.core.config import set_option, get_option from pandas.io.common import URLError @@ -371,7 +371,34 @@ def test_reader_dtype(self, ext): tm.assert_frame_equal(actual, expected) with pytest.raises(ValueError): - actual = self.get_exceldf(basename, ext, dtype={'d': 'int64'}) + self.get_exceldf(basename, ext, dtype={'d': 'int64'}) + + @pytest.mark.parametrize("dtype,expected", [ + (None, + DataFrame({ + "a": [1, 2, 3, 4], + "b": [2.5, 3.5, 4.5, 5.5], + "c": [1, 2, 3, 4], + "d": [1.0, 2.0, np.nan, 4.0] + })), + ({"a": "float64", + "b": "float32", + "c": str, + "d": str + }, + DataFrame({ + "a": Series([1, 2, 3, 4], dtype="float64"), + "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"), + "c": ["001", "002", "003", "004"], + "d": ["1", "2", np.nan, "4"] + })), + ]) + def test_reader_dtype_str(self, ext, dtype, expected): + # see gh-20377 + basename = "testdtype" + + actual = self.get_exceldf(basename, ext, dtype=dtype) + tm.assert_frame_equal(actual, expected) def test_reading_all_sheets(self, ext): # Test reading all sheetnames by setting sheetname to None,