Skip to content

Commit

Permalink
COMPAT: Pypy tweaks (pandas-dev#17351)
Browse files Browse the repository at this point in the history
  • Loading branch information
mattip authored and jbrockmendel committed Sep 8, 2017
1 parent 2be1405 commit aafa941
Show file tree
Hide file tree
Showing 6 changed files with 92 additions and 17 deletions.
11 changes: 9 additions & 2 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -371,13 +371,11 @@ Performance Improvements
Bug Fixes
~~~~~~~~~


Conversion
^^^^^^^^^^

- Bug in assignment against datetime-like data with ``int`` may incorrectly convert to datetime-like (:issue:`14145`)
- Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`)
- Fix :func:`DataFrame.memory_usage` to support PyPy. Objects on PyPy do not have a fixed size, so an approximation is used instead (:issue:`17228`)
- Fixed the return type of ``IntervalIndex.is_non_overlapping_monotonic`` to be a Python ``bool`` for consistency with similar attributes/methods. Previously returned a ``numpy.bool_``. (:issue:`17237`)
- Bug in ``IntervalIndex.is_non_overlapping_monotonic`` when intervals are closed on both sides and overlap at a point (:issue:`16560`)
- Bug in :func:`Series.fillna` returns frame when ``inplace=True`` and ``value`` is dict (:issue:`16156`)
Expand Down Expand Up @@ -463,6 +461,15 @@ Categorical
the ``.categories`` to be an empty ``Float64Index`` rather than an empty
``Index`` with object dtype (:issue:`17248`)

PyPy
^^^^

- Compatibility with PyPy in :func:`read_csv` with ``usecols=[<unsorted ints>]`` and
:func:`read_json` (:issue:`17351`)
- Split tests into cases for CPython and PyPy where needed, which highlights the fragility
of index matching with ``float('nan')``, ``np.nan`` and ``NAT`` (:issue:`17351`)
- Fix :func:`DataFrame.memory_usage` to support PyPy. Objects on PyPy do not have a fixed size,
so an approximation is used instead (:issue:`17228`)

Other
^^^^^
Expand Down
16 changes: 8 additions & 8 deletions pandas/_libs/src/ujson/python/JSONtoObj.c
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,7 @@ JSOBJ Object_npyEndObject(void *prv, JSOBJ obj) {
}

int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) {
PyObject *label;
PyObject *label, *labels;
npy_intp labelidx;
// add key to label array, value to values array
NpyArrContext *npyarr = (NpyArrContext *)obj;
Expand All @@ -424,11 +424,11 @@ int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) {
if (!npyarr->labels[labelidx]) {
npyarr->labels[labelidx] = PyList_New(0);
}

labels = npyarr->labels[labelidx];
// only fill label array once, assumes all column labels are the same
// for 2-dimensional arrays.
if (PyList_GET_SIZE(npyarr->labels[labelidx]) <= npyarr->elcount) {
PyList_Append(npyarr->labels[labelidx], label);
if (PyList_Check(labels) && PyList_GET_SIZE(labels) <= npyarr->elcount) {
PyList_Append(labels, label);
}

if (((JSONObjectDecoder *)npyarr->dec)->arrayAddItem(prv, obj, value)) {
Expand All @@ -439,16 +439,16 @@ int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) {
}

int Object_objectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) {
PyDict_SetItem(obj, name, value);
int ret = PyDict_SetItem(obj, name, value);
Py_DECREF((PyObject *)name);
Py_DECREF((PyObject *)value);
return 1;
return ret == 0 ? 1 : 0;
}

int Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value) {
PyList_Append(obj, value);
int ret = PyList_Append(obj, value);
Py_DECREF((PyObject *)value);
return 1;
return ret == 0 ? 1 : 0;
}

JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end) {
Expand Down
1 change: 1 addition & 0 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1716,6 +1716,7 @@ def _set_noconvert_columns(self):
# A set of integers will be converted to a list in
# the correct order every single time.
usecols = list(self.usecols)
usecols.sort()
elif (callable(self.usecols) or
self.usecols_dtype not in ('empty', None)):
# The names attribute should have the correct columns
Expand Down
16 changes: 12 additions & 4 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from pandas.tests.indexes.common import Base

from pandas.compat import (range, lrange, lzip, u,
text_type, zip, PY3, PY36)
text_type, zip, PY3, PY36, PYPY)
import operator
import numpy as np

Expand Down Expand Up @@ -1370,13 +1370,21 @@ def test_isin(self):
assert len(result) == 0
assert result.dtype == np.bool_

def test_isin_nan(self):
@pytest.mark.skipif(PYPY, reason="np.nan is float('nan') on PyPy")
def test_isin_nan_not_pypy(self):
tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([float('nan')]),
np.array([False, False]))

@pytest.mark.skipif(not PYPY, reason="np.nan is float('nan') on PyPy")
def test_isin_nan_pypy(self):
tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([float('nan')]),
np.array([False, True]))

def test_isin_nan_common(self):
tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([np.nan]),
np.array([False, True]))
tm.assert_numpy_array_equal(Index(['a', pd.NaT]).isin([pd.NaT]),
np.array([False, True]))
tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([float('nan')]),
np.array([False, False]))
tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([pd.NaT]),
np.array([False, False]))

Expand Down
13 changes: 11 additions & 2 deletions pandas/tests/indexes/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from pandas import (CategoricalIndex, DataFrame, Index, MultiIndex,
compat, date_range, period_range)
from pandas.compat import PY3, long, lrange, lzip, range, u
from pandas.compat import PY3, long, lrange, lzip, range, u, PYPY
from pandas.errors import PerformanceWarning, UnsortedIndexError
from pandas.core.indexes.base import InvalidIndexError
from pandas._libs import lib
Expand Down Expand Up @@ -2571,13 +2571,22 @@ def test_isin(self):
assert len(result) == 0
assert result.dtype == np.bool_

def test_isin_nan(self):
@pytest.mark.skipif(PYPY, reason="tuples cmp recursively on PyPy")
def test_isin_nan_not_pypy(self):
idx = MultiIndex.from_arrays([['foo', 'bar'], [1.0, np.nan]])
tm.assert_numpy_array_equal(idx.isin([('bar', np.nan)]),
np.array([False, False]))
tm.assert_numpy_array_equal(idx.isin([('bar', float('nan'))]),
np.array([False, False]))

@pytest.mark.skipif(not PYPY, reason="tuples cmp recursively on PyPy")
def test_isin_nan_pypy(self):
idx = MultiIndex.from_arrays([['foo', 'bar'], [1.0, np.nan]])
tm.assert_numpy_array_equal(idx.isin([('bar', np.nan)]),
np.array([False, True]))
tm.assert_numpy_array_equal(idx.isin([('bar', float('nan'))]),
np.array([False, True]))

def test_isin_level_kwarg(self):
idx = MultiIndex.from_arrays([['qux', 'baz', 'foo', 'bar'], np.arange(
4)])
Expand Down
52 changes: 51 additions & 1 deletion pandas/tests/io/parser/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
import os
import pandas.util.testing as tm

from pandas import read_csv, read_table
from pandas import read_csv, read_table, DataFrame
from pandas.core.common import AbstractMethodError
from pandas._libs.lib import Timestamp
from pandas.compat import StringIO

from .common import ParserTests
from .header import HeaderTests
Expand Down Expand Up @@ -100,3 +102,51 @@ def read_table(self, *args, **kwds):
kwds = kwds.copy()
kwds['engine'] = self.engine
return read_table(*args, **kwds)


class TestUnsortedUsecols(object):
def test_override__set_noconvert_columns(self):
# GH 17351 - usecols needs to be sorted in _setnoconvert_columns
# based on the test_usecols_with_parse_dates test from usecols.py
from pandas.io.parsers import CParserWrapper, TextFileReader

s = """a,b,c,d,e
0,1,20140101,0900,4
0,1,20140102,1000,4"""

parse_dates = [[1, 2]]
cols = {
'a': [0, 0],
'c_d': [
Timestamp('2014-01-01 09:00:00'),
Timestamp('2014-01-02 10:00:00')
]
}
expected = DataFrame(cols, columns=['c_d', 'a'])

class MyTextFileReader(TextFileReader):
def __init__(self):
self._currow = 0
self.squeeze = False

class MyCParserWrapper(CParserWrapper):
def _set_noconvert_columns(self):
if self.usecols_dtype == 'integer':
# self.usecols is a set, which is documented as unordered
# but in practice, a CPython set of integers is sorted.
# In other implementations this assumption does not hold.
# The following code simulates a different order, which
# before GH 17351 would cause the wrong columns to be
# converted via the parse_dates parameter
self.usecols = list(self.usecols)
self.usecols.reverse()
return CParserWrapper._set_noconvert_columns(self)

parser = MyTextFileReader()
parser.options = {'usecols': [0, 2, 3],
'parse_dates': parse_dates,
'delimiter': ','}
parser._engine = MyCParserWrapper(StringIO(s), **parser.options)
df = parser.read()

tm.assert_frame_equal(df, expected)

0 comments on commit aafa941

Please sign in to comment.