From 44e9fef3bd2d6b1aee16a4b4119898f2f19e9936 Mon Sep 17 00:00:00 2001
From: mattip
Date: Sat, 12 Aug 2017 22:44:00 +0300
Subject: [PATCH] COMPAT: avoid calling getsizeof() on PyPy

---
 doc/source/whatsnew/v0.21.0.txt      |  4 +++-
 pandas/compat/__init__.py            |  2 ++
 pandas/core/base.py                  |  6 ++---
 pandas/core/indexes/multi.py         |  3 ++-
 pandas/core/indexes/range.py         |  7 ++++--
 pandas/tests/frame/test_repr_info.py | 36 +++++++++++++++++++---------
 pandas/tests/test_base.py            |  4 +++-
 pandas/tests/test_categorical.py     | 11 +++++----
 pandas/util/testing.py               |  1 -
 9 files changed, 49 insertions(+), 25 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index cc9ab81ce0955..e79c319f8d73c 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -307,8 +307,10 @@ Bug Fixes
 Conversion
 ^^^^^^^^^^
 
-- Bug in assignment against datetime-like data with ``int`` may incorrectly converte to datetime-like (:issue:`14145`)
+- Bug in assignment against datetime-like data with ``int`` may incorrectly convert to datetime-like (:issue:`14145`)
 - Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`)
+- Pass a ``default`` value to the ``sys.getsizeof`` calls used in ``memory_usage`` to support PyPy. Objects on PyPy do not
+  have a fixed size, so an approximate guess is used instead (:issue:`17228`)
 
 
 Indexing
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
index 33b41d61aa978..b367fda002b74 100644
--- a/pandas/compat/__init__.py
+++ b/pandas/compat/__init__.py
@@ -31,6 +31,7 @@
 from distutils.version import LooseVersion
 from itertools import product
 import sys
+import platform
 import types
 from unicodedata import east_asian_width
 import struct
@@ -41,6 +42,7 @@
 PY3 = (sys.version_info[0] >= 3)
 PY35 = (sys.version_info >= (3, 5))
 PY36 = (sys.version_info >= (3, 6))
+PYPY = (platform.python_implementation() == 'PyPy')
 
 try:
     import __builtin__ as builtins
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 8f21e3125a27e..4ae4736035793 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -15,6 +15,7 @@
 import pandas.core.nanops as nanops
 import pandas._libs.lib as lib
 from pandas.compat.numpy import function as nv
+from pandas.compat import PYPY
 from pandas.util._decorators import (Appender, cache_readonly,
                                      deprecate_kwarg, Substitution)
 from pandas.core.common import AbstractMethodError
@@ -1061,7 +1062,7 @@ def memory_usage(self, deep=False):
         Notes
         -----
         Memory usage does not include memory consumed by elements that
-        are not components of the array if deep=False
+        are not components of the array if deep=False or if used on PyPy
 
         See Also
         --------
@@ -1071,9 +1072,8 @@ def memory_usage(self, deep=False):
             return self.values.memory_usage(deep=deep)
 
         v = self.values.nbytes
-        if deep and is_object_dtype(self):
+        if deep and is_object_dtype(self) and not PYPY:
             v += lib.memory_usage_of_objects(self.values)
-
         return v
 
     def factorize(self, sort=False, na_sentinel=-1):
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 420788f9008cd..4d917407d4aae 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -467,7 +467,8 @@ def _nbytes(self, deep=False):
         """
         level_nbytes = sum((i.memory_usage(deep=deep) for i in self.levels))
         label_nbytes = sum((i.nbytes for i in self.labels))
-        names_nbytes = sum((getsizeof(i) for i in self.names))
+        objsize = 24  # for implementations with no useful getsizeof (PyPy)
+        names_nbytes = sum((getsizeof(i, objsize) for i in self.names))
         result = level_nbytes + label_nbytes + names_nbytes
 
         # include our engine hashtable
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
index 5071b50bbebdf..671d581b33583 100644
--- a/pandas/core/indexes/range.py
+++ b/pandas/core/indexes/range.py
@@ -194,8 +194,11 @@ def _format_data(self):
 
     @cache_readonly
     def nbytes(self):
-        """ return the number of bytes in the underlying data """
-        return sum([getsizeof(getattr(self, v)) for v in
+        """ return the number of bytes in the underlying data
+        On implementations where this is problematic (PyPy)
+        assume 24 bytes for each value
+        """
+        return sum([getsizeof(getattr(self, v), 24) for v in
                     ['_start', '_stop', '_step']])
 
     def memory_usage(self, deep=False):
diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py
index c317ad542659a..188ad37c7e617 100644
--- a/pandas/tests/frame/test_repr_info.py
+++ b/pandas/tests/frame/test_repr_info.py
@@ -11,7 +11,7 @@
 import pytest
 
 from pandas import (DataFrame, compat, option_context)
-from pandas.compat import StringIO, lrange, u
+from pandas.compat import StringIO, lrange, u, PYPY
 import pandas.io.formats.format as fmt
 import pandas as pd
 
@@ -332,13 +332,25 @@ def test_info_memory_usage(self):
         res = buf.getvalue().splitlines()
         assert re.match(r"memory usage: [^+]+$", res[-1])
 
-        assert (df_with_object_index.memory_usage(
-            index=True, deep=True).sum() > df_with_object_index.memory_usage(
-                index=True).sum())
+        if PYPY:
+            assert (df_with_object_index.memory_usage(
+                index=True, deep=True).sum() ==
+                df_with_object_index.memory_usage(
+                    index=True).sum())
 
-        df_object = pd.DataFrame({'a': ['a']})
-        assert (df_object.memory_usage(deep=True).sum() >
-                df_object.memory_usage().sum())
+            df_object = pd.DataFrame({'a': ['a']})
+            assert (df_object.memory_usage(deep=True).sum() ==
+                    df_object.memory_usage().sum())
+
+        else:
+            assert (df_with_object_index.memory_usage(
+                index=True, deep=True).sum() >
+                df_with_object_index.memory_usage(
+                    index=True).sum())
+
+            df_object = pd.DataFrame({'a': ['a']})
+            assert (df_object.memory_usage(deep=True).sum() >
+                    df_object.memory_usage().sum())
 
         # Test a DataFrame with duplicate columns
         dtypes = ['int64', 'int64', 'int64', 'float64']
@@ -377,10 +389,12 @@ def test_info_memory_usage(self):
 
         df.memory_usage(index=True)
         df.index.values.nbytes
 
-        # sys.getsizeof will call the .memory_usage with
-        # deep=True, and add on some GC overhead
-        diff = df.memory_usage(deep=True).sum() - sys.getsizeof(df)
-        assert abs(diff) < 100
+        mem = df.memory_usage(deep=True).sum()
+        if not PYPY:
+            # sys.getsizeof will call the .memory_usage with
+            # deep=True, and add on some GC overhead
+            diff = mem - sys.getsizeof(df)
+            assert abs(diff) < 100
 
     def test_info_memory_usage_qualified(self):
diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py
index 9af4a9edeb8b1..9e92c7cf1a9b8 100644
--- a/pandas/tests/test_base.py
+++ b/pandas/tests/test_base.py
@@ -15,7 +15,7 @@
 import pandas.util.testing as tm
 from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex, PeriodIndex,
                     Timedelta, IntervalIndex, Interval)
-from pandas.compat import StringIO
+from pandas.compat import StringIO, PYPY
 from pandas.compat.numpy import np_array_datetime64_compat
 from pandas.core.base import PandasDelegate, NoNewAttributesMixin
 from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin
@@ -144,6 +144,7 @@ def f():
 
         pytest.raises(TypeError, f)
 
+    @pytest.mark.skipif(PYPY, reason="not relevant for PyPy")
     def test_memory_usage(self):
         # Delegate does not implement memory_usage.
         # Check that we fall back to in-built `__sizeof__`
@@ -941,6 +942,7 @@ def test_fillna(self):
         # check shallow_copied
         assert o is not result
 
+    @pytest.mark.skipif(PYPY, reason="not relevant for PyPy")
     def test_memory_usage(self):
         for o in self.objs:
             res = o.memory_usage()
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index eecdd672095b0..a0b585a16ad9a 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -24,7 +24,7 @@
                     period_range, PeriodIndex, timedelta_range, TimedeltaIndex,
                     NaT, Interval, IntervalIndex)
 
-from pandas.compat import range, lrange, u, PY3
+from pandas.compat import range, lrange, u, PY3, PYPY
 from pandas.core.config import option_context
 
 
@@ -1448,10 +1448,11 @@ def test_memory_usage(self):
         cat = pd.Categorical(['foo', 'foo', 'bar'])
         assert cat.memory_usage(deep=True) > cat.nbytes
 
-        # sys.getsizeof will call the .memory_usage with
-        # deep=True, and add on some GC overhead
-        diff = cat.memory_usage(deep=True) - sys.getsizeof(cat)
-        assert abs(diff) < 100
+        if not PYPY:
+            # sys.getsizeof will call the .memory_usage with
+            # deep=True, and add on some GC overhead
+            diff = cat.memory_usage(deep=True) - sys.getsizeof(cat)
+            assert abs(diff) < 100
 
     def test_searchsorted(self):
         # https://github.com/pandas-dev/pandas/issues/8420
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index d6ba9561340cc..b9ef6135a1833 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -56,7 +56,6 @@
 
 K = 4
 _RAISE_NETWORK_ERROR_DEFAULT = False
 
-
 # set testing_mode
 _testing_mode_warnings = (DeprecationWarning, compat.ResourceWarning)
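
For reference, the compatibility trick used throughout the diff is the optional second argument of ``sys.getsizeof``: when an object cannot report its size (the situation on PyPy), the supplied default is returned instead of raising ``TypeError``. Below is a minimal sketch of that pattern, assuming PyPy behaves as the patch expects; the helper name ``approx_sizeof`` and the sample values are illustrative only and are not part of the patch:

    import sys

    def approx_sizeof(obj, objsize=24):
        # CPython: return the size reported by obj.__sizeof__() (plus any GC
        # overhead).  PyPy: objects do not have a fixed size, so the fallback
        # value is returned rather than raising TypeError.
        return sys.getsizeof(obj, objsize)

    # Rough total for a set of index names, mirroring MultiIndex._nbytes above.
    names = ['first', 'second', None]
    names_nbytes = sum(approx_sizeof(name) for name in names)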