Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

COMPAT: avoid calling getsizeof() on PyPy #17229

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -307,8 +307,10 @@ Bug Fixes
Conversion
^^^^^^^^^^

- Bug in assignment against datetime-like data with ``int`` may incorrectly converte to datetime-like (:issue:`14145`)
- Bug in assignment against datetime-like data with ``int`` may incorrectly convert to datetime-like (:issue:`14145`)
- Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`)
- Fix ``memory_usage`` to support PyPy. Objects on PyPy do not have a
fixed size, so an approximation is used instead (:issue:`17228`)


Indexing
Expand Down
2 changes: 2 additions & 0 deletions pandas/compat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from distutils.version import LooseVersion
from itertools import product
import sys
import platform
import types
from unicodedata import east_asian_width
import struct
Expand All @@ -41,6 +42,7 @@
PY3 = (sys.version_info[0] >= 3)
PY35 = (sys.version_info >= (3, 5))
PY36 = (sys.version_info >= (3, 6))
PYPY = (platform.python_implementation() == 'PyPy')

try:
import __builtin__ as builtins
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import pandas.core.nanops as nanops
import pandas._libs.lib as lib
from pandas.compat.numpy import function as nv
from pandas.compat import PYPY
from pandas.util._decorators import (Appender, cache_readonly,
deprecate_kwarg, Substitution)
from pandas.core.common import AbstractMethodError
Expand Down Expand Up @@ -1061,7 +1062,7 @@ def memory_usage(self, deep=False):
Notes
-----
Memory usage does not include memory consumed by elements that
are not components of the array if deep=False
are not components of the array if deep=False or if used on PyPy

See Also
--------
Expand All @@ -1071,9 +1072,8 @@ def memory_usage(self, deep=False):
return self.values.memory_usage(deep=deep)

v = self.values.nbytes
if deep and is_object_dtype(self):
if deep and is_object_dtype(self) and not PYPY:
v += lib.memory_usage_of_objects(self.values)

return v

def factorize(self, sort=False, na_sentinel=-1):
Expand Down
6 changes: 5 additions & 1 deletion pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,9 +465,13 @@ def _nbytes(self, deep=False):
*this is an internal routine*

"""

# for implementations with no useful getsizeof (PyPy)
objsize = 24

level_nbytes = sum((i.memory_usage(deep=deep) for i in self.levels))
label_nbytes = sum((i.nbytes for i in self.labels))
names_nbytes = sum((getsizeof(i) for i in self.names))
names_nbytes = sum((getsizeof(i, objsize) for i in self.names))
result = level_nbytes + label_nbytes + names_nbytes

# include our engine hashtable
Expand Down
8 changes: 6 additions & 2 deletions pandas/core/indexes/range.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,8 +194,12 @@ def _format_data(self):

@cache_readonly
def nbytes(self):
""" return the number of bytes in the underlying data """
return sum([getsizeof(getattr(self, v)) for v in
"""
Return the number of bytes in the underlying data
On implementations where this is undetermined (PyPy)
assume 24 bytes for each value
"""
return sum([getsizeof(getattr(self, v), 24) for v in
['_start', '_stop', '_step']])

def memory_usage(self, deep=False):
Expand Down
68 changes: 49 additions & 19 deletions pandas/tests/frame/test_repr_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import pytest

from pandas import (DataFrame, compat, option_context)
from pandas.compat import StringIO, lrange, u
from pandas.compat import StringIO, lrange, u, PYPY
import pandas.io.formats.format as fmt
import pandas as pd

Expand Down Expand Up @@ -323,23 +323,6 @@ def test_info_memory_usage(self):
# excluded column with object dtype, so estimate is accurate
assert not re.match(r"memory usage: [^+]+\+", res[-1])

df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
df_with_object_index.info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
assert re.match(r"memory usage: [^+]+\+", res[-1])

df_with_object_index.info(buf=buf, memory_usage='deep')
res = buf.getvalue().splitlines()
assert re.match(r"memory usage: [^+]+$", res[-1])

assert (df_with_object_index.memory_usage(
index=True, deep=True).sum() > df_with_object_index.memory_usage(
index=True).sum())

df_object = pd.DataFrame({'a': ['a']})
assert (df_object.memory_usage(deep=True).sum() >
df_object.memory_usage().sum())

# Test a DataFrame with duplicate columns
dtypes = ['int64', 'int64', 'int64', 'float64']
data = {}
Expand All @@ -349,6 +332,15 @@ def test_info_memory_usage(self):
df = DataFrame(data)
df.columns = dtypes

df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
df_with_object_index.info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
assert re.match(r"memory usage: [^+]+\+", res[-1])

df_with_object_index.info(buf=buf, memory_usage='deep')
res = buf.getvalue().splitlines()
assert re.match(r"memory usage: [^+]+$", res[-1])

# Ensure df size is as expected
# (cols * rows * bytes) + index size
df_size = df.memory_usage().sum()
Expand Down Expand Up @@ -377,9 +369,47 @@ def test_info_memory_usage(self):
df.memory_usage(index=True)
df.index.values.nbytes

mem = df.memory_usage(deep=True).sum()
assert mem > 0

@pytest.mark.skipif(PYPY,
reason="on PyPy deep=True doesn't change result")
def test_info_memory_usage_deep_not_pypy(self):
df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
assert (df_with_object_index.memory_usage(
index=True, deep=True).sum() >
df_with_object_index.memory_usage(
index=True).sum())

df_object = pd.DataFrame({'a': ['a']})
assert (df_object.memory_usage(deep=True).sum() >
df_object.memory_usage().sum())

@pytest.mark.skipif(not PYPY,
reason="on PyPy deep=True does not change result")
def test_info_memory_usage_deep_pypy(self):
df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
assert (df_with_object_index.memory_usage(
index=True, deep=True).sum() ==
df_with_object_index.memory_usage(
index=True).sum())

df_object = pd.DataFrame({'a': ['a']})
assert (df_object.memory_usage(deep=True).sum() ==
df_object.memory_usage().sum())

@pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
def test_usage_via_getsizeof(self):
df = DataFrame(
data=1,
index=pd.MultiIndex.from_product(
[['a'], range(1000)]),
columns=['A']
)
mem = df.memory_usage(deep=True).sum()
# sys.getsizeof will call the .memory_usage with
# deep=True, and add on some GC overhead
diff = df.memory_usage(deep=True).sum() - sys.getsizeof(df)
diff = mem - sys.getsizeof(df)
assert abs(diff) < 100

def test_info_memory_usage_qualified(self):
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import pandas.util.testing as tm
from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex, PeriodIndex,
Timedelta, IntervalIndex, Interval)
from pandas.compat import StringIO
from pandas.compat import StringIO, PYPY
from pandas.compat.numpy import np_array_datetime64_compat
from pandas.core.base import PandasDelegate, NoNewAttributesMixin
from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin
Expand Down Expand Up @@ -144,6 +144,7 @@ def f():

pytest.raises(TypeError, f)

@pytest.mark.skipif(PYPY, reason="not relevant for PyPy")
def test_memory_usage(self):
# Delegate does not implement memory_usage.
# Check that we fall back to in-built `__sizeof__`
Expand Down Expand Up @@ -941,6 +942,7 @@ def test_fillna(self):
# check shallow_copied
assert o is not result

@pytest.mark.skipif(PYPY, reason="not relevant for PyPy")
def test_memory_usage(self):
for o in self.objs:
res = o.memory_usage()
Expand Down
11 changes: 6 additions & 5 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
period_range, PeriodIndex,
timedelta_range, TimedeltaIndex, NaT,
Interval, IntervalIndex)
from pandas.compat import range, lrange, u, PY3
from pandas.compat import range, lrange, u, PY3, PYPY
from pandas.core.config import option_context


Expand Down Expand Up @@ -1448,10 +1448,11 @@ def test_memory_usage(self):
cat = pd.Categorical(['foo', 'foo', 'bar'])
assert cat.memory_usage(deep=True) > cat.nbytes

# sys.getsizeof will call the .memory_usage with
# deep=True, and add on some GC overhead
diff = cat.memory_usage(deep=True) - sys.getsizeof(cat)
assert abs(diff) < 100
if not PYPY:
# sys.getsizeof will call the .memory_usage with
# deep=True, and add on some GC overhead
diff = cat.memory_usage(deep=True) - sys.getsizeof(cat)
assert abs(diff) < 100

def test_searchsorted(self):
# https://github.com/pandas-dev/pandas/issues/8420
Expand Down
1 change: 0 additions & 1 deletion pandas/util/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@
K = 4
_RAISE_NETWORK_ERROR_DEFAULT = False


# set testing_mode
_testing_mode_warnings = (DeprecationWarning, compat.ResourceWarning)

Expand Down