From d2623e423c3f27e6c13553d84c0143061830d0a1 Mon Sep 17 00:00:00 2001
From: mattip <matti.picus@gmail.com>
Date: Mon, 14 Aug 2017 23:23:14 +0300
Subject: [PATCH] COMPAT: avoid calling getsizeof() on PyPy

---
 doc/source/whatsnew/v0.21.0.txt      |  4 +-
 pandas/compat/__init__.py            |  2 +
 pandas/core/base.py                  |  6 +--
 pandas/core/indexes/multi.py         |  6 ++-
 pandas/core/indexes/range.py         |  8 +++-
 pandas/tests/frame/test_repr_info.py | 68 ++++++++++++++++++++--------
 pandas/tests/test_base.py            |  4 +-
 pandas/tests/test_categorical.py     | 11 +++--
 pandas/util/testing.py               |  1 -
 9 files changed, 77 insertions(+), 33 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index cc9ab81ce0955..c898f4f28067a 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -307,8 +307,10 @@ Bug Fixes
 Conversion
 ^^^^^^^^^^
 
-- Bug in assignment against datetime-like data with ``int`` may incorrectly converte to datetime-like (:issue:`14145`)
+- Bug in assignment against datetime-like data with ``int`` may incorrectly convert to datetime-like (:issue:`14145`)
 - Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`)
+- Fix ``memory_usage`` to support PyPy. Objects on PyPy do not have a
+  fixed size, so an approximation is used instead (:issue:`17228`)
 
 
 Indexing
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
index 33b41d61aa978..b367fda002b74 100644
--- a/pandas/compat/__init__.py
+++ b/pandas/compat/__init__.py
@@ -31,6 +31,7 @@
 from distutils.version import LooseVersion
 from itertools import product
 import sys
+import platform
 import types
 from unicodedata import east_asian_width
 import struct
@@ -41,6 +42,7 @@
 PY3 = (sys.version_info[0] >= 3)
 PY35 = (sys.version_info >= (3, 5))
 PY36 = (sys.version_info >= (3, 6))
+PYPY = (platform.python_implementation() == 'PyPy')
 
 try:
     import __builtin__ as builtins
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 8f21e3125a27e..4ae4736035793 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -15,6 +15,7 @@
 import pandas.core.nanops as nanops
 import pandas._libs.lib as lib
 from pandas.compat.numpy import function as nv
+from pandas.compat import PYPY
 from pandas.util._decorators import (Appender, cache_readonly,
                                      deprecate_kwarg, Substitution)
 from pandas.core.common import AbstractMethodError
@@ -1061,7 +1062,7 @@ def memory_usage(self, deep=False):
         Notes
         -----
         Memory usage does not include memory consumed by elements that
-        are not components of the array if deep=False
+        are not components of the array if deep=False or if used on PyPy
 
         See Also
         --------
@@ -1071,9 +1072,8 @@ def memory_usage(self, deep=False):
             return self.values.memory_usage(deep=deep)
 
         v = self.values.nbytes
-        if deep and is_object_dtype(self):
+        if deep and is_object_dtype(self) and not PYPY:
             v += lib.memory_usage_of_objects(self.values)
-
         return v
 
     def factorize(self, sort=False, na_sentinel=-1):
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 420788f9008cd..ea45b4700172f 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -465,9 +465,13 @@ def _nbytes(self, deep=False):
         *this is in internal routine*
 
         """
+
+        # default size for implementations with no useful getsizeof (PyPy)
+        objsize = 24
+
         level_nbytes = sum((i.memory_usage(deep=deep) for i in self.levels))
         label_nbytes = sum((i.nbytes for i in self.labels))
-        names_nbytes = sum((getsizeof(i) for i in self.names))
+        names_nbytes = sum((getsizeof(i, objsize) for i in self.names))
         result = level_nbytes + label_nbytes + names_nbytes
 
         # include our engine hashtable
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
index 5071b50bbebdf..ac4cc6986cace 100644
--- a/pandas/core/indexes/range.py
+++ b/pandas/core/indexes/range.py
@@ -194,8 +194,12 @@ def _format_data(self):
 
     @cache_readonly
     def nbytes(self):
-        """ return the number of bytes in the underlying data """
-        return sum([getsizeof(getattr(self, v)) for v in
+        """
+        Return the number of bytes in the underlying data.
+        On implementations where this is undetermined (PyPy),
+        assume 24 bytes for each value.
+        """
+        return sum([getsizeof(getattr(self, v), 24) for v in
                     ['_start', '_stop', '_step']])
 
     def memory_usage(self, deep=False):
diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py
index c317ad542659a..37f8c0cc85b23 100644
--- a/pandas/tests/frame/test_repr_info.py
+++ b/pandas/tests/frame/test_repr_info.py
@@ -11,7 +11,7 @@
 import pytest
 
 from pandas import (DataFrame, compat, option_context)
-from pandas.compat import StringIO, lrange, u
+from pandas.compat import StringIO, lrange, u, PYPY
 import pandas.io.formats.format as fmt
 import pandas as pd
 
@@ -323,23 +323,6 @@ def test_info_memory_usage(self):
         # excluded column with object dtype, so estimate is accurate
         assert not re.match(r"memory usage: [^+]+\+", res[-1])
 
-        df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
-        df_with_object_index.info(buf=buf, memory_usage=True)
-        res = buf.getvalue().splitlines()
-        assert re.match(r"memory usage: [^+]+\+", res[-1])
-
-        df_with_object_index.info(buf=buf, memory_usage='deep')
-        res = buf.getvalue().splitlines()
-        assert re.match(r"memory usage: [^+]+$", res[-1])
-
-        assert (df_with_object_index.memory_usage(
-            index=True, deep=True).sum() > df_with_object_index.memory_usage(
-            index=True).sum())
-
-        df_object = pd.DataFrame({'a': ['a']})
-        assert (df_object.memory_usage(deep=True).sum() >
-                df_object.memory_usage().sum())
-
         # Test a DataFrame with duplicate columns
         dtypes = ['int64', 'int64', 'int64', 'float64']
         data = {}
@@ -349,6 +332,15 @@ def test_info_memory_usage(self):
         df = DataFrame(data)
         df.columns = dtypes
 
+        df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
+        df_with_object_index.info(buf=buf, memory_usage=True)
+        res = buf.getvalue().splitlines()
+        assert re.match(r"memory usage: [^+]+\+", res[-1])
+
+        df_with_object_index.info(buf=buf, memory_usage='deep')
+        res = buf.getvalue().splitlines()
+        assert re.match(r"memory usage: [^+]+$", res[-1])
+
         # Ensure df size is as expected
         # (cols * rows * bytes) + index size
         df_size = df.memory_usage().sum()
@@ -377,9 +369,47 @@ def test_info_memory_usage(self):
         df.memory_usage(index=True)
         df.index.values.nbytes
 
+        mem = df.memory_usage(deep=True).sum()
+        assert mem > 0
+
+    @pytest.mark.skipif(PYPY,
+                        reason="on PyPy deep=True doesn't change result")
+    def test_info_memory_usage_deep_not_pypy(self):
+        df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
+        assert (df_with_object_index.memory_usage(
+                index=True, deep=True).sum() >
+                df_with_object_index.memory_usage(
+                    index=True).sum())
+
+        df_object = pd.DataFrame({'a': ['a']})
+        assert (df_object.memory_usage(deep=True).sum() >
+                df_object.memory_usage().sum())
+
+    @pytest.mark.skipif(not PYPY,
+                        reason="on PyPy deep=True does not change result")
+    def test_info_memory_usage_deep_pypy(self):
+        df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
+        assert (df_with_object_index.memory_usage(
+                index=True, deep=True).sum() ==
+                df_with_object_index.memory_usage(
+                    index=True).sum())
+
+        df_object = pd.DataFrame({'a': ['a']})
+        assert (df_object.memory_usage(deep=True).sum() ==
+                df_object.memory_usage().sum())
+
+    @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
+    def test_usage_via_getsizeof(self):
+        df = DataFrame(
+            data=1,
+            index=pd.MultiIndex.from_product(
+                [['a'], range(1000)]),
+            columns=['A']
+        )
+        mem = df.memory_usage(deep=True).sum()
         # sys.getsizeof will call the .memory_usage with
         # deep=True, and add on some GC overhead
-        diff = df.memory_usage(deep=True).sum() - sys.getsizeof(df)
+        diff = mem - sys.getsizeof(df)
         assert abs(diff) < 100
 
     def test_info_memory_usage_qualified(self):
diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py
index 9af4a9edeb8b1..9e92c7cf1a9b8 100644
--- a/pandas/tests/test_base.py
+++ b/pandas/tests/test_base.py
@@ -15,7 +15,7 @@
 import pandas.util.testing as tm
 from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex, PeriodIndex,
                     Timedelta, IntervalIndex, Interval)
-from pandas.compat import StringIO
+from pandas.compat import StringIO, PYPY
 from pandas.compat.numpy import np_array_datetime64_compat
 from pandas.core.base import PandasDelegate, NoNewAttributesMixin
 from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin
@@ -144,6 +144,7 @@ def f():
 
         pytest.raises(TypeError, f)
 
+    @pytest.mark.skipif(PYPY, reason="not relevant for PyPy")
     def test_memory_usage(self):
         # Delegate does not implement memory_usage.
         # Check that we fall back to in-built `__sizeof__`
@@ -941,6 +942,7 @@ def test_fillna(self):
                 # check shallow_copied
                 assert o is not result
 
+    @pytest.mark.skipif(PYPY, reason="not relevant for PyPy")
     def test_memory_usage(self):
         for o in self.objs:
             res = o.memory_usage()
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index eecdd672095b0..a0b585a16ad9a 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -24,7 +24,7 @@
                     period_range, PeriodIndex,
                     timedelta_range, TimedeltaIndex, NaT,
                     Interval, IntervalIndex)
-from pandas.compat import range, lrange, u, PY3
+from pandas.compat import range, lrange, u, PY3, PYPY
 from pandas.core.config import option_context
 
 
@@ -1448,10 +1448,11 @@ def test_memory_usage(self):
         cat = pd.Categorical(['foo', 'foo', 'bar'])
         assert cat.memory_usage(deep=True) > cat.nbytes
 
-        # sys.getsizeof will call the .memory_usage with
-        # deep=True, and add on some GC overhead
-        diff = cat.memory_usage(deep=True) - sys.getsizeof(cat)
-        assert abs(diff) < 100
+        if not PYPY:
+            # sys.getsizeof will call the .memory_usage with
+            # deep=True, and add on some GC overhead
+            diff = cat.memory_usage(deep=True) - sys.getsizeof(cat)
+            assert abs(diff) < 100
 
     def test_searchsorted(self):
         # https://github.com/pandas-dev/pandas/issues/8420
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index d6ba9561340cc..b9ef6135a1833 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -56,7 +56,6 @@
 K = 4
 _RAISE_NETWORK_ERROR_DEFAULT = False
 
-
 # set testing_mode
 _testing_mode_warnings = (DeprecationWarning, compat.ResourceWarning)