Closes #2960 int version of memory_usage (#3018)
* Closes #2960 int version of memory_usage

* add Categorical.nbytes function

* Fix doc string bug in Index.memory_usage

* parameterize PROTO_test/tests for memory_usage functions

---------

Co-authored-by: Amanda Potts <[email protected]>
ajpotts authored Mar 11, 2024
1 parent 7fb37ad commit ae7f6d5
Showing 15 changed files with 563 additions and 71 deletions.
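As a quick orientation before the file-by-file diff, here is a minimal sketch of how the API added in this commit might be used. It assumes a running Arkouda server reachable via ak.connect(); printed values depend on your data.

import arkouda as ak

ak.connect()  # assumes a locally running arkouda_server

df = ak.DataFrame({"a": ak.arange(1000), "b": ak.ones(1000)})

# memory_usage() now reports integer byte counts per column (index included
# by default), while memory_usage_info() formats the total in a chosen unit.
print(df.memory_usage())
print(df.memory_usage_info(unit="KB"))

# Categorical gains an nbytes property (see arkouda/categorical.py below).
c = ak.Categorical(ak.array(["a", "b", "a"]))
print(c.nbytes)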
29 changes: 27 additions & 2 deletions PROTO_tests/tests/dataframe_test.py
@@ -879,13 +879,13 @@ def test_isin(self):
         assert test_df["col_B"].to_list() == [False, False]
 
     def test_corr(self):
-        df = ak.DataFrame({'col1': [1, 2], 'col2': [-1, -2]})
+        df = ak.DataFrame({"col1": [1, 2], "col2": [-1, -2]})
         corr = df.corr()
         pd_corr = df.to_pandas().corr()
         assert_frame_equal(corr.to_pandas(retain_index=True), pd_corr)
 
         for i in range(5):
-            df = ak.DataFrame({'col1': ak.randint(0, 10, 10), 'col2': ak.randint(0, 10, 10)})
+            df = ak.DataFrame({"col1": ak.randint(0, 10, 10), "col2": ak.randint(0, 10, 10)})
             corr = df.corr()
             pd_corr = df.to_pandas().corr()
             assert_frame_equal(corr.to_pandas(retain_index=True), pd_corr)
@@ -990,6 +990,31 @@ def test_multi_col_merge(self):
         # assert_frame_equal(sorted_ak.to_pandas()[sorted_column_names],
         #                    sorted_pd[sorted_column_names])
 
+    def test_memory_usage(self):
+        dtypes = [ak.int64, ak.float64, ak.bool]
+        data = dict([(str(t), ak.ones(5000, dtype=ak.int64).astype(t)) for t in dtypes])
+        df = ak.DataFrame(data)
+        ak_memory_usage = df.memory_usage()
+        pd_memory_usage = pd.Series(
+            [40000, 40000, 40000, 5000], index=["Index", "int64", "float64", "bool"]
+        )
+        assert_series_equal(ak_memory_usage.to_pandas(), pd_memory_usage)
+
+        assert df.memory_usage_info(unit="B") == "125000.00 B"
+        assert df.memory_usage_info(unit="KB") == "122.07 KB"
+        assert df.memory_usage_info(unit="MB") == "0.12 MB"
+        assert df.memory_usage_info(unit="GB") == "0.00 GB"
+
+        ak_memory_usage = df.memory_usage(index=False)
+        pd_memory_usage = pd.Series([40000, 40000, 5000], index=["int64", "float64", "bool"])
+        assert_series_equal(ak_memory_usage.to_pandas(), pd_memory_usage)
+
+        ak_memory_usage = df.memory_usage(unit="KB")
+        pd_memory_usage = pd.Series(
+            [39.0625, 39.0625, 39.0625, 4.88281], index=["Index", "int64", "float64", "bool"]
+        )
+        assert_series_equal(ak_memory_usage.to_pandas(), pd_memory_usage)
+
 
 def pda_to_str_helper(pda):
     return ak.array([f"str {i}" for i in pda.to_list()])
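The expected values in test_memory_usage above follow directly from fixed-width itemsizes: each of the three columns holds 5,000 elements, int64 and float64 take 8 bytes per element, bool takes 1, and the default int64 index adds another 8 bytes per row. A pure-Python sanity check of those figures (no Arkouda server needed):

n = 5000
index_bytes = n * 8    # default int64 index
int64_bytes = n * 8
float64_bytes = n * 8
bool_bytes = n * 1

total = index_bytes + int64_bytes + float64_bytes + bool_bytes
assert total == 125000                   # "125000.00 B"
assert round(total / 1024, 2) == 122.07  # "122.07 KB"
assert 40000 / 1024 == 39.0625           # per-column KB value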
27 changes: 25 additions & 2 deletions PROTO_tests/tests/dtypes_test.py
@@ -72,6 +72,29 @@ def test_resolve_scalar_dtype(self):
         assert "uint64" == dtypes.resolve_scalar_dtype(2**63 + 1)
         assert "bigint" == dtypes.resolve_scalar_dtype(2**64)
 
+    @pytest.mark.parametrize("size", pytest.prob_size)
+    def test_nbytes(self, size):
+        from arkouda.dtypes import BigInt
+
+        a = ak.cast(ak.arange(size), dt="bigint")
+        assert a.nbytes == size * BigInt.itemsize
+
+        dtype_list = [
+            ak.dtypes.uint8,
+            ak.dtypes.uint64,
+            ak.dtypes.int64,
+            ak.dtypes.float64,
+            ak.dtypes.bool,
+        ]
+
+        for dt in dtype_list:
+            a = ak.array(ak.arange(size), dtype=dt)
+            assert a.nbytes == size * dt.itemsize
+
+        a = ak.array(["a", "b", "c"])
+        c = ak.Categorical(a)
+        assert c.nbytes == 82
+
     def test_pdarrays_datatypes(self):
         assert dtypes.dtype("int64") == ak.array(np.arange(10)).dtype
         assert dtypes.dtype("uint64") == ak.array(np.arange(10), ak.uint64).dtype
@@ -176,8 +199,8 @@ def test_scalars(self):
         ) == str(ak.int_scalars)
 
         assert (
-            "typing.Union[float, numpy.float64, numpy.float32, int, numpy.int8, numpy.int16, numpy.int32, "
-            + "numpy.int64, numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64]"
+            "typing.Union[float, numpy.float64, numpy.float32, int, numpy.int8, numpy.int16, "
+            + "numpy.int32, numpy.int64, numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64]"
         ) == str(ak.numeric_scalars)
 
         assert "typing.Union[str, numpy.str_]" == str(ak.str_scalars)
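The invariant exercised by test_nbytes is that, for fixed-width dtypes, nbytes is simply the element count times the dtype's itemsize (Categorical is the exception, summing its component arrays instead). A minimal illustration, assuming a connected Arkouda server:

import arkouda as ak

a = ak.arange(1000)  # int64 by default
assert a.nbytes == 1000 * ak.dtypes.int64.itemsize  # 8 bytes per element

b = a.astype(ak.float64)
assert b.nbytes == a.nbytes  # float64 is also 8 bytes wide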
21 changes: 21 additions & 0 deletions PROTO_tests/tests/index_test.py
@@ -50,6 +50,27 @@ def test_multiindex_creation(self, size):
         with pytest.raises(ValueError):
             idx = ak.MultiIndex([ak.arange(size), ak.arange(size - 1)])
 
+    @pytest.mark.parametrize("size", pytest.prob_size)
+    def test_memory_usage(self, size):
+        from arkouda.dtypes import BigInt
+        from arkouda.index import Index, MultiIndex
+
+        idx = Index(ak.cast(ak.array([1, 2, 3]), dt="bigint"))
+        assert idx.memory_usage() == 3 * BigInt.itemsize
+
+        idx = Index(ak.cast(ak.arange(size), dt="int64"))
+        assert idx.memory_usage(unit="GB") == size * ak.dtypes.int64.itemsize / (1024 * 1024 * 1024)
+        assert idx.memory_usage(unit="MB") == size * ak.dtypes.int64.itemsize / (1024 * 1024)
+        assert idx.memory_usage(unit="KB") == size * ak.dtypes.int64.itemsize / 1024
+        assert idx.memory_usage(unit="B") == size * ak.dtypes.int64.itemsize
+
+        midx = MultiIndex([ak.cast(ak.arange(size), dt="int64"), ak.cast(ak.arange(size), dt="int64")])
+        assert midx.memory_usage(unit="GB") == 2 * size * ak.dtypes.int64.itemsize / (1024 * 1024 * 1024)
+
+        assert midx.memory_usage(unit="MB") == 2 * size * ak.dtypes.int64.itemsize / (1024 * 1024)
+        assert midx.memory_usage(unit="KB") == 2 * size * ak.dtypes.int64.itemsize / 1024
+        assert midx.memory_usage(unit="B") == 2 * size * ak.dtypes.int64.itemsize
+
     def test_is_unique(self):
         i = ak.Index(ak.array([0, 1, 2]))
         assert i.is_unique
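The unit arithmetic in these assertions is binary: each step up from bytes divides by 1024. A small stand-alone sketch of that conversion (the to_unit helper is hypothetical, not part of the Arkouda API):

_FACTORS = {"B": 1, "KB": 1024, "MB": 1024**2, "GB": 1024**3}

def to_unit(nbytes: int, unit: str = "B") -> float:
    """Convert a raw byte count to the requested binary unit."""
    return nbytes / _FACTORS[unit]

assert to_unit(8192, "KB") == 8.0
assert to_unit(8192, "MB") == 8192 / (1024 * 1024)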
19 changes: 19 additions & 0 deletions PROTO_tests/tests/series_test.py
@@ -189,3 +189,22 @@ def test_index_as_index_compat(self):
         g = df.groupby(["a", "b"])
         series = ak.Series(data=g.sum("c")["c"], index=g.sum("c").index)
         g.broadcast(series)
+
+    @pytest.mark.parametrize("size", pytest.prob_size)
+    def test_memory_usage(self, size):
+        s = ak.Series(ak.arange(size))
+        assert s.memory_usage(unit="GB", index=False) == size * ak.dtypes.int64.itemsize / (
+            1024 * 1024 * 1024
+        )
+        assert s.memory_usage(unit="MB", index=False) == size * ak.dtypes.int64.itemsize / (1024 * 1024)
+        assert s.memory_usage(unit="KB", index=False) == size * ak.dtypes.int64.itemsize / 1024
+        assert s.memory_usage(unit="B", index=False) == size * ak.dtypes.int64.itemsize
+
+        assert s.memory_usage(unit="GB", index=True) == 2 * size * ak.dtypes.int64.itemsize / (
+            1024 * 1024 * 1024
+        )
+        assert s.memory_usage(unit="MB", index=True) == 2 * size * ak.dtypes.int64.itemsize / (
+            1024 * 1024
+        )
+        assert s.memory_usage(unit="KB", index=True) == 2 * size * ak.dtypes.int64.itemsize / 1024
+        assert s.memory_usage(unit="B", index=True) == 2 * size * ak.dtypes.int64.itemsize
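With index=True the totals above are exactly double the index=False figures: a Series built over ak.arange(size) carries a default int64 index of the same length and width as its values. For example, assuming a connected Arkouda server:

import arkouda as ak

s = ak.Series(ak.arange(100))
values_only = s.memory_usage(unit="B", index=False)  # 100 * 8 bytes
with_index = s.memory_usage(unit="B", index=True)    # values plus int64 index
assert with_index == 2 * values_only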
32 changes: 32 additions & 0 deletions arkouda/categorical.py
@@ -138,6 +138,38 @@ def __init__(self, values, **kwargs) -> None:
         self.dtype = str_
         self.registered_name: Optional[str] = None
 
+    @property
+    def nbytes(self):
+        """
+        The size of the Categorical in bytes.
+
+        Returns
+        -------
+        int
+            The size of the Categorical in bytes.
+        """
+        nbytes = 0
+        if self.categories is not None:
+            nbytes += self.categories.nbytes
+
+        if isinstance(self.codes, pdarray):
+            nbytes += self.codes.nbytes
+        elif isinstance(self.codes, akint64):
+            nbytes += 1
+
+        if isinstance(self.permutation, pdarray):
+            nbytes += self.permutation.nbytes
+        elif isinstance(self.permutation, akint64):
+            nbytes += 1
+
+        if isinstance(self.segments, pdarray):
+            nbytes += self.segments.nbytes
+        elif isinstance(self.segments, akint64):
+            nbytes += 1
+
+        return nbytes
+
     @classmethod
     @typechecked
     def from_codes(
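For reference, a short example of the new property in use, again assuming a connected Arkouda server. The exact count depends on the category labels and internal arrays; the dtypes test above pins it to 82 bytes for three one-character categories.

import arkouda as ak

c = ak.Categorical(ak.array(["low", "high", "low"]))
# Sums the bytes held by categories, codes, permutation, and segments.
print(c.nbytes)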