Closes #2960 int version of memory_usage (#3018)
* Closes #2960 int version of memory_usage

* add Categorical.nbytes function

* Fix doc string bug in Index.memory_usage

* parameterize PROTO_test/tests for memory_usage functions

---------

Co-authored-by: Amanda Potts <[email protected]>
ajpotts authored Mar 11, 2024
1 parent 7fb37ad commit ae7f6d5
Showing 15 changed files with 563 additions and 71 deletions.
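As a quick orientation before the file-by-file diff, here is a minimal sketch of how the API added in this commit might be used. It assumes a running Arkouda server reachable via ak.connect(); printed values depend on your data.

import arkouda as ak

ak.connect()  # assumes a locally running arkouda_server

df = ak.DataFrame({"a": ak.arange(1000), "b": ak.ones(1000)})

# memory_usage() now reports integer byte counts per column (index included
# by default), while memory_usage_info() formats the total in a chosen unit.
print(df.memory_usage())
print(df.memory_usage_info(unit="KB"))

# Categorical gains an nbytes property (see arkouda/categorical.py below).
c = ak.Categorical(ak.array(["a", "b", "a"]))
print(c.nbytes)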
29 changes: 27 additions & 2 deletions PROTO_tests/tests/dataframe_test.py
@@ -879,13 +879,13 @@ def test_isin(self):
         assert test_df["col_B"].to_list() == [False, False]
 
     def test_corr(self):
-        df = ak.DataFrame({'col1': [1, 2], 'col2': [-1, -2]})
+        df = ak.DataFrame({"col1": [1, 2], "col2": [-1, -2]})
         corr = df.corr()
         pd_corr = df.to_pandas().corr()
         assert_frame_equal(corr.to_pandas(retain_index=True), pd_corr)
 
         for i in range(5):
-            df = ak.DataFrame({'col1': ak.randint(0, 10, 10), 'col2': ak.randint(0, 10, 10)})
+            df = ak.DataFrame({"col1": ak.randint(0, 10, 10), "col2": ak.randint(0, 10, 10)})
             corr = df.corr()
             pd_corr = df.to_pandas().corr()
             assert_frame_equal(corr.to_pandas(retain_index=True), pd_corr)
@@ -990,6 +990,31 @@ def test_multi_col_merge(self):
         # assert_frame_equal(sorted_ak.to_pandas()[sorted_column_names],
         #                    sorted_pd[sorted_column_names])
 
+    def test_memory_usage(self):
+        dtypes = [ak.int64, ak.float64, ak.bool]
+        data = dict([(str(t), ak.ones(5000, dtype=ak.int64).astype(t)) for t in dtypes])
+        df = ak.DataFrame(data)
+        ak_memory_usage = df.memory_usage()
+        pd_memory_usage = pd.Series(
+            [40000, 40000, 40000, 5000], index=["Index", "int64", "float64", "bool"]
+        )
+        assert_series_equal(ak_memory_usage.to_pandas(), pd_memory_usage)
+
+        assert df.memory_usage_info(unit="B") == "125000.00 B"
+        assert df.memory_usage_info(unit="KB") == "122.07 KB"
+        assert df.memory_usage_info(unit="MB") == "0.12 MB"
+        assert df.memory_usage_info(unit="GB") == "0.00 GB"
+
+        ak_memory_usage = df.memory_usage(index=False)
+        pd_memory_usage = pd.Series([40000, 40000, 5000], index=["int64", "float64", "bool"])
+        assert_series_equal(ak_memory_usage.to_pandas(), pd_memory_usage)
+
+        ak_memory_usage = df.memory_usage(unit="KB")
+        pd_memory_usage = pd.Series(
+            [39.0625, 39.0625, 39.0625, 4.88281], index=["Index", "int64", "float64", "bool"]
+        )
+        assert_series_equal(ak_memory_usage.to_pandas(), pd_memory_usage)
+
 
 def pda_to_str_helper(pda):
     return ak.array([f"str {i}" for i in pda.to_list()])
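The expected values in test_memory_usage above follow directly from fixed-width itemsizes: each of the three columns holds 5,000 elements, int64 and float64 take 8 bytes per element, bool takes 1, and the default int64 index adds another 8 bytes per row. A pure-Python sanity check of those figures (no Arkouda server needed):

n = 5000
index_bytes = n * 8    # default int64 index
int64_bytes = n * 8
float64_bytes = n * 8
bool_bytes = n * 1

total = index_bytes + int64_bytes + float64_bytes + bool_bytes
assert total == 125000                   # "125000.00 B"
assert round(total / 1024, 2) == 122.07  # "122.07 KB"
assert 40000 / 1024 == 39.0625           # per-column KB value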
27 changes: 25 additions & 2 deletions PROTO_tests/tests/dtypes_test.py
@@ -72,6 +72,29 @@ def test_resolve_scalar_dtype(self):
         assert "uint64" == dtypes.resolve_scalar_dtype(2**63 + 1)
         assert "bigint" == dtypes.resolve_scalar_dtype(2**64)
 
+    @pytest.mark.parametrize("size", pytest.prob_size)
+    def test_nbytes(self, size):
+        from arkouda.dtypes import BigInt
+
+        a = ak.cast(ak.arange(size), dt="bigint")
+        assert a.nbytes == size * BigInt.itemsize
+
+        dtype_list = [
+            ak.dtypes.uint8,
+            ak.dtypes.uint64,
+            ak.dtypes.int64,
+            ak.dtypes.float64,
+            ak.dtypes.bool,
+        ]
+
+        for dt in dtype_list:
+            a = ak.array(ak.arange(size), dtype=dt)
+            assert a.nbytes == size * dt.itemsize
+
+        a = ak.array(["a", "b", "c"])
+        c = ak.Categorical(a)
+        assert c.nbytes == 82
+
     def test_pdarrays_datatypes(self):
         assert dtypes.dtype("int64") == ak.array(np.arange(10)).dtype
         assert dtypes.dtype("uint64") == ak.array(np.arange(10), ak.uint64).dtype
@@ -176,8 +199,8 @@ def test_scalars(self):
         ) == str(ak.int_scalars)
 
         assert (
-            "typing.Union[float, numpy.float64, numpy.float32, int, numpy.int8, numpy.int16, numpy.int32, "
-            + "numpy.int64, numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64]"
+            "typing.Union[float, numpy.float64, numpy.float32, int, numpy.int8, numpy.int16, "
+            + "numpy.int32, numpy.int64, numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64]"
         ) == str(ak.numeric_scalars)
 
         assert "typing.Union[str, numpy.str_]" == str(ak.str_scalars)
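The invariant exercised by test_nbytes is that, for fixed-width dtypes, nbytes is simply the element count times the dtype's itemsize (Categorical is the exception, summing its component arrays instead). A minimal illustration, assuming a connected Arkouda server:

import arkouda as ak

a = ak.arange(1000)  # int64 by default
assert a.nbytes == 1000 * ak.dtypes.int64.itemsize  # 8 bytes per element

b = a.astype(ak.float64)
assert b.nbytes == a.nbytes  # float64 is also 8 bytes wide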
21 changes: 21 additions & 0 deletions PROTO_tests/tests/index_test.py
@@ -50,6 +50,27 @@ def test_multiindex_creation(self, size):
         with pytest.raises(ValueError):
             idx = ak.MultiIndex([ak.arange(size), ak.arange(size - 1)])
 
+    @pytest.mark.parametrize("size", pytest.prob_size)
+    def test_memory_usage(self, size):
+        from arkouda.dtypes import BigInt
+        from arkouda.index import Index, MultiIndex
+
+        idx = Index(ak.cast(ak.array([1, 2, 3]), dt="bigint"))
+        assert idx.memory_usage() == 3 * BigInt.itemsize
+
+        idx = Index(ak.cast(ak.arange(size), dt="int64"))
+        assert idx.memory_usage(unit="GB") == size * ak.dtypes.int64.itemsize / (1024 * 1024 * 1024)
+        assert idx.memory_usage(unit="MB") == size * ak.dtypes.int64.itemsize / (1024 * 1024)
+        assert idx.memory_usage(unit="KB") == size * ak.dtypes.int64.itemsize / 1024
+        assert idx.memory_usage(unit="B") == size * ak.dtypes.int64.itemsize
+
+        midx = MultiIndex([ak.cast(ak.arange(size), dt="int64"), ak.cast(ak.arange(size), dt="int64")])
+        assert midx.memory_usage(unit="GB") == 2 * size * ak.dtypes.int64.itemsize / (1024 * 1024 * 1024)
+
+        assert midx.memory_usage(unit="MB") == 2 * size * ak.dtypes.int64.itemsize / (1024 * 1024)
+        assert midx.memory_usage(unit="KB") == 2 * size * ak.dtypes.int64.itemsize / 1024
+        assert midx.memory_usage(unit="B") == 2 * size * ak.dtypes.int64.itemsize
+
     def test_is_unique(self):
         i = ak.Index(ak.array([0, 1, 2]))
         assert i.is_unique
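The unit arithmetic in these assertions is binary: each step up from bytes divides by 1024. A small stand-alone sketch of that conversion (the to_unit helper is hypothetical, not part of the Arkouda API):

_FACTORS = {"B": 1, "KB": 1024, "MB": 1024**2, "GB": 1024**3}

def to_unit(nbytes: int, unit: str = "B") -> float:
    """Convert a raw byte count to the requested binary unit."""
    return nbytes / _FACTORS[unit]

assert to_unit(8192, "KB") == 8.0
assert to_unit(8192, "MB") == 8192 / (1024 * 1024)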
19 changes: 19 additions & 0 deletions PROTO_tests/tests/series_test.py
@@ -189,3 +189,22 @@ def test_index_as_index_compat(self):
         g = df.groupby(["a", "b"])
         series = ak.Series(data=g.sum("c")["c"], index=g.sum("c").index)
         g.broadcast(series)
+
+    @pytest.mark.parametrize("size", pytest.prob_size)
+    def test_memory_usage(self, size):
+        s = ak.Series(ak.arange(size))
+        assert s.memory_usage(unit="GB", index=False) == size * ak.dtypes.int64.itemsize / (
+            1024 * 1024 * 1024
+        )
+        assert s.memory_usage(unit="MB", index=False) == size * ak.dtypes.int64.itemsize / (1024 * 1024)
+        assert s.memory_usage(unit="KB", index=False) == size * ak.dtypes.int64.itemsize / 1024
+        assert s.memory_usage(unit="B", index=False) == size * ak.dtypes.int64.itemsize
+
+        assert s.memory_usage(unit="GB", index=True) == 2 * size * ak.dtypes.int64.itemsize / (
+            1024 * 1024 * 1024
+        )
+        assert s.memory_usage(unit="MB", index=True) == 2 * size * ak.dtypes.int64.itemsize / (
+            1024 * 1024
+        )
+        assert s.memory_usage(unit="KB", index=True) == 2 * size * ak.dtypes.int64.itemsize / 1024
+        assert s.memory_usage(unit="B", index=True) == 2 * size * ak.dtypes.int64.itemsize
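With index=True the totals above are exactly double the index=False figures: a Series built over ak.arange(size) carries a default int64 index of the same length and width as its values. For example, assuming a connected Arkouda server:

import arkouda as ak

s = ak.Series(ak.arange(100))
values_only = s.memory_usage(unit="B", index=False)  # 100 * 8 bytes
with_index = s.memory_usage(unit="B", index=True)    # values plus int64 index
assert with_index == 2 * values_only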
32 changes: 32 additions & 0 deletions arkouda/categorical.py
@@ -138,6 +138,38 @@ def __init__(self, values, **kwargs) -> None:
         self.dtype = str_
         self.registered_name: Optional[str] = None
 
+    @property
+    def nbytes(self):
+        """
+        The size of the Categorical in bytes.
+
+        Returns
+        -------
+        int
+            The size of the Categorical in bytes.
+        """
+        nbytes = 0
+        if self.categories is not None:
+            nbytes += self.categories.nbytes
+
+        if isinstance(self.codes, pdarray):
+            nbytes += self.codes.nbytes
+        elif isinstance(self.codes, akint64):
+            nbytes += 1
+
+        if isinstance(self.permutation, pdarray):
+            nbytes += self.permutation.nbytes
+        elif isinstance(self.permutation, akint64):
+            nbytes += 1
+
+        if isinstance(self.segments, pdarray):
+            nbytes += self.segments.nbytes
+        elif isinstance(self.segments, akint64):
+            nbytes += 1
+
+        return nbytes
+
     @classmethod
     @typechecked
     def from_codes(
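For reference, a short example of the new property in use, again assuming a connected Arkouda server. The exact count depends on the category labels and internal arrays; the dtypes test above pins it to 82 bytes for three one-character categories.

import arkouda as ak

c = ak.Categorical(ak.array(["low", "high", "low"]))
# Sums the bytes held by categories, codes, permutation, and segments.
print(c.nbytes)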