Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Some consolidation of indexed frame methods #10167

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 0 additions & 23 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from __future__ import annotations, division, print_function

import pickle
import warnings
from typing import Any, Set

import pandas as pd
Expand Down Expand Up @@ -1350,28 +1349,6 @@ def isin(self, values):

return self._values.isin(values).values

def memory_usage(self, deep=False):
    """
    Memory usage of the values.

    Parameters
    ----------
    deep : bool
        Ignored. The parameter is accepted only for pandas
        compatibility; passing a truthy value emits a warning and does
        not change the result.

    Returns
    -------
    bytes used
        Whatever ``self._values.memory_usage()`` reports.
    """
    if deep:
        # No deep introspection is performed; tell the caller rather than
        # silently returning the same number.
        warnings.warn(
            "The deep parameter is ignored and is only included "
            "for pandas compatibility."
        )
    return self._values.memory_usage()

@classmethod
def from_pandas(cls, index, nan_as_null=None):
"""
Expand Down
7 changes: 2 additions & 5 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,12 @@
pandas_dtypes_alias_to_cudf_alias,
pandas_dtypes_to_np_dtypes,
)
from cudf.utils.utils import mask_dtype
from cudf.utils.utils import NotIterable, mask_dtype

T = TypeVar("T", bound="ColumnBase")


class ColumnBase(Column, Serializable):
class ColumnBase(Column, Serializable, NotIterable):
def as_frame(self) -> "cudf.core.frame.Frame":
"""
Converts a Column to Frame
Expand Down Expand Up @@ -130,9 +130,6 @@ def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series":
pd_series.index = index
return pd_series

def __iter__(self):
    """Delegate iteration requests to the shared cudf helper.

    Calls ``cudf.utils.utils.raise_iteration_error`` with this object.
    NOTE(review): judging by the helper's name it raises instead of
    returning an iterator -- confirm against its definition.
    """
    cudf.utils.utils.raise_iteration_error(obj=self)

@property
def values_host(self) -> "np.ndarray":
"""
Expand Down
68 changes: 3 additions & 65 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1242,66 +1242,9 @@ def _slice(self: T, arg: slice) -> T:
return result

def memory_usage(self, index=True, deep=False):
"""
Return the memory usage of each column in bytes.
The memory usage can optionally include the contribution of
the index and elements of `object` dtype.

Parameters
----------
index : bool, default True
Specifies whether to include the memory usage of the DataFrame's
index in returned Series. If ``index=True``, the memory usage of
the index is the first item in the output.
deep : bool, default False
If True, introspect the data deeply by interrogating
`object` dtypes for system-level memory consumption, and include
it in the returned values.

Returns
-------
Series
A Series whose index is the original column names and whose values
is the memory usage of each column in bytes.

Examples
--------
>>> dtypes = ['int64', 'float64', 'object', 'bool']
>>> data = dict([(t, np.ones(shape=5000).astype(t))
... for t in dtypes])
>>> df = cudf.DataFrame(data)
>>> df.head()
int64 float64 object bool
0 1 1.0 1.0 True
1 1 1.0 1.0 True
2 1 1.0 1.0 True
3 1 1.0 1.0 True
4 1 1.0 1.0 True
>>> df.memory_usage(index=False)
int64 40000
float64 40000
object 40000
bool 5000
dtype: int64

Use a Categorical for efficient storage of an object-dtype column with
many repeated values.

>>> df['object'].astype('category').memory_usage(deep=True)
5008
"""
if deep:
warnings.warn(
"The deep parameter is ignored and is only included "
"for pandas compatibility."
)
ind = list(self.columns)
sizes = [col.memory_usage() for col in self._data.columns]
if index:
ind.append("Index")
ind = cudf.Index(ind, dtype="str")
sizes.append(self.index.memory_usage())
return Series(sizes, index=ind)
return Series(
{str(k): v for k, v in super().memory_usage(index, deep).items()}
)

def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
if method == "__call__" and hasattr(cudf, ufunc.__name__):
Expand Down Expand Up @@ -2547,11 +2490,6 @@ def reset_index(
inplace=inplace,
)

def take(self, indices, axis=0):
    """Return the result of the parent ``take`` with this object's columns.

    Parameters
    ----------
    indices
        Forwarded unchanged to the parent class's ``take``.
    axis : default 0
        Accepted but never read by this method; it is not forwarded to
        the parent call.

    Returns
    -------
    The object produced by ``super().take(indices)`` after its
    ``columns`` attribute has been set to ``self.columns``.
    """
    out = super().take(indices)
    # Re-apply this object's column labels to the parent's result.
    out.columns = self.columns
    return out

@annotate("INSERT", color="green", domain="cudf_python")
def insert(self, loc, name, value, nan_as_null=None):
"""Add a column to DataFrame at the index specified by loc.
Expand Down
20 changes: 20 additions & 0 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,26 @@ def empty(self):
"""
return self.size == 0

def memory_usage(self, deep=False):
    """Return the memory usage of each column of an object.

    Parameters
    ----------
    deep : bool
        The deep parameter is ignored and is only included for pandas
        compatibility. Passing a truthy value emits a warning.

    Returns
    -------
    dict
        A mapping from each column name in ``self._data`` to the value
        reported by that column's ``memory_usage()``. The per-column
        values are not summed here.
    """
    if deep:
        warnings.warn(
            "The deep parameter is ignored and is only included "
            "for pandas compatibility."
        )
    return {name: col.memory_usage() for name, col in self._data.items()}

def __len__(self):
return self._num_rows

Expand Down
3 changes: 3 additions & 0 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -826,6 +826,9 @@ def _concat(cls, objs):
result.name = name
return result

def memory_usage(self, deep=False):
    """Return the total memory usage as a single number.

    Parameters
    ----------
    deep : bool, default False
        Forwarded unchanged to the parent class's ``memory_usage``.

    Returns
    -------
    The sum of the values of the mapping returned by the parent class's
    ``memory_usage``.
    """
    per_column = super().memory_usage(deep=deep)
    return sum(per_column.values())

@annotate("INDEX_EQUALS", color="green", domain="cudf_python")
def equals(self, other, **kwargs):
"""
Expand Down
62 changes: 62 additions & 0 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,68 @@ def sort_index(
out = out.reset_index(drop=True)
return self._mimic_inplace(out, inplace=inplace)

def memory_usage(self, index=True, deep=False):
    """Return the memory usage of an object.

    Parameters
    ----------
    index : bool, default True
        Specifies whether to include the memory usage of the index.
    deep : bool, default False
        The deep parameter is ignored and is only included for pandas
        compatibility.

    Returns
    -------
    Series or scalar
        For DataFrame, a Series whose index is the original column names
        and whose values are the memory usage of each column in bytes.
        For a Series, the total memory usage.

    Examples
    --------
    **DataFrame**

    >>> dtypes = ['int64', 'float64', 'object', 'bool']
    >>> data = dict([(t, np.ones(shape=5000).astype(t))
    ...              for t in dtypes])
    >>> df = cudf.DataFrame(data)
    >>> df.head()
       int64  float64  object  bool
    0      1      1.0     1.0  True
    1      1      1.0     1.0  True
    2      1      1.0     1.0  True
    3      1      1.0     1.0  True
    4      1      1.0     1.0  True
    >>> df.memory_usage(index=False)
    int64      40000
    float64    40000
    object     40000
    bool        5000
    dtype: int64

    Use a Categorical for efficient storage of an object-dtype column with
    many repeated values.

    >>> df['object'].astype('category').memory_usage(deep=True)
    5008

    **Series**

    >>> s = cudf.Series(range(3), index=['a','b','c'])
    >>> s.memory_usage()
    43

    Not including the index gives the size of the rest of the data, which
    is necessarily smaller:

    >>> s.memory_usage(index=False)
    24
    """
    # The parent call supplies the per-column usage; when requested, the
    # index contribution is stored in the same mapping under "Index".
    usage = super().memory_usage(deep=deep)
    if index:
        usage["Index"] = self.index.memory_usage()
    return usage

def hash_values(self, method="murmur3"):
"""Compute the hash of values in this column.

Expand Down
28 changes: 10 additions & 18 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import itertools
import numbers
import pickle
import warnings
from collections.abc import Sequence
from numbers import Integral
from typing import Any, List, MutableMapping, Optional, Tuple, Union
Expand All @@ -23,10 +22,14 @@
from cudf.core._compat import PANDAS_GE_120
from cudf.core.frame import Frame
from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index
from cudf.utils.utils import _maybe_indices_to_slice, cached_property
from cudf.utils.utils import (
NotIterable,
_maybe_indices_to_slice,
cached_property,
)


class MultiIndex(Frame, BaseIndex):
class MultiIndex(Frame, BaseIndex, NotIterable):
"""A multi-level or hierarchical index.

Provides N-Dimensional indexing into Series and DataFrame objects.
Expand Down Expand Up @@ -367,9 +370,6 @@ def copy(

return mi

def __iter__(self):
    """Delegate iteration requests to the shared cudf helper.

    Calls ``cudf.utils.utils.raise_iteration_error`` with this object.
    NOTE(review): judging by the helper's name it raises instead of
    returning an iterator -- confirm against its definition.
    """
    cudf.utils.utils.raise_iteration_error(obj=self)

def __repr__(self):
max_seq_items = get_option("display.max_seq_items") or len(self)

Expand Down Expand Up @@ -1412,22 +1412,14 @@ def _clean_nulls_from_index(self):
)

def memory_usage(self, deep=False):
if deep:
warnings.warn(
"The deep parameter is ignored and is only included "
"for pandas compatibility."
)

n = 0
for col in self._data.columns:
n += col.memory_usage()
usage = sum(super().memory_usage(deep=deep).values())
if self.levels:
for level in self.levels:
n += level.memory_usage(deep=deep)
usage += level.memory_usage(deep=deep)
if self.codes:
for col in self.codes._data.columns:
n += col.memory_usage()
return n
usage += col.memory_usage()
return usage

def difference(self, other, sort=None):
if hasattr(other, "to_pandas"):
Expand Down
52 changes: 6 additions & 46 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -953,52 +953,7 @@ def to_frame(self, name=None):
return cudf.DataFrame({col: self._column}, index=self.index)

def memory_usage(self, index=True, deep=False):
"""
Return the memory usage of the Series.

The memory usage can optionally include the contribution of
the index and of elements of `object` dtype.

Parameters
----------
index : bool, default True
Specifies whether to include the memory usage of the Series index.
deep : bool, default False
If True, introspect the data deeply by interrogating
`object` dtypes for system-level memory consumption, and include
it in the returned value.

Returns
-------
int
Bytes of memory consumed.

See Also
--------
cudf.DataFrame.memory_usage : Bytes consumed by
a DataFrame.

Examples
--------
>>> s = cudf.Series(range(3), index=['a','b','c'])
>>> s.memory_usage()
43

Not including the index gives the size of the rest of the data, which
is necessarily smaller:

>>> s.memory_usage(index=False)
24
"""
if deep:
warnings.warn(
"The deep parameter is ignored and is only included "
"for pandas compatibility."
)
n = self._column.memory_usage()
if index:
n += self._index.memory_usage()
return n
return sum(super().memory_usage(index, deep).values())

def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
if method == "__call__":
Expand Down Expand Up @@ -3327,6 +3282,11 @@ def merge(
method="hash",
suffixes=("_x", "_y"),
):
warnings.warn(
"Series.merge is deprecated and will be removed in a future "
"release. Use cudf.merge instead.",
FutureWarning,
)
if left_on not in (self.name, None):
raise ValueError(
"Series to other merge uses series name as key implicitly"
Expand Down
Loading