Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REF: Implement BaseMaskedArray class for integer/boolean ExtensionArrays #30789

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 6 additions & 96 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
is_extension_array_dtype,
is_float,
is_float_dtype,
is_integer,
is_integer_dtype,
is_list_like,
is_numeric_dtype,
Expand All @@ -27,10 +26,8 @@
from pandas.core.dtypes.missing import isna, notna

from pandas.core import nanops, ops
from pandas.core.algorithms import take
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
import pandas.core.common as com
from pandas.core.indexers import check_bool_array_indexer

from .masked import BaseMaskedArray
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you use absolute imports here (i.e. `from pandas.core.arrays.masked import BaseMaskedArray`)?


if TYPE_CHECKING:
from pandas._typing import Scalar
Expand Down Expand Up @@ -197,7 +194,7 @@ def coerce_to_array(values, mask=None, copy: bool = False):
return values, mask


class BooleanArray(ExtensionArray, ExtensionOpsMixin):
class BooleanArray(BaseMaskedArray):
"""
Array of boolean (True/False) data with missing values.

Expand Down Expand Up @@ -251,6 +248,9 @@ class BooleanArray(ExtensionArray, ExtensionOpsMixin):
Length: 3, dtype: boolean
"""

# The value used to fill '_data' to avoid upcasting
_internal_fill_value = False

def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
if not (isinstance(values, np.ndarray) and values.dtype == np.bool_):
raise TypeError(
Expand Down Expand Up @@ -298,24 +298,6 @@ def _from_factorized(cls, values, original: "BooleanArray"):
def _formatter(self, boxed=False):
return str

@property
def _hasna(self) -> bool:
# Note: this is expensive right now! The hope is that we can
# make this faster by having an optional mask, but not have to change
# source code using it.
return self._mask.any()

def __getitem__(self, item):
if is_integer(item):
if self._mask[item]:
return self.dtype.na_value
return self._data[item]

elif com.is_bool_indexer(item):
item = check_bool_array_indexer(self, item)

return type(self)(self._data[item], self._mask[item])

def to_numpy(
self, dtype=None, copy=False, na_value: "Scalar" = lib._no_default,
):
Expand Down Expand Up @@ -393,24 +375,6 @@ def to_numpy(
data = self._data.astype(dtype, copy=copy)
return data

__array_priority__ = 1000 # higher than ndarray so ops dispatch to us

def __array__(self, dtype=None):
"""
the array interface, return my values
We return an object array here to preserve our scalar values
"""
# by default (no dtype specified), return an object array
return self.to_numpy(dtype=dtype)

def __arrow_array__(self, type=None):
"""
Convert myself into a pyarrow Array.
"""
import pyarrow as pa

return pa.array(self._data, mask=self._mask, type=type)

_HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)

def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
Expand Down Expand Up @@ -458,40 +422,6 @@ def reconstruct(x):
else:
return reconstruct(result)

def __iter__(self):
for i in range(len(self)):
if self._mask[i]:
yield self.dtype.na_value
else:
yield self._data[i]

def take(self, indexer, allow_fill=False, fill_value=None):
# we always fill with False internally
# to avoid upcasting
data_fill_value = False if isna(fill_value) else fill_value
result = take(
self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill
)

mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill)

# if we are filling
# we only fill where the indexer is null
# not existing missing values
# TODO(jreback) what if we have a non-na float as a fill value?
if allow_fill and notna(fill_value):
fill_mask = np.asarray(indexer) == -1
result[fill_mask] = fill_value
mask = mask ^ fill_mask

return type(self)(result, mask, copy=False)

def copy(self):
data, mask = self._data, self._mask
data = data.copy()
mask = mask.copy()
return type(self)(data, mask, copy=False)

def __setitem__(self, key, value):
_is_scalar = is_scalar(value)
if _is_scalar:
Expand All @@ -505,26 +435,6 @@ def __setitem__(self, key, value):
self._data[key] = value
self._mask[key] = mask

def __len__(self):
return len(self._data)

@property
def nbytes(self):
return self._data.nbytes + self._mask.nbytes

def isna(self):
return self._mask

@property
def _na_value(self):
return self._dtype.na_value

@classmethod
def _concat_same_type(cls, to_concat):
data = np.concatenate([x._data for x in to_concat])
mask = np.concatenate([x._mask for x in to_concat])
return cls(data, mask)

def astype(self, dtype, copy=True):
"""
Cast to a NumPy array or ExtensionArray with 'dtype'.
Expand Down
87 changes: 7 additions & 80 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,12 @@
from pandas.core.dtypes.missing import isna, notna

from pandas.core import nanops, ops
from pandas.core.algorithms import take
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
import pandas.core.common as com
from pandas.core.indexers import check_bool_array_indexer
from pandas.core.ops import invalid_comparison
from pandas.core.ops.common import unpack_zerodim_and_defer
from pandas.core.tools.numeric import to_numeric

from .masked import BaseMaskedArray
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here: please use an absolute import instead of the relative one.



class _IntegerDtype(ExtensionDtype):
"""
Expand Down Expand Up @@ -259,7 +257,7 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
return values, mask


class IntegerArray(ExtensionArray, ExtensionOpsMixin):
class IntegerArray(BaseMaskedArray):
"""
Array of integer (optional missing) values.

Expand Down Expand Up @@ -329,6 +327,9 @@ class IntegerArray(ExtensionArray, ExtensionOpsMixin):
Length: 3, dtype: UInt16
"""

# The value used to fill '_data' to avoid upcasting
_internal_fill_value = 1

@cache_readonly
def dtype(self):
return _dtypes[str(self._data.dtype)]
Expand Down Expand Up @@ -365,17 +366,6 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
def _from_factorized(cls, values, original):
return integer_array(values, dtype=original.dtype)

def __getitem__(self, item):
if is_integer(item):
if self._mask[item]:
return self.dtype.na_value
return self._data[item]

elif com.is_bool_indexer(item):
item = check_bool_array_indexer(self, item)

return type(self)(self._data[item], self._mask[item])

def _coerce_to_ndarray(self, dtype=None, na_value=lib._no_default):
"""
coerce to an ndarray of object dtype
Expand All @@ -402,23 +392,14 @@ def _coerce_to_ndarray(self, dtype=None, na_value=lib._no_default):
data[self._mask] = na_value
return data

__array_priority__ = 1000 # higher than ndarray so ops dispatch to us

# TODO: remove this when _coerce_to_ndarray is replaced with to_numpy
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved
def __array__(self, dtype=None):
"""
the array interface, return my values
We return an object array here to preserve our scalar values
"""
return self._coerce_to_ndarray(dtype=dtype)

def __arrow_array__(self, type=None):
"""
Convert myself into a pyarrow Array.
"""
import pyarrow as pa

return pa.array(self._data, mask=self._mask, type=type)

_HANDLED_TYPES = (np.ndarray, numbers.Number)

def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
Expand Down Expand Up @@ -466,40 +447,6 @@ def reconstruct(x):
else:
return reconstruct(result)

def __iter__(self):
for i in range(len(self)):
if self._mask[i]:
yield self.dtype.na_value
else:
yield self._data[i]

def take(self, indexer, allow_fill=False, fill_value=None):
# we always fill with 1 internally
# to avoid upcasting
data_fill_value = 1 if isna(fill_value) else fill_value
result = take(
self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill
)

mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill)

# if we are filling
# we only fill where the indexer is null
# not existing missing values
# TODO(jreback) what if we have a non-na float as a fill value?
if allow_fill and notna(fill_value):
fill_mask = np.asarray(indexer) == -1
result[fill_mask] = fill_value
mask = mask ^ fill_mask

return type(self)(result, mask, copy=False)

def copy(self):
data, mask = self._data, self._mask
data = data.copy()
mask = mask.copy()
return type(self)(data, mask, copy=False)

def __setitem__(self, key, value):
_is_scalar = is_scalar(value)
if _is_scalar:
Expand All @@ -513,26 +460,6 @@ def __setitem__(self, key, value):
self._data[key] = value
self._mask[key] = mask

def __len__(self) -> int:
return len(self._data)

@property
def nbytes(self):
return self._data.nbytes + self._mask.nbytes

def isna(self):
return self._mask

@property
def _na_value(self):
return self.dtype.na_value

@classmethod
def _concat_same_type(cls, to_concat):
data = np.concatenate([x._data for x in to_concat])
mask = np.concatenate([x._mask for x in to_concat])
return cls(data, mask)

def astype(self, dtype, copy=True):
"""
Cast to a NumPy array or IntegerArray with 'dtype'.
Expand Down
Loading