diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index ee32b12f0e712..f8adcf520c15b 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -1 +1,2 @@ +from .base import ExtensionArray # noqa from .categorical import Categorical # noqa diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py new file mode 100644 index 0000000000000..1556b653819a6 --- /dev/null +++ b/pandas/core/arrays/base.py @@ -0,0 +1,247 @@ +"""An interface for extending pandas with custom arrays.""" +from pandas.errors import AbstractMethodError + +_not_implemented_message = "{} does not implement {}." + + +class ExtensionArray(object): + """Abstract base class for custom 1-D array types. + + pandas will recognize instances of this class as proper arrays + with a custom type and will not attempt to coerce them to objects. They + may be stored directly inside a :class:`DataFrame` or :class:`Series`. + + Notes + ----- + The interface includes the following abstract methods that must be + implemented by subclasses: + + * __getitem__ + * __len__ + * dtype + * nbytes + * isna + * take + * copy + * _formatting_values + * _concat_same_type + + Some additional methods are required to satisfy pandas' internal, private + block API. + + * _concat_same_type + * _can_hold_na + + This class does not inherit from 'abc.ABCMeta' for performance reasons. + Methods and properties required by the interface raise + ``pandas.errors.AbstractMethodError`` and no ``register`` method is + provided for registering virtual subclasses. + + ExtensionArrays are limited to 1 dimension. + + They may be backed by none, one, or many NumPy ararys. For example, + ``pandas.Categorical`` is an extension array backed by two arrays, + one for codes and one for categories. An array of IPv6 address may + be backed by a NumPy structured array with two fields, one for the + lower 64 bits and one for the upper 64 bits. Or they may be backed + by some other storage type, like Python lists. Pandas makes no + assumptions on how the data are stored, just that it can be converted + to a NumPy array. + + Extension arrays should be able to be constructed with instances of + the class, i.e. ``ExtensionArray(extension_array)`` should return + an instance, not error. + + Additionally, certain methods and interfaces are required for proper + this array to be properly stored inside a ``DataFrame`` or ``Series``. + """ + # ------------------------------------------------------------------------ + # Must be a Sequence + # ------------------------------------------------------------------------ + def __getitem__(self, item): + # type (Any) -> Any + """Select a subset of self. + + Parameters + ---------- + item : int, slice, or ndarray + * int: The position in 'self' to get. + + * slice: A slice object, where 'start', 'stop', and 'step' are + integers or None + + * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + + Returns + ------- + item : scalar or ExtensionArray + + Notes + ----- + For scalar ``item``, return a scalar value suitable for the array's + type. This should be an instance of ``self.dtype.type``. + + For slice ``key``, return an instance of ``ExtensionArray``, even + if the slice is length 0 or 1. + + For a boolean mask, return an instance of ``ExtensionArray``, filtered + to the values where ``item`` is True. + """ + raise AbstractMethodError(self) + + def __setitem__(self, key, value): + # type: (Any, Any) -> None + raise NotImplementedError(_not_implemented_message.format( + type(self), '__setitem__') + ) + + def __len__(self): + """Length of this array + + Returns + ------- + length : int + """ + # type: () -> int + raise AbstractMethodError(self) + + # ------------------------------------------------------------------------ + # Required attributes + # ------------------------------------------------------------------------ + @property + def dtype(self): + # type: () -> ExtensionDtype + """An instance of 'ExtensionDtype'.""" + raise AbstractMethodError(self) + + @property + def shape(self): + # type: () -> Tuple[int, ...] + return (len(self),) + + @property + def ndim(self): + # type: () -> int + """Extension Arrays are only allowed to be 1-dimensional.""" + return 1 + + @property + def nbytes(self): + # type: () -> int + """The number of bytes needed to store this object in memory. + + If this is expensive to compute, return an approximate lower bound + on the number of bytes needed. + """ + raise AbstractMethodError(self) + + # ------------------------------------------------------------------------ + # Additional Methods + # ------------------------------------------------------------------------ + def isna(self): + # type: () -> np.ndarray + """Boolean NumPy array indicating if each value is missing. + + This should return a 1-D array the same length as 'self'. + """ + raise AbstractMethodError(self) + + # ------------------------------------------------------------------------ + # Indexing methods + # ------------------------------------------------------------------------ + def take(self, indexer, allow_fill=True, fill_value=None): + # type: (Sequence[int], bool, Optional[Any]) -> ExtensionArray + """Take elements from an array. + + Parameters + ---------- + indexer : sequence of integers + indices to be taken. -1 is used to indicate values + that are missing. + allow_fill : bool, default True + If False, indexer is assumed to contain no -1 values so no filling + will be done. This short-circuits computation of a mask. Result is + undefined if allow_fill == False and -1 is present in indexer. + fill_value : any, default None + Fill value to replace -1 values with. By default, this uses + the missing value sentinel for this type, ``self._fill_value``. + + Notes + ----- + This should follow pandas' semantics where -1 indicates missing values. + Positions where indexer is ``-1`` should be filled with the missing + value for this type. + + This is called by ``Series.__getitem__``, ``.loc``, ``iloc``, when the + indexer is a sequence of values. + + Examples + -------- + Suppose the extension array somehow backed by a NumPy structured array + and that the underlying structured array is stored as ``self.data``. + Then ``take`` may be written as + + .. code-block:: python + + def take(self, indexer, allow_fill=True, fill_value=None): + mask = indexer == -1 + result = self.data.take(indexer) + result[mask] = self._fill_value + return type(self)(result) + """ + raise AbstractMethodError(self) + + def copy(self, deep=False): + # type: (bool) -> ExtensionArray + """Return a copy of the array. + + Parameters + ---------- + deep : bool, default False + Also copy the underlying data backing this array. + + Returns + ------- + ExtensionArray + """ + raise AbstractMethodError(self) + + # ------------------------------------------------------------------------ + # Block-related methods + # ------------------------------------------------------------------------ + @property + def _fill_value(self): + # type: () -> Any + """The missing value for this type, e.g. np.nan""" + return None + + def _formatting_values(self): + # type: () -> np.ndarray + # At the moment, this has to be an array since we use result.dtype + """An array of values to be printed in, e.g. the Series repr""" + raise AbstractMethodError(self) + + @classmethod + def _concat_same_type(cls, to_concat): + # type: (Sequence[ExtensionArray]) -> ExtensionArray + """Concatenate multiple array + + Parameters + ---------- + to_concat : sequence of this type + + Returns + ------- + ExtensionArray + """ + raise AbstractMethodError(cls) + + def _can_hold_na(self): + # type: () -> bool + """Whether your array can hold missing values. True by default. + + Notes + ----- + Setting this to false will optimize some operations like fillna. + """ + return True diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b50e01b0fb55a..62c6a6b16cbe9 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -43,6 +43,8 @@ from pandas.util._validators import validate_bool_kwarg from pandas.core.config import get_option +from .base import ExtensionArray + def _cat_compare_op(op): def f(self, other): @@ -148,7 +150,7 @@ def _maybe_to_categorical(array): """ -class Categorical(PandasObject): +class Categorical(ExtensionArray, PandasObject): """ Represents a categorical variable in classic R / S-plus fashion @@ -2130,6 +2132,20 @@ def repeat(self, repeats, *args, **kwargs): return self._constructor(values=codes, categories=self.categories, ordered=self.ordered, fastpath=True) + # Implement the ExtensionArray interface + @property + def _can_hold_na(self): + return True + + @classmethod + def _concat_same_type(self, to_concat): + from pandas.core.dtypes.concat import _concat_categorical + + return _concat_categorical(to_concat) + + def _formatting_values(self): + return self + # The Series.cat accessor diff --git a/pandas/core/common.py b/pandas/core/common.py index e606be3cc2a23..6748db825acf0 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -25,7 +25,8 @@ # compat from pandas.errors import ( # noqa - PerformanceWarning, UnsupportedFunctionCall, UnsortedIndexError) + PerformanceWarning, UnsupportedFunctionCall, UnsortedIndexError, + AbstractMethodError) # back-compat of public API # deprecate these functions @@ -88,19 +89,6 @@ class SettingWithCopyWarning(Warning): pass -class AbstractMethodError(NotImplementedError): - """Raise this error instead of NotImplementedError for abstract methods - while keeping compatibility with Python 2 and Python 3. - """ - - def __init__(self, class_instance): - self.class_instance = class_instance - - def __str__(self): - msg = "This method must be defined in the concrete class of {name}" - return (msg.format(name=self.class_instance.__class__.__name__)) - - def flatten(l): """Flatten an arbitrarily nested sequence. diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py new file mode 100644 index 0000000000000..c7c5378801f02 --- /dev/null +++ b/pandas/core/dtypes/base.py @@ -0,0 +1,129 @@ +"""Extend pandas with custom array types""" +from pandas.errors import AbstractMethodError + + +class ExtensionDtype(object): + """A custom data type, to be paired with an ExtensionArray. + + Notes + ----- + The interface includes the following abstract methods that must + be implemented by subclasses: + + * type + * name + * construct_from_string + + This class does not inherit from 'abc.ABCMeta' for performance reasons. + Methods and properties required by the interface raise + ``pandas.errors.AbstractMethodError`` and no ``register`` method is + provided for registering virtual subclasses. + """ + + def __str__(self): + return self.name + + @property + def type(self): + # type: () -> type + """The scalar type for the array, e.g. ``int`` + + It's expected ``ExtensionArray[item]`` returns an instance + of ``ExtensionDtype.type`` for scalar ``item``. + """ + raise AbstractMethodError(self) + + @property + def kind(self): + # type () -> str + """A character code (one of 'biufcmMOSUV'), default 'O' + + This should match the NumPy dtype used when the array is + converted to an ndarray, which is probably 'O' for object if + the extension type cannot be represented as a built-in NumPy + type. + + See Also + -------- + numpy.dtype.kind + """ + return 'O' + + @property + def name(self): + # type: () -> str + """A string identifying the data type. + + Will be used for display in, e.g. ``Series.dtype`` + """ + raise AbstractMethodError(self) + + @property + def names(self): + # type: () -> Optional[List[str]] + """Ordered list of field names, or None if there are no fields. + + This is for compatibility with NumPy arrays, and may be removed in the + future. + """ + return None + + @classmethod + def construct_from_string(cls, string): + """Attempt to construct this type from a string. + + Parameters + ---------- + string : str + + Returns + ------- + self : instance of 'cls' + + Raises + ------ + TypeError + If a class cannot be constructed from this 'string'. + + Examples + -------- + If the extension dtype can be constructed without any arguments, + the following may be an adequate implementation. + + >>> @classmethod + ... def construct_from_string(cls, string) + ... if string == cls.name: + ... return cls() + ... else: + ... raise TypeError("Cannot construct a '{}' from " + ... "'{}'".format(cls, string)) + """ + raise AbstractMethodError(cls) + + @classmethod + def is_dtype(cls, dtype): + """Check if we match 'dtype' + + Parameters + ---------- + dtype : str or dtype + + Returns + ------- + is_dtype : bool + + Notes + ----- + The default implementation is True if + + 1. ``cls.construct_from_string(dtype)`` is an instance + of ``cls``. + 2. 'dtype' is ``cls`` or a subclass of ``cls``. + """ + if isinstance(dtype, str): + try: + return isinstance(cls.construct_from_string(dtype), cls) + except TypeError: + return False + else: + return issubclass(dtype, cls) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index dca9a5fde0d74..c66e7fcfc6978 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1685,6 +1685,35 @@ def is_extension_type(arr): return False +def is_extension_array_dtype(arr_or_dtype): + """Check if an object is a pandas extension array type. + + Parameters + ---------- + arr_or_dtype : object + + Returns + ------- + bool + + Notes + ----- + This checks whether an object implements the pandas extension + array interface. In pandas, this includes: + + * Categorical + + Third-party libraries may implement arrays or types satisfying + this interface as well. + """ + from pandas.core.arrays import ExtensionArray + + # we want to unpack series, anything else? + if isinstance(arr_or_dtype, ABCSeries): + arr_or_dtype = arr_or_dtype._values + return isinstance(arr_or_dtype, (ExtensionDtype, ExtensionArray)) + + def is_complex_dtype(arr_or_dtype): """ Check whether the provided array or dtype is of a complex dtype. diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 1eb87aa99fd1e..d8d3a96992757 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -5,15 +5,15 @@ from pandas import compat from pandas.core.dtypes.generic import ABCIndexClass, ABCCategoricalIndex +from .base import ExtensionDtype -class ExtensionDtype(object): + +class PandasExtensionDtype(ExtensionDtype): """ A np.dtype duck-typed class, suitable for holding a custom dtype. THIS IS NOT A REAL NUMPY DTYPE """ - name = None - names = None type = None subdtype = None kind = None @@ -108,7 +108,7 @@ class CategoricalDtypeType(type): pass -class CategoricalDtype(ExtensionDtype): +class CategoricalDtype(PandasExtensionDtype): """ Type for categorical data with the categories and orderedness @@ -387,7 +387,7 @@ class DatetimeTZDtypeType(type): pass -class DatetimeTZDtype(ExtensionDtype): +class DatetimeTZDtype(PandasExtensionDtype): """ A np.dtype duck-typed class, suitable for holding a custom datetime with tz @@ -501,8 +501,7 @@ class PeriodDtypeType(type): pass -class PeriodDtype(ExtensionDtype): - __metaclass__ = PeriodDtypeType +class PeriodDtype(PandasExtensionDtype): """ A Period duck-typed class, suitable for holding a period with freq dtype. @@ -619,8 +618,7 @@ class IntervalDtypeType(type): pass -class IntervalDtype(ExtensionDtype): - __metaclass__ = IntervalDtypeType +class IntervalDtype(PandasExtensionDtype): """ A Interval duck-typed class, suitable for holding an interval diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 52e8317f5209a..f553e1a02c9d6 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -33,6 +33,7 @@ is_datetimelike_v_numeric, is_float_dtype, is_numeric_dtype, is_numeric_v_string_like, is_extension_type, + is_extension_array_dtype, is_list_like, is_re, is_re_compilable, @@ -61,8 +62,9 @@ from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import maybe_convert_indices, length_of_indexer -from pandas.core.arrays.categorical import Categorical, _maybe_to_categorical +from pandas.core.arrays import Categorical from pandas.core.indexes.datetimes import DatetimeIndex +from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.io.formats.printing import pprint_thing import pandas.core.missing as missing @@ -103,24 +105,58 @@ class Block(PandasObject): _verify_integrity = True _validate_ndim = True _ftype = 'dense' - _holder = None _concatenator = staticmethod(np.concatenate) def __init__(self, values, placement, ndim=None): - if ndim is None: - ndim = values.ndim - elif values.ndim != ndim: - raise ValueError('Wrong number of dimensions') - self.ndim = ndim - + self.ndim = self._check_ndim(values, ndim) self.mgr_locs = placement self.values = values - if ndim and len(self.mgr_locs) != len(self.values): + if (self._validate_ndim and self.ndim and + len(self.mgr_locs) != len(self.values)): raise ValueError( 'Wrong number of items passed {val}, placement implies ' '{mgr}'.format(val=len(self.values), mgr=len(self.mgr_locs))) + def _check_ndim(self, values, ndim): + """ndim inference and validation. + + Infers ndim from 'values' if not provided to __init__. + Validates that values.ndim and ndim are consistent if and only if + the class variable '_validate_ndim' is True. + + Parameters + ---------- + values : array-like + ndim : int or None + + Returns + ------- + ndim : int + + Raises + ------ + ValueError : the number of dimensions do not match + """ + if ndim is None: + ndim = values.ndim + + if self._validate_ndim and values.ndim != ndim: + msg = ("Wrong number of dimensions. values.ndim != ndim " + "[{} != {}]") + raise ValueError(msg.format(values.ndim, ndim)) + + return ndim + + @property + def _holder(self): + """The array-like that can hold the underlying values. + + None for 'Block', overridden by subclasses that don't + use an ndarray. + """ + return None + @property def _consolidate_key(self): return (self._can_consolidate, self.dtype.name) @@ -279,7 +315,6 @@ def reshape_nd(self, labels, shape, ref_items, mgr=None): return a new block that is transformed to a nd block """ - return _block2d_to_blocknd(values=self.get_values().T, placement=self.mgr_locs, shape=shape, labels=labels, ref_items=ref_items) @@ -535,15 +570,20 @@ def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs): def _astype(self, dtype, copy=False, errors='raise', values=None, klass=None, mgr=None, **kwargs): - """ - Coerce to the new type + """Coerce to the new type + Parameters + ---------- dtype : str, dtype convertible copy : boolean, default False copy if indicated errors : str, {'raise', 'ignore'}, default 'ignore' - ``raise`` : allow exceptions to be raised - ``ignore`` : suppress exceptions. On error return original object + + Returns + ------- + Block """ errors_legal_values = ('raise', 'ignore') @@ -1674,27 +1714,28 @@ class NonConsolidatableMixIn(object): _can_consolidate = False _verify_integrity = False _validate_ndim = False - _holder = None def __init__(self, values, placement, ndim=None): + """Initialize a non-consolidatable block. - # Placement must be converted to BlockPlacement via property setter - # before ndim logic, because placement may be a slice which doesn't - # have a length. - self.mgr_locs = placement + 'ndim' may be inferred from 'placement'. - # kludgetastic + This will call continue to call __init__ for the other base + classes mixed in with this Mixin. + """ + # Placement must be converted to BlockPlacement so that we can check + # its length + if not isinstance(placement, BlockPlacement): + placement = BlockPlacement(placement) + + # Maybe infer ndim from placement if ndim is None: - if len(self.mgr_locs) != 1: + if len(placement) != 1: ndim = 1 else: ndim = 2 - self.ndim = ndim - - if not isinstance(values, self._holder): - raise TypeError("values must be {0}".format(self._holder.__name__)) - - self.values = values + super(NonConsolidatableMixIn, self).__init__(values, placement, + ndim=ndim) @property def shape(self): @@ -1745,7 +1786,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, Returns ------- - a new block(s), the result of the putmask + a new block, the result of the putmask """ inplace = validate_bool_kwarg(inplace, 'inplace') @@ -1803,6 +1844,92 @@ def _unstack(self, unstacker_func, new_columns): return blocks, mask +class ExtensionBlock(NonConsolidatableMixIn, Block): + """Block for holding extension types. + + Notes + ----- + This holds all 3rd-party extension array types. It's also the immediate + parent class for our internal extension types' blocks, CategoricalBlock. + + ExtensionArrays are limited to 1-D. + """ + @property + def _holder(self): + # For extension blocks, the holder is values-dependent. + return type(self.values) + + @property + def is_view(self): + """Extension arrays are never treated as views.""" + return False + + def get_values(self, dtype=None): + # ExtensionArrays must be iterable, so this works. + values = np.asarray(self.values) + if values.ndim == self.ndim - 1: + values = values.reshape((1,) + values.shape) + return values + + def to_dense(self): + return np.asarray(self.values) + + def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): + """ + Take values according to indexer and return them as a block. + """ + if fill_tuple is None: + fill_value = None + else: + fill_value = fill_tuple[0] + + # axis doesn't matter; we are really a single-dim object + # but are passed the axis depending on the calling routing + # if its REALLY axis 0, then this will be a reindex and not a take + new_values = self.values.take(indexer, fill_value=fill_value) + + # if we are a 1-dim object, then always place at 0 + if self.ndim == 1: + new_mgr_locs = [0] + else: + if new_mgr_locs is None: + new_mgr_locs = self.mgr_locs + + return self.make_block_same_class(new_values, new_mgr_locs) + + def _can_hold_element(self, element): + # XXX: We may need to think about pushing this onto the array. + # We're doing the same as CategoricalBlock here. + return True + + def _slice(self, slicer): + """ return a slice of my values """ + + # slice the category + # return same dims as we currently have + + if isinstance(slicer, tuple) and len(slicer) == 2: + if not com.is_null_slice(slicer[0]): + raise AssertionError("invalid slicing for a 1-ndim " + "categorical") + slicer = slicer[1] + + return self.values[slicer] + + def formatting_values(self): + return self.values._formatting_values() + + def concat_same_type(self, to_concat, placement=None): + """ + Concatenate list of single blocks of the same type. + """ + values = self._holder._concat_same_type( + [blk.values for blk in to_concat]) + placement = placement or slice(0, len(values), 1) + return self.make_block_same_class(values, ndim=self.ndim, + placement=placement) + + class NumericBlock(Block): __slots__ = () is_numeric = True @@ -1908,6 +2035,11 @@ def should_store(self, value): class DatetimeLikeBlockMixin(object): + """Mixin class for DatetimeBlock and DatetimeTZBlock.""" + + @property + def _holder(self): + return DatetimeIndex @property def _na_value(self): @@ -1940,6 +2072,10 @@ def __init__(self, values, placement, ndim=None): super(TimeDeltaBlock, self).__init__(values, placement=placement, ndim=ndim) + @property + def _holder(self): + return TimedeltaIndex + @property def _box_func(self): return lambda x: tslib.Timedelta(x, unit='ns') @@ -2315,30 +2451,24 @@ def re_replacer(s): return block -class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock): +class CategoricalBlock(ExtensionBlock): __slots__ = () is_categorical = True _verify_integrity = True _can_hold_na = True - _holder = Categorical _concatenator = staticmethod(_concat._concat_categorical) def __init__(self, values, placement, ndim=None): + from pandas.core.arrays.categorical import _maybe_to_categorical # coerce to categorical if we can super(CategoricalBlock, self).__init__(_maybe_to_categorical(values), - placement=placement, ndim=ndim) + placement=placement, + ndim=ndim) @property - def is_view(self): - """ I am never a view """ - return False - - def to_dense(self): - return self.values.to_dense().view() - - def convert(self, copy=True, **kwargs): - return self.copy() if copy else self + def _holder(self): + return Categorical @property def array_dtype(self): @@ -2347,13 +2477,6 @@ def array_dtype(self): """ return np.object_ - def _slice(self, slicer): - """ return a slice of my values """ - - # slice the category - # return same dims as we currently have - return self.values._slice(slicer) - def _try_coerce_result(self, result): """ reverse of try_coerce_args """ @@ -2390,28 +2513,11 @@ def shift(self, periods, axis=0, mgr=None): return self.make_block_same_class(values=self.values.shift(periods), placement=self.mgr_locs) - def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): - """ - Take values according to indexer and return them as a block.bb - """ - if fill_tuple is None: - fill_value = None - else: - fill_value = fill_tuple[0] - - # axis doesn't matter; we are really a single-dim object - # but are passed the axis depending on the calling routing - # if its REALLY axis 0, then this will be a reindex and not a take - new_values = self.values.take_nd(indexer, fill_value=fill_value) - - # if we are a 1-dim object, then always place at 0 - if self.ndim == 1: - new_mgr_locs = [0] - else: - if new_mgr_locs is None: - new_mgr_locs = self.mgr_locs - - return self.make_block_same_class(new_values, new_mgr_locs) + def to_dense(self): + # Categorical.get_values returns a DatetimeIndex for datetime + # categories, so we can't simply use `np.asarray(self.values)` like + # other types. + return self.values.get_values() def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ @@ -2430,6 +2536,15 @@ def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): def concat_same_type(self, to_concat, placement=None): """ Concatenate list of single blocks of the same type. + + Note that this CategoricalBlock._concat_same_type *may* not + return a CategoricalBlock. When the categories in `to_concat` + differ, this will return an object ndarray. + + If / when we decide we don't like that behavior: + + 1. Change Categorical._concat_same_type to use union_categoricals + 2. Delete this method. """ values = self._concatenator([blk.values for blk in to_concat], axis=self.ndim - 1) @@ -2445,12 +2560,29 @@ class DatetimeBlock(DatetimeLikeBlockMixin, Block): _can_hold_na = True def __init__(self, values, placement, ndim=None): - if values.dtype != _NS_DTYPE: - values = conversion.ensure_datetime64ns(values) - + values = self._maybe_coerce_values(values) super(DatetimeBlock, self).__init__(values, placement=placement, ndim=ndim) + def _maybe_coerce_values(self, values): + """Input validation for values passed to __init__. Ensure that + we have datetime64ns, coercing if nescessary. + + Parametetrs + ----------- + values : array-like + Must be convertable to datetime64 + + Returns + ------- + values : ndarray[datetime64ns] + + Overridden by DatetimeTZBlock. + """ + if values.dtype != _NS_DTYPE: + values = conversion.ensure_datetime64ns(values) + return values + def _astype(self, dtype, mgr=None, **kwargs): """ these automatically copy, so copy=True has no effect @@ -2576,12 +2708,37 @@ def set(self, locs, values, check=False): class DatetimeTZBlock(NonConsolidatableMixIn, DatetimeBlock): """ implement a datetime64 block with a tz attribute """ __slots__ = () - _holder = DatetimeIndex _concatenator = staticmethod(_concat._concat_datetime) is_datetimetz = True def __init__(self, values, placement, ndim=2, dtype=None): + # XXX: This will end up calling _maybe_coerce_values twice + # when dtype is not None. It's relatively cheap (just an isinstance) + # but it'd nice to avoid. + # + # If we can remove dtype from __init__, and push that conversion + # push onto the callers, then we can remove this entire __init__ + # and just use DatetimeBlock's. + if dtype is not None: + values = self._maybe_coerce_values(values, dtype=dtype) + super(DatetimeTZBlock, self).__init__(values, placement=placement, + ndim=ndim) + + def _maybe_coerce_values(self, values, dtype=None): + """Input validation for values passed to __init__. Ensure that + we have datetime64TZ, coercing if nescessary. + Parametetrs + ----------- + values : array-like + Must be convertable to datetime64 + dtype : string or DatetimeTZDtype, optional + Does a shallow copy to this tz + + Returns + ------- + values : ndarray[datetime64ns] + """ if not isinstance(values, self._holder): values = self._holder(values) @@ -2593,8 +2750,7 @@ def __init__(self, values, placement, ndim=2, dtype=None): if values.tz is None: raise ValueError("cannot create a DatetimeTZBlock without a tz") - super(DatetimeTZBlock, self).__init__(values, placement=placement, - ndim=ndim) + return values def copy(self, deep=True, mgr=None): """ copy constructor """ @@ -2734,9 +2890,19 @@ class SparseBlock(NonConsolidatableMixIn, Block): _box_to_block_values = False _can_hold_na = True _ftype = 'sparse' - _holder = SparseArray _concatenator = staticmethod(_concat._concat_sparse) + def __init__(self, values, placement, ndim=None): + # Ensure that we have the underlying SparseArray here... + if isinstance(values, ABCSeries): + values = values.values + assert isinstance(values, SparseArray) + super(SparseBlock, self).__init__(values, placement, ndim=ndim) + + @property + def _holder(self): + return SparseArray + @property def shape(self): return (len(self.mgr_locs), self.sp_index.length) @@ -2910,6 +3076,8 @@ def get_block_type(values, dtype=None): cls = BoolBlock elif is_categorical(values): cls = CategoricalBlock + elif is_extension_array_dtype(values): + cls = ExtensionBlock else: cls = ObjectBlock return cls @@ -4663,6 +4831,19 @@ def form_blocks(arrays, names, axes): for i, _, array in items_dict['CategoricalBlock']] blocks.extend(cat_blocks) + if len(items_dict['ExtensionBlock']): + + external_blocks = [] + for i, _, array in items_dict['ExtensionBlock']: + if isinstance(array, ABCSeries): + array = array.values + # Allow our internal arrays to chose their block type. + block_type = getattr(array, '_block_type', ExtensionBlock) + external_blocks.append( + make_block(array, klass=block_type, + fastpath=True, placement=[i])) + blocks.extend(external_blocks) + if len(extra_locs): shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:]) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 22b6d33be9d38..af4e83f506257 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -77,3 +77,26 @@ class NullFrequencyError(ValueError): class AccessorRegistrationWarning(Warning): """Warning for attribute conflicts in accessor registration.""" + + +class AbstractMethodError(NotImplementedError): + """Raise this error instead of NotImplementedError for abstract methods + while keeping compatibility with Python 2 and Python 3. + """ + + def __init__(self, class_instance, methodtype='method'): + types = {'method', 'classmethod', 'staticmethod', 'property'} + if methodtype not in types: + msg = 'methodtype must be one of {}, got {} instead.'.format( + methodtype, types) + raise ValueError(msg) + self.methodtype = methodtype + self.class_instance = class_instance + + def __str__(self): + if self.methodtype == 'classmethod': + name = self.class_instance.__name__ + else: + name = self.class_instance.__class__.__name__ + msg = "This {methodtype} must be defined in the concrete class {name}" + return (msg.format(methodtype=self.methodtype, name=name)) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index d800a7b92b559..eca4dd4cf2106 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -10,12 +10,14 @@ Series, Categorical, CategoricalIndex, IntervalIndex, date_range) from pandas.compat import string_types +from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.dtypes import ( DatetimeTZDtype, PeriodDtype, - IntervalDtype, CategoricalDtype) + IntervalDtype, CategoricalDtype, ExtensionDtype) from pandas.core.dtypes.common import ( is_categorical_dtype, is_categorical, is_datetime64tz_dtype, is_datetimetz, + is_extension_array_dtype, is_period_dtype, is_period, is_dtype_equal, is_datetime64_ns_dtype, is_datetime64_dtype, is_interval_dtype, @@ -742,3 +744,31 @@ def test_categorical_categories(self): tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) c1 = CategoricalDtype(CategoricalIndex(['a', 'b'])) tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) + + +class DummyArray(ExtensionArray): + pass + + +class DummyDtype(ExtensionDtype): + pass + + +class TestExtensionArrayDtype(object): + + @pytest.mark.parametrize('values', [ + pd.Categorical([]), + pd.Categorical([]).dtype, + pd.Series(pd.Categorical([])), + DummyDtype(), + DummyArray(), + ]) + def test_is_extension_array_dtype(self, values): + assert is_extension_array_dtype(values) + + @pytest.mark.parametrize('values', [ + np.array([]), + pd.Series(np.array([])), + ]) + def test_is_not_extension_array_dtype(self, values): + assert not is_extension_array_dtype(values) diff --git a/pandas/tests/internals/test_external_block.py b/pandas/tests/internals/test_external_block.py index 729ee0093b6dc..2487363df8f99 100644 --- a/pandas/tests/internals/test_external_block.py +++ b/pandas/tests/internals/test_external_block.py @@ -5,12 +5,12 @@ import pandas as pd from pandas.core.internals import ( - Block, BlockManager, SingleBlockManager, NonConsolidatableMixIn) + BlockManager, SingleBlockManager, ExtensionBlock) import pytest -class CustomBlock(NonConsolidatableMixIn, Block): +class CustomBlock(ExtensionBlock): _holder = np.ndarray diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index e3490f465b24a..9338aba90d7cb 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -11,9 +11,8 @@ from distutils.version import LooseVersion import itertools from pandas import (Index, MultiIndex, DataFrame, DatetimeIndex, - Series, Categorical) + Series, Categorical, TimedeltaIndex, SparseArray) from pandas.compat import OrderedDict, lrange -from pandas.core.sparse.array import SparseArray from pandas.core.internals import (BlockPlacement, SingleBlockManager, make_block, BlockManager) import pandas.core.algorithms as algos @@ -1263,9 +1262,30 @@ def test_binop_other(self, op, value, dtype): assert_series_equal(result, expected) +@pytest.mark.parametrize('typestr, holder', [ + ('category', Categorical), + ('M8[ns]', DatetimeIndex), + ('M8[ns, US/Central]', DatetimeIndex), + ('m8[ns]', TimedeltaIndex), + ('sparse', SparseArray), +]) +def test_holder(typestr, holder): + blk = create_block(typestr, [1]) + assert blk._holder is holder + + def test_deprecated_fastpath(): # GH#19265 values = np.random.rand(3, 3) with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): make_block(values, placement=np.arange(3), fastpath=True) + + +def test_validate_ndim(): + values = np.array([1.0, 2.0]) + placement = slice(2) + msg = "Wrong number of dimensions. values.ndim != ndim \[1 != 2\]" + + with tm.assert_raises_regex(ValueError, msg): + make_block(values, placement, ndim=2) diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 0b7948cc32d24..54f567bcd2a8c 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -574,6 +574,15 @@ def test_setitem_array(self): self.frame['F'].reindex(index), check_names=False) + def test_setitem_chained_no_consolidate(self): + # https://github.com/pandas-dev/pandas/pull/19268 + # issuecomment-361696418 + # chained setitem used to cause consolidation + sdf = pd.SparseDataFrame([[np.nan, 1], [2, np.nan]]) + with pd.option_context('mode.chained_assignment', None): + sdf[0][1] = 2 + assert len(sdf._data.blocks) == 2 + def test_delitem(self): A = self.frame['A'] C = self.frame['C'] diff --git a/pandas/tests/test_errors.py b/pandas/tests/test_errors.py index babf88ef1df8d..e2a142366a89e 100644 --- a/pandas/tests/test_errors.py +++ b/pandas/tests/test_errors.py @@ -4,6 +4,8 @@ from warnings import catch_warnings import pandas # noqa import pandas as pd +from pandas.errors import AbstractMethodError +import pandas.util.testing as tm @pytest.mark.parametrize( @@ -50,3 +52,30 @@ def test_error_rename(): raise ParserError() except pd.parser.CParserError: pass + + +class Foo: + @classmethod + def classmethod(cls): + raise AbstractMethodError(cls, methodtype='classmethod') + + @property + def property(self): + raise AbstractMethodError(self, methodtype='property') + + def method(self): + raise AbstractMethodError(self) + + +def test_AbstractMethodError_classmethod(): + xpr = "This classmethod must be defined in the concrete class Foo" + with tm.assert_raises_regex(AbstractMethodError, xpr): + Foo.classmethod() + + xpr = "This property must be defined in the concrete class Foo" + with tm.assert_raises_regex(AbstractMethodError, xpr): + Foo().property + + xpr = "This method must be defined in the concrete class Foo" + with tm.assert_raises_regex(AbstractMethodError, xpr): + Foo().method()