From 2ef52169767c50682e2e9ee7a5fda2163b80754e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 15 Jan 2018 10:06:34 -0600 Subject: [PATCH 01/40] REF: Define extension base classes --- pandas/core/arrays/__init__.py | 1 + pandas/core/arrays/base.py | 201 ++++++++++++++ pandas/core/arrays/categorical.py | 18 +- pandas/core/dtypes/base.py | 92 +++++++ pandas/core/dtypes/common.py | 32 +++ pandas/core/dtypes/dtypes.py | 14 +- pandas/core/internals.py | 248 +++++++++++++----- pandas/tests/dtypes/test_dtypes.py | 36 ++- pandas/tests/internals/test_external_block.py | 4 +- 9 files changed, 566 insertions(+), 80 deletions(-) create mode 100644 pandas/core/arrays/base.py create mode 100644 pandas/core/dtypes/base.py diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index ee32b12f0e712..f8adcf520c15b 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -1 +1,2 @@ +from .base import ExtensionArray # noqa from .categorical import Categorical # noqa diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py new file mode 100644 index 0000000000000..ad29edde34ce6 --- /dev/null +++ b/pandas/core/arrays/base.py @@ -0,0 +1,201 @@ +"""An interface for extending pandas with custom arrays.""" +import abc + +import numpy as np + +from pandas.compat import add_metaclass + + +_not_implemented_message = "{} does not implement {}." + + +@add_metaclass(abc.ABCMeta) +class ExtensionArray(object): + """Abstract base class for custom array types + + pandas will recognize instances of this class as proper arrays + with a custom type and will not attempt to coerce them to objects. + + Subclasses are expected to implement the following methods. 
+ """ + # ------------------------------------------------------------------------ + # Must be a Sequence + # ------------------------------------------------------------------------ + @abc.abstractmethod + def __getitem__(self, item): + """Select a subset of self + + Notes + ----- + As a sequence, __getitem__ should expect integer or slice ``key``. + + For slice ``key``, you should return an instance of yourself, even + if the slice is length 0 or 1. + + For scalar ``key``, you may return a scalar suitable for your type. + The scalar need not be an instance or subclass of your array type. + """ + # type (Any) -> Any + + def __setitem__(self, key, value): + # type: (Any, Any) -> None + raise NotImplementedError(_not_implemented_message.format( + type(self), '__setitem__') + ) + + @abc.abstractmethod + def __iter__(self): + # type: () -> Iterator + pass + + @abc.abstractmethod + def __len__(self): + # type: () -> int + pass + + # ------------------------------------------------------------------------ + # Required attributes + # ------------------------------------------------------------------------ + @property + def base(self): + """The base array I am a view of. None by default.""" + + @property + @abc.abstractmethod + def dtype(self): + """An instance of 'ExtensionDtype'.""" + # type: () -> ExtensionDtype + pass + + @property + def shape(self): + # type: () -> Tuple[int, ...] 
+ return (len(self),) + + @property + def ndim(self): + # type: () -> int + """Extension Arrays are only allowed to be 1-dimensional.""" + return 1 + + @property + @abc.abstractmethod + def nbytes(self): + """The number of bytes needed to store this object in memory.""" + # type: () -> int + pass + + # ------------------------------------------------------------------------ + # Additional Methods + # ------------------------------------------------------------------------ + @abc.abstractmethod + def isna(self): + """Boolean NumPy array indicating if each value is missing.""" + # type: () -> np.ndarray + pass + + # ------------------------------------------------------------------------ + # Indexing methods + # ------------------------------------------------------------------------ + @abc.abstractmethod + def take(self, indexer, allow_fill=True, fill_value=None): + # type: (Sequence, bool, Optional[Any]) -> ExtensionArray + """For slicing""" + + def take_nd(self, indexer, allow_fill=True, fill_value=None): + """For slicing""" + # TODO: this isn't really nescessary for 1-D + return self.take(indexer, allow_fill=allow_fill, + fill_value=fill_value) + + @abc.abstractmethod + def copy(self, deep=False): + # type: (bool) -> ExtensionArray + """Return a copy of the array.""" + + # ------------------------------------------------------------------------ + # Block-related methods + # ------------------------------------------------------------------------ + @property + def _fill_value(self): + """The missing value for this type, e.g. np.nan""" + # type: () -> Any + return None + + @abc.abstractmethod + def _formatting_values(self): + # type: () -> np.ndarray + # At the moment, this has to be an array since we use result.dtype + """An array of values to be printed in, e.g. 
the Series repr""" + + @classmethod + @abc.abstractmethod + def _concat_same_type(cls, to_concat): + # type: (Sequence[ExtensionArray]) -> ExtensionArray + """Concatenate multiple array + + Parameters + ---------- + to_concat : sequence of this type + + Returns + ------- + ExtensionArray + """ + + @abc.abstractmethod + def get_values(self): + # type: () -> np.ndarray + """Get the underlying values backing your data + """ + pass + + def _can_hold_na(self): + """Whether your array can hold missing values. True by default. + + Notes + ----- + Setting this to false will optimize some operations like fillna. + """ + # type: () -> bool + return True + + @property + def is_sparse(self): + """Whether your array is sparse. True by default.""" + # type: () -> bool + return False + + def _slice(self, slicer): + # type: (Union[tuple, Sequence, int]) -> 'ExtensionArray' + """Return a new array sliced by `slicer`. + + Parameters + ---------- + slicer : slice or np.ndarray + If an array, it should just be a boolean mask + + Returns + ------- + array : ExtensionArray + Should return an ExtensionArray, even if ``self[slicer]`` + would return a scalar. + """ + return type(self)(self[slicer]) + + def value_counts(self, dropna=True): + """Optional method for computing the histogram of the counts. 
+ + Parameters + ---------- + dropna : bool, default True + whether to exclude missing values from the computation + + Returns + ------- + counts : Series + """ + from pandas.core.algorithms import value_counts + mask = ~np.asarray(self.isna()) + values = self[mask] # XXX: this imposes boolean indexing + return value_counts(np.asarray(values), dropna=dropna) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 708f903cd73cb..f0ec046e00e65 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -44,6 +44,8 @@ from pandas.util._validators import validate_bool_kwarg from pandas.core.config import get_option +from .base import ExtensionArray + def _cat_compare_op(op): def f(self, other): @@ -149,7 +151,7 @@ def _maybe_to_categorical(array): """ -class Categorical(PandasObject): +class Categorical(ExtensionArray, PandasObject): """ Represents a categorical variable in classic R / S-plus fashion @@ -2131,6 +2133,20 @@ def repeat(self, repeats, *args, **kwargs): return self._constructor(values=codes, categories=self.categories, ordered=self.ordered, fastpath=True) + # Interface things + # can_hold_na, concat_same_type, formatting_values + @property + def _can_hold_na(self): + return True + + @classmethod + def _concat_same_type(self, to_concat): + from pandas.types.concat import union_categoricals + return union_categoricals(to_concat) + + def _formatting_values(self): + return self + # The Series.cat accessor diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py new file mode 100644 index 0000000000000..348b4f077673a --- /dev/null +++ b/pandas/core/dtypes/base.py @@ -0,0 +1,92 @@ +"""Extend pandas with custom array types""" +import abc + +from pandas.compat import add_metaclass + + +@add_metaclass(abc.ABCMeta) +class ExtensionDtype(object): + """A custom data type for your array. 
+ """ + @property + def type(self): + """Typically a metaclass inheriting from 'type' with no methods.""" + return type(self.name, (), {}) + + @property + def kind(self): + """A character code (one of 'biufcmMOSUV'), default 'O' + + See Also + -------- + numpy.dtype.kind + """ + return 'O' + + @property + @abc.abstractmethod + def name(self): + """An string identifying the data type. + + Will be used in, e.g. ``Series.dtype`` + """ + + @property + def names(self): + """Ordered list of field names, or None if there are no fields""" + return None + + @classmethod + def construct_from_string(cls, string): + """Attempt to construct this type from a string. + + Parameters + ---------- + string : str + + Returns + ------- + self : instance of 'cls' + + Raises + ------ + TypeError + + Notes + ----- + The default implementation checks if 'string' matches your + type's name. If so, it calls your class with no arguments. + """ + if string == cls.name: + return cls() + else: + raise TypeError("Cannot construct a '{}' from " + "'{}'".format(cls, string)) + + @classmethod + def is_dtype(cls, dtype): + """Check if we match 'dtype' + + Parameters + ---------- + dtype : str or dtype + + Returns + ------- + is_dtype : bool + + Notes + ----- + The default implementation is True if + + 1. 'dtype' is a string that returns true for + ``cls.construct_from_string`` + 2. 'dtype' is ``cls`` or a subclass of ``cls``. 
+ """ + if isinstance(dtype, str): + try: + return isinstance(cls.construct_from_string(dtype), cls) + except TypeError: + return False + else: + return issubclass(dtype, cls) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index dca9a5fde0d74..2e4d0d884bf95 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1685,6 +1685,38 @@ def is_extension_type(arr): return False +def is_extension_array_dtype(arr_or_dtype): + """Check if an object is a pandas extension array type + + Parameters + ---------- + arr_or_dtype : object + + Returns + ------- + bool + + Notes + ----- + This checks whether an object implements the pandas extension + array interface. In pandas, this includes: + + * Categorical + * PeriodArray + * IntervalArray + * SparseArray + + Third-party libraries may implement arrays or types satisfying + this interface as well. + """ + from pandas.core.arrays import ExtensionArray + + # we want to unpack series, anything else? + if isinstance(arr_or_dtype, ABCSeries): + arr_or_dtype = arr_or_dtype.values + return isinstance(arr_or_dtype, (ExtensionDtype, ExtensionArray)) + + def is_complex_dtype(arr_or_dtype): """ Check whether the provided array or dtype is of a complex dtype. diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 1eb87aa99fd1e..df7b0dc9ea60e 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -5,15 +5,15 @@ from pandas import compat from pandas.core.dtypes.generic import ABCIndexClass, ABCCategoricalIndex +from .base import ExtensionDtype -class ExtensionDtype(object): + +class PandasExtensionDtype(ExtensionDtype): """ A np.dtype duck-typed class, suitable for holding a custom dtype. 
THIS IS NOT A REAL NUMPY DTYPE """ - name = None - names = None type = None subdtype = None kind = None @@ -108,7 +108,7 @@ class CategoricalDtypeType(type): pass -class CategoricalDtype(ExtensionDtype): +class CategoricalDtype(PandasExtensionDtype): """ Type for categorical data with the categories and orderedness @@ -387,7 +387,7 @@ class DatetimeTZDtypeType(type): pass -class DatetimeTZDtype(ExtensionDtype): +class DatetimeTZDtype(PandasExtensionDtype): """ A np.dtype duck-typed class, suitable for holding a custom datetime with tz @@ -501,7 +501,7 @@ class PeriodDtypeType(type): pass -class PeriodDtype(ExtensionDtype): +class PeriodDtype(PandasExtensionDtype): __metaclass__ = PeriodDtypeType """ A Period duck-typed class, suitable for holding a period with freq dtype. @@ -619,7 +619,7 @@ class IntervalDtypeType(type): pass -class IntervalDtype(ExtensionDtype): +class IntervalDtype(PandasExtensionDtype): __metaclass__ = IntervalDtypeType """ A Interval duck-typed class, suitable for holding an interval diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 764e06c19e76c..fb52a60c4cdd5 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -31,6 +31,7 @@ is_datetimelike_v_numeric, is_float_dtype, is_numeric_dtype, is_numeric_v_string_like, is_extension_type, + is_extension_array_dtype, is_list_like, is_re, is_re_compilable, @@ -59,7 +60,7 @@ from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import maybe_convert_indices, length_of_indexer -from pandas.core.arrays.categorical import Categorical, _maybe_to_categorical +from pandas.core.arrays import Categorical from pandas.core.indexes.datetimes import DatetimeIndex from pandas.io.formats.printing import pprint_thing @@ -95,6 +96,7 @@ class Block(PandasObject): is_object = False is_categorical = False is_sparse = False + is_extension = False _box_to_block_values = True _can_hold_na = False _can_consolidate = True @@ -107,14 +109,15 @@ class 
Block(PandasObject): def __init__(self, values, placement, ndim=None, fastpath=False): if ndim is None: ndim = values.ndim - elif values.ndim != ndim: + elif self._validate_ndim and values.ndim != ndim: raise ValueError('Wrong number of dimensions') self.ndim = ndim self.mgr_locs = placement self.values = values - if ndim and len(self.mgr_locs) != len(self.values): + if (self._validate_ndim and ndim and + len(self.mgr_locs) != len(self.values)): raise ValueError( 'Wrong number of items passed {val}, placement implies ' '{mgr}'.format(val=len(self.values), mgr=len(self.mgr_locs))) @@ -273,7 +276,6 @@ def reshape_nd(self, labels, shape, ref_items, mgr=None): return a new block that is transformed to a nd block """ - return _block2d_to_blocknd(values=self.get_values().T, placement=self.mgr_locs, shape=shape, labels=labels, ref_items=ref_items) @@ -548,15 +550,20 @@ def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs): def _astype(self, dtype, copy=False, errors='raise', values=None, klass=None, mgr=None, **kwargs): - """ - Coerce to the new type + """Coerce to the new type + Parameters + ---------- dtype : str, dtype convertible copy : boolean, default False copy if indicated errors : str, {'raise', 'ignore'}, default 'ignore' - ``raise`` : allow exceptions to be raised - ``ignore`` : suppress exceptions. On error return original object + + Returns + ------- + IntervalArray """ errors_legal_values = ('raise', 'ignore') @@ -1695,24 +1702,20 @@ class NonConsolidatableMixIn(object): _holder = None def __init__(self, values, placement, ndim=None, fastpath=False, **kwargs): + # Placement must be converted to BlockPlacement so that we can check + # its length + if not isinstance(placement, BlockPlacement): + placement = BlockPlacement(placement) - # Placement must be converted to BlockPlacement via property setter - # before ndim logic, because placement may be a slice which doesn't - # have a length. 
- self.mgr_locs = placement - - # kludgetastic + # Maybe infer ndim from placement if ndim is None: - if len(self.mgr_locs) != 1: + if len(placement) != 1: ndim = 1 else: ndim = 2 - self.ndim = ndim - - if not isinstance(values, self._holder): - raise TypeError("values must be {0}".format(self._holder.__name__)) - - self.values = values + super(NonConsolidatableMixIn, self).__init__(values, placement, + ndim=ndim, + fastpath=fastpath) @property def shape(self): @@ -1763,7 +1766,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, Returns ------- - a new block(s), the result of the putmask + a new block, the result of the putmask """ inplace = validate_bool_kwarg(inplace, 'inplace') @@ -1821,6 +1824,130 @@ def _unstack(self, unstacker_func, new_columns): return blocks, mask +class ExtensionBlock(NonConsolidatableMixIn, Block): + """Block for holding extension types. + + Notes + ----- + This is the holds all 3rd-party extension types. It's also the immediate + parent class for our internal extension types' blocks, CategoricalBlock. + + All extension arrays *must* be 1-D, which simplifies things a bit. + """ + # Some questions / notes as comments, will be removed. + # + # Currently inherited from NCB. We'll keep it around until SparseBlock + # and DatetimeTZBlock are refactored. + # - set + # - iget + # - should_store + # - putmask + # - _slice + # - _try_cast_result + # - unstack + + # Think about overriding these methods from Block + # - _maybe_downcast: (never downcast) + + # Methods we can (probably) ignore and just use Block's: + + # * replace / replace_single + # Categorical got Object, but was hopefully unnescessary. + # DatetimeTZ, Sparse got Block + # * is_view + # Categorical overrides to say that it is not. + # DatetimeTZ, Sparse inherits Base anyway + + is_extension = True + + # XXX + # is_bool is is a change for CategoricalBlock. Used to inherit + # from Object to infer from values. 
If this matters, we should + # override it directly in CategoricalBlock so that we infer from + # the categories, not the codes. + is_bool = False + + def __init__(self, values, placement, ndim=None, fastpath=False): + self._holder = type(values) + super(ExtensionBlock, self).__init__(values, placement, ndim=ndim, + fastpath=fastpath) + + def get_values(self, dtype=None): + # ExtensionArrays must be iterable, so this works. + values = np.asarray(self.values) + if values.ndim == self.ndim - 1: + values = values.reshape((1,) + values.shape) + return values + + def to_dense(self): + return self.values.to_dense().view() + + def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): + """ + Take values according to indexer and return them as a block.bb + """ + if fill_tuple is None: + fill_value = None + else: + fill_value = fill_tuple[0] + + # axis doesn't matter; we are really a single-dim object + # but are passed the axis depending on the calling routing + # if its REALLY axis 0, then this will be a reindex and not a take + new_values = self.values.take_nd(indexer, fill_value=fill_value) + + # if we are a 1-dim object, then always place at 0 + if self.ndim == 1: + new_mgr_locs = [0] + else: + if new_mgr_locs is None: + new_mgr_locs = self.mgr_locs + + return self.make_block_same_class(new_values, new_mgr_locs) + + def _can_hold_element(self, element): + # XXX: + # Not defined on NCM. + # Categorical got True from ObjectBlock + # DatetimeTZ gets DatetimeBlock + # Sparse gets Block + # Let's just assume yes for now, but we can maybe push + # this onto the array. + return True + + def convert(self, copy=True, **kwargs): + # We're dedicated to a type, we don't convert. + # Taken from CategoricalBlock / Block. 
+ return self.copy() if copy else self + + def _slice(self, slicer): + """ return a slice of my values """ + + # slice the category + # return same dims as we currently have + + if isinstance(slicer, tuple) and len(slicer) == 2: + if not is_null_slice(slicer[0]): + raise AssertionError("invalid slicing for a 1-ndim " + "categorical") + slicer = slicer[1] + + return self.values._slice(slicer) + + def formatting_values(self): + return self.values._formatting_values() + + def concat_same_type(self, to_concat, placement=None): + """ + Concatenate list of single blocks of the same type. + """ + values = self._holder._concat_same_type( + [blk.values for blk in to_concat]) + placement = placement or slice(0, len(values), 1) + return self.make_block_same_class(values, ndim=self.ndim, + placement=placement) + + class NumericBlock(Block): __slots__ = () is_numeric = True @@ -2334,7 +2461,7 @@ def re_replacer(s): return block -class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock): +class CategoricalBlock(ExtensionBlock): __slots__ = () is_categorical = True _verify_integrity = True @@ -2343,6 +2470,7 @@ class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock): _concatenator = staticmethod(_concat._concat_categorical) def __init__(self, values, placement, fastpath=False, **kwargs): + from pandas.core.arrays.categorical import _maybe_to_categorical # coerce to categorical if we can super(CategoricalBlock, self).__init__(_maybe_to_categorical(values), @@ -2354,12 +2482,6 @@ def is_view(self): """ I am never a view """ return False - def to_dense(self): - return self.values.to_dense().view() - - def convert(self, copy=True, **kwargs): - return self.copy() if copy else self - @property def array_dtype(self): """ the dtype to return if I want to construct this block as an @@ -2367,13 +2489,6 @@ def array_dtype(self): """ return np.object_ - def _slice(self, slicer): - """ return a slice of my values """ - - # slice the category - # return same dims as we currently have - 
return self.values._slice(slicer) - def _try_coerce_result(self, result): """ reverse of try_coerce_args """ @@ -2410,29 +2525,6 @@ def shift(self, periods, axis=0, mgr=None): return self.make_block_same_class(values=self.values.shift(periods), placement=self.mgr_locs) - def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): - """ - Take values according to indexer and return them as a block.bb - """ - if fill_tuple is None: - fill_value = None - else: - fill_value = fill_tuple[0] - - # axis doesn't matter; we are really a single-dim object - # but are passed the axis depending on the calling routing - # if its REALLY axis 0, then this will be a reindex and not a take - new_values = self.values.take_nd(indexer, fill_value=fill_value) - - # if we are a 1-dim object, then always place at 0 - if self.ndim == 1: - new_mgr_locs = [0] - else: - if new_mgr_locs is None: - new_mgr_locs = self.mgr_locs - - return self.make_block_same_class(new_values, new_mgr_locs) - def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ @@ -2447,17 +2539,6 @@ def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): # we are expected to return a 2-d ndarray return values.reshape(1, len(values)) - def concat_same_type(self, to_concat, placement=None): - """ - Concatenate list of single blocks of the same type. 
- """ - values = self._concatenator([blk.values for blk in to_concat], - axis=self.ndim - 1) - # not using self.make_block_same_class as values can be object dtype - return make_block( - values, placement=placement or slice(0, len(values), 1), - ndim=self.ndim) - class DatetimeBlock(DatetimeLikeBlockMixin, Block): __slots__ = () @@ -2465,7 +2546,8 @@ class DatetimeBlock(DatetimeLikeBlockMixin, Block): _can_hold_na = True def __init__(self, values, placement, fastpath=False, **kwargs): - if values.dtype != _NS_DTYPE: + if values.dtype != _NS_DTYPE and values.dtype.base != _NS_DTYPE: + # not datetime64 or datetime64tz values = conversion.ensure_datetime64ns(values) super(DatetimeBlock, self).__init__(values, fastpath=True, @@ -2954,6 +3036,8 @@ def get_block_type(values, dtype=None): cls = BoolBlock elif is_categorical(values): cls = CategoricalBlock + elif is_extension_array_dtype(values): + cls = ExtensionBlock else: cls = ObjectBlock return cls @@ -4681,6 +4765,7 @@ def form_blocks(arrays, names, axes): # generalize? items_dict = defaultdict(list) extra_locs = [] + external_items = [] names_idx = Index(names) if names_idx.equals(axes[0]): @@ -4748,6 +4833,31 @@ def form_blocks(arrays, names, axes): for i, _, array in items_dict['CategoricalBlock']] blocks.extend(cat_blocks) + if len(items_dict['ExtensionBlock']): + + external_blocks = [] + for i, _, array in items_dict['ExtensionBlock']: + if isinstance(array, ABCSeries): + array = array.values + # Allow our internal arrays to chose their block type. + block_type = getattr(array, '_block_type', ExtensionBlock) + external_blocks.append( + make_block(array, klass=block_type, + fastpath=True, placement=[i])) + blocks.extend(external_blocks) + + if len(external_items): + external_blocks = [] + for i, _, array in external_items: + if isinstance(array, ABCSeries): + array = array.values + # Allow our internal arrays to chose their block type. 
+ block_type = getattr(array, '_block_type', ExtensionBlock) + external_blocks.append( + make_block(array, klass=block_type, + fastpath=True, placement=[i])) + blocks.extend(external_blocks) + if len(extra_locs): shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:]) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index d800a7b92b559..3423e22a4c64e 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -10,12 +10,14 @@ Series, Categorical, CategoricalIndex, IntervalIndex, date_range) from pandas.compat import string_types +from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.dtypes import ( DatetimeTZDtype, PeriodDtype, - IntervalDtype, CategoricalDtype) + IntervalDtype, CategoricalDtype, ExtensionDtype) from pandas.core.dtypes.common import ( is_categorical_dtype, is_categorical, is_datetime64tz_dtype, is_datetimetz, + is_extension_array_dtype, is_period_dtype, is_period, is_dtype_equal, is_datetime64_ns_dtype, is_datetime64_dtype, is_interval_dtype, @@ -742,3 +744,35 @@ def test_categorical_categories(self): tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) c1 = CategoricalDtype(CategoricalIndex(['a', 'b'])) tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) + + +class DummyArray(object): + pass + + +class DummyDtype(object): + pass + + +ExtensionArray.register(DummyArray) +ExtensionDtype.register(DummyDtype) + + +class TestExtensionArrayDtype(object): + + @pytest.mark.parametrize('values', [ + pd.Categorical([]), + pd.Categorical([]).dtype, + pd.Series(pd.Categorical([])), + DummyDtype(), + DummyArray(), + ]) + def test_is_extension_array_dtype(self, values): + assert is_extension_array_dtype(values) + + @pytest.mark.parametrize('values', [ + np.array([]), + pd.Series(np.array([])), + ]) + def test_is_not_extension_array_dtype(self, values): + assert not is_extension_array_dtype(values) diff --git a/pandas/tests/internals/test_external_block.py 
b/pandas/tests/internals/test_external_block.py index 729ee0093b6dc..2487363df8f99 100644 --- a/pandas/tests/internals/test_external_block.py +++ b/pandas/tests/internals/test_external_block.py @@ -5,12 +5,12 @@ import pandas as pd from pandas.core.internals import ( - Block, BlockManager, SingleBlockManager, NonConsolidatableMixIn) + BlockManager, SingleBlockManager, ExtensionBlock) import pytest -class CustomBlock(NonConsolidatableMixIn, Block): +class CustomBlock(ExtensionBlock): _holder = np.ndarray From 57e8b0fb81b8bfaeeae366e84f94ae1b20f55b35 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 18 Jan 2018 14:12:52 -0600 Subject: [PATCH 02/40] Updated for comments * removed take_nd * Changed to_dense to return get_values * Fixed docstrings, types * Removed is_sparse --- pandas/core/arrays/base.py | 41 ++++++++++++++++---------------------- pandas/core/dtypes/base.py | 19 +++++++++++++----- pandas/core/internals.py | 2 +- 3 files changed, 32 insertions(+), 30 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index ad29edde34ce6..82c3b9f53e498 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -13,16 +13,26 @@ class ExtensionArray(object): """Abstract base class for custom array types + Notes + ----- pandas will recognize instances of this class as proper arrays with a custom type and will not attempt to coerce them to objects. - Subclasses are expected to implement the following methods. + **Restrictions on your class constructor** + + * Your class should be able to be constructed with no arguments, + i.e. ``ExtensionArray()`` returns an instance. + TODO: See comment in ``ExtensionDtype.construct_from_string`` + * Your class should be able to be constructed with instances of + our class, i.e. ``ExtensionArray(extension_array)`` should returns + an instance. 
""" # ------------------------------------------------------------------------ # Must be a Sequence # ------------------------------------------------------------------------ @abc.abstractmethod def __getitem__(self, item): + # type (Any) -> Any """Select a subset of self Notes @@ -35,7 +45,6 @@ def __getitem__(self, item): For scalar ``key``, you may return a scalar suitable for your type. The scalar need not be an instance or subclass of your array type. """ - # type (Any) -> Any def __setitem__(self, key, value): # type: (Any, Any) -> None @@ -63,9 +72,8 @@ def base(self): @property @abc.abstractmethod def dtype(self): - """An instance of 'ExtensionDtype'.""" # type: () -> ExtensionDtype - pass + """An instance of 'ExtensionDtype'.""" @property def shape(self): @@ -81,18 +89,16 @@ def ndim(self): @property @abc.abstractmethod def nbytes(self): - """The number of bytes needed to store this object in memory.""" # type: () -> int - pass + """The number of bytes needed to store this object in memory.""" # ------------------------------------------------------------------------ # Additional Methods # ------------------------------------------------------------------------ @abc.abstractmethod def isna(self): - """Boolean NumPy array indicating if each value is missing.""" # type: () -> np.ndarray - pass + """Boolean NumPy array indicating if each value is missing.""" # ------------------------------------------------------------------------ # Indexing methods @@ -102,12 +108,6 @@ def take(self, indexer, allow_fill=True, fill_value=None): # type: (Sequence, bool, Optional[Any]) -> ExtensionArray """For slicing""" - def take_nd(self, indexer, allow_fill=True, fill_value=None): - """For slicing""" - # TODO: this isn't really nescessary for 1-D - return self.take(indexer, allow_fill=allow_fill, - fill_value=fill_value) - @abc.abstractmethod def copy(self, deep=False): # type: (bool) -> ExtensionArray @@ -118,8 +118,8 @@ def copy(self, deep=False): # 
------------------------------------------------------------------------ @property def _fill_value(self): - """The missing value for this type, e.g. np.nan""" # type: () -> Any + """The missing value for this type, e.g. np.nan""" return None @abc.abstractmethod @@ -146,26 +146,19 @@ def _concat_same_type(cls, to_concat): @abc.abstractmethod def get_values(self): # type: () -> np.ndarray - """Get the underlying values backing your data + """A NumPy array representing your data. """ - pass def _can_hold_na(self): + # type: () -> bool """Whether your array can hold missing values. True by default. Notes ----- Setting this to false will optimize some operations like fillna. """ - # type: () -> bool return True - @property - def is_sparse(self): - """Whether your array is sparse. True by default.""" - # type: () -> bool - return False - def _slice(self, slicer): # type: (Union[tuple, Sequence, int]) -> 'ExtensionArray' """Return a new array sliced by `slicer`. diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 348b4f077673a..a8ef8b6b209b0 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -9,14 +9,19 @@ class ExtensionDtype(object): """A custom data type for your array. """ @property + @abc.abstractmethod def type(self): - """Typically a metaclass inheriting from 'type' with no methods.""" - return type(self.name, (), {}) + # type: () -> type + """The scalar type for your array, e.g. ``int`` or ``object``.""" @property def kind(self): + # type () -> str """A character code (one of 'biufcmMOSUV'), default 'O' + This should match the NumPy dtype used when your array is + converted to an ndarray, which is probably 'O' for object. + See Also -------- numpy.dtype.kind @@ -26,14 +31,16 @@ def kind(self): @property @abc.abstractmethod def name(self): - """An string identifying the data type. + # type: () -> str + """A string identifying the data type. - Will be used in, e.g. ``Series.dtype`` + Will be used for display in, e.g. 
``Series.dtype`` """ @property def names(self): - """Ordered list of field names, or None if there are no fields""" + # type: () -> Optional[List[str]] + """Ordered list of field names, or None if there are no fields.""" return None @classmethod @@ -58,6 +65,8 @@ def construct_from_string(cls, string): type's name. If so, it calls your class with no arguments. """ if string == cls.name: + # XXX: Better to mandate a ``.from_empty`` classmethod + # rather than imposing this on the constructor? return cls() else: raise TypeError("Cannot construct a '{}' from " diff --git a/pandas/core/internals.py b/pandas/core/internals.py index fb52a60c4cdd5..dc64b471138bd 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1880,7 +1880,7 @@ def get_values(self, dtype=None): return values def to_dense(self): - return self.values.to_dense().view() + return self.values.get_values() def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): """ From 01bd42fde9ac38491ac3098c0f8865a56f5358a5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 18 Jan 2018 14:36:12 -0600 Subject: [PATCH 03/40] Remove metaclasses from PeriodDtype and IntervalDtype --- pandas/core/dtypes/dtypes.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index df7b0dc9ea60e..d8d3a96992757 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -502,7 +502,6 @@ class PeriodDtypeType(type): class PeriodDtype(PandasExtensionDtype): - __metaclass__ = PeriodDtypeType """ A Period duck-typed class, suitable for holding a period with freq dtype. 
@@ -620,7 +619,6 @@ class IntervalDtypeType(type): class IntervalDtype(PandasExtensionDtype): - __metaclass__ = IntervalDtypeType """ A Interval duck-typed class, suitable for holding an interval From ce81706b8997fa5b7d84b470807bbbf058e66176 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 18 Jan 2018 14:43:18 -0600 Subject: [PATCH 04/40] Fixup form_blocks rebase --- pandas/core/internals.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index dc64b471138bd..dfe4d8100a2fb 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -4765,7 +4765,6 @@ def form_blocks(arrays, names, axes): # generalize? items_dict = defaultdict(list) extra_locs = [] - external_items = [] names_idx = Index(names) if names_idx.equals(axes[0]): @@ -4846,18 +4845,6 @@ def form_blocks(arrays, names, axes): fastpath=True, placement=[i])) blocks.extend(external_blocks) - if len(external_items): - external_blocks = [] - for i, _, array in external_items: - if isinstance(array, ABCSeries): - array = array.values - # Allow our internal arrays to chose their block type. 
- block_type = getattr(array, '_block_type', ExtensionBlock) - external_blocks.append( - make_block(array, klass=block_type, - fastpath=True, placement=[i])) - blocks.extend(external_blocks) - if len(extra_locs): shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:]) From 87a70e3958d603868a0a5dff13b04f8779290965 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 18 Jan 2018 16:00:42 -0600 Subject: [PATCH 05/40] Restore concat casting cat -> object --- pandas/core/arrays/categorical.py | 5 +++-- pandas/core/internals.py | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index f0ec046e00e65..87fee8e8fd6ab 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2141,8 +2141,9 @@ def _can_hold_na(self): @classmethod def _concat_same_type(self, to_concat): - from pandas.types.concat import union_categoricals - return union_categoricals(to_concat) + from pandas.core.dtypes.concat import _concat_categorical + + return _concat_categorical(to_concat) def _formatting_values(self): return self diff --git a/pandas/core/internals.py b/pandas/core/internals.py index dfe4d8100a2fb..2a3a1cee33484 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2539,6 +2539,26 @@ def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): # we are expected to return a 2-d ndarray return values.reshape(1, len(values)) + def concat_same_type(self, to_concat, placement=None): + """ + Concatenate list of single blocks of the same type. + + Note that this CategoricalBlock._concat_same_type *may* not + return a CategoricalBlock. When the categories in `to_concat` + differ, this will return an object ndarray. + + If / when we decide we don't like that behavior: + + 1. Change Categorical._concat_same_type to use union_categoricals + 2. Delete this method. 
+ """ + values = self._concatenator([blk.values for blk in to_concat], + axis=self.ndim - 1) + # not using self.make_block_same_class as values can be object dtype + return make_block( + values, placement=placement or slice(0, len(values), 1), + ndim=self.ndim) + class DatetimeBlock(DatetimeLikeBlockMixin, Block): __slots__ = () From 8c61886011392cabeb1354c893015e84cb8049d8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 19 Jan 2018 10:20:45 -0600 Subject: [PATCH 06/40] Remove _slice, clarify semantics around __getitem__ --- pandas/core/arrays/base.py | 36 +++++++++++++----------------------- pandas/core/internals.py | 2 +- 2 files changed, 14 insertions(+), 24 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 82c3b9f53e498..94f0b6c4b0e6b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -33,17 +33,24 @@ class ExtensionArray(object): @abc.abstractmethod def __getitem__(self, item): # type (Any) -> Any - """Select a subset of self + """Select a subset of self. Notes ----- - As a sequence, __getitem__ should expect integer or slice ``key``. + ``item`` may be one of - For slice ``key``, you should return an instance of yourself, even + * A scalar integer position + * A slice object + * A boolean mask the same length as 'self' + + For scalar ``item``, return a scalar value suitable for the array's + type. This should be an instance of ``self.dtype.type``. + + For slice ``item``, return an instance of ``ExtensionArray``, even if the slice is length 0 or 1. - For scalar ``key``, you may return a scalar suitable for your type. - The scalar need not be an instance or subclass of your array type. + For a boolean mask, return an instance of ``ExtensionArray``, filtered + to the values where ``item`` is True. 
""" def __setitem__(self, key, value): @@ -159,23 +166,6 @@ def _can_hold_na(self): """ return True - def _slice(self, slicer): - # type: (Union[tuple, Sequence, int]) -> 'ExtensionArray' - """Return a new array sliced by `slicer`. - - Parameters - ---------- - slicer : slice or np.ndarray - If an array, it should just be a boolean mask - - Returns - ------- - array : ExtensionArray - Should return an ExtensionArray, even if ``self[slicer]`` - would return a scalar. - """ - return type(self)(self[slicer]) - def value_counts(self, dropna=True): """Optional method for computing the histogram of the counts. @@ -190,5 +180,5 @@ def value_counts(self, dropna=True): """ from pandas.core.algorithms import value_counts mask = ~np.asarray(self.isna()) - values = self[mask] # XXX: this imposes boolean indexing + values = self[mask] return value_counts(np.asarray(values), dropna=dropna) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 2a3a1cee33484..7620a3797b265 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1932,7 +1932,7 @@ def _slice(self, slicer): "categorical") slicer = slicer[1] - return self.values._slice(slicer) + return self.values[slicer] def formatting_values(self): return self.values._formatting_values() From cb41803fe0b7abb2995b6ec9220b2874472677c5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 19 Jan 2018 11:16:58 -0600 Subject: [PATCH 07/40] Document and use take. 
--- pandas/core/arrays/base.py | 24 ++++++++++++++++++++++-- pandas/core/internals.py | 2 +- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 94f0b6c4b0e6b..7b70ccf908564 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -112,8 +112,28 @@ def isna(self): # ------------------------------------------------------------------------ @abc.abstractmethod def take(self, indexer, allow_fill=True, fill_value=None): - # type: (Sequence, bool, Optional[Any]) -> ExtensionArray - """For slicing""" + # type: (Sequence[int], bool, Optional[Any]) -> ExtensionArray + """Take elements from an array + + Parameters + ---------- + indexer : sequence of integers + indices to be taken. -1 is used to indicate values + that are missing. + allow_fill : bool, default True + If False, indexer is assumed to contain no -1 values so no filling + will be done. This short-circuits computation of a mask. Result is + undefined if allow_fill == False and -1 is present in indexer. + fill_value : any, default None + Fill value to replace -1 values with + + Notes + ----- + This should follow pandas' semantics where -1 indicates missing values. + + This is called by ``Series.__getitem__``, ``.loc``, ``iloc``, when the + indexer is a sequence of values. 
+ """ @abc.abstractmethod def copy(self, deep=False): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 7620a3797b265..1bd4e10a1f5f9 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1894,7 +1894,7 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): # axis doesn't matter; we are really a single-dim object # but are passed the axis depending on the calling routing # if its REALLY axis 0, then this will be a reindex and not a take - new_values = self.values.take_nd(indexer, fill_value=fill_value) + new_values = self.values.take(indexer, fill_value=fill_value) # if we are a 1-dim object, then always place at 0 if self.ndim == 1: From 65d5a61852c93d81eae986896f0ae68ffda5a675 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 19 Jan 2018 11:24:01 -0600 Subject: [PATCH 08/40] Clarify type, kind, init --- pandas/core/arrays/base.py | 3 --- pandas/core/dtypes/base.py | 10 ++++++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 7b70ccf908564..090da1fbdbe87 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -20,9 +20,6 @@ class ExtensionArray(object): **Restrictions on your class constructor** - * Your class should be able to be constructed with no arguments, - i.e. ``ExtensionArray()`` returns an instance. - TODO: See comment in ``ExtensionDtype.construct_from_string`` * Your class should be able to be constructed with instances of our class, i.e. ``ExtensionArray(extension_array)`` should returns an instance. diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index a8ef8b6b209b0..c6e465999622d 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -12,7 +12,11 @@ class ExtensionDtype(object): @abc.abstractmethod def type(self): # type: () -> type - """The scalar type for your array, e.g. ``int`` or ``object``.""" + """The scalar type for your array, e.g. 
``int`` + + It's expected ``ExtensionArray[item]`` returns an instance + of ``ExtensionDtype.type`` for scalar ``item``. + """ @property def kind(self): @@ -20,7 +24,9 @@ def kind(self): """A character code (one of 'biufcmMOSUV'), default 'O' This should match the NumPy dtype used when your array is - converted to an ndarray, which is probably 'O' for object. + converted to an ndarray, which is probably 'O' for object if + your extension type cannot be represented as a built-in NumPy + type. See Also -------- From 57c749bd15a0ed28be1ad0c6012d2ba3fe650687 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 19 Jan 2018 11:34:38 -0600 Subject: [PATCH 09/40] Remove base --- pandas/core/arrays/base.py | 18 ++++++++++-------- pandas/core/dtypes/base.py | 4 ++-- pandas/core/internals.py | 2 +- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 090da1fbdbe87..dd4db74ba3cc7 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -20,9 +20,9 @@ class ExtensionArray(object): **Restrictions on your class constructor** - * Your class should be able to be constructed with instances of - our class, i.e. ``ExtensionArray(extension_array)`` should returns - an instance. + * Extension arrays should be able to be constructed with instances of + the class, i.e. ``ExtensionArray(extension_array)`` should return + an instance, not error. """ # ------------------------------------------------------------------------ # Must be a Sequence @@ -69,10 +69,6 @@ def __len__(self): # ------------------------------------------------------------------------ # Required attributes # ------------------------------------------------------------------------ - @property - def base(self): - """The base array I am a view of. 
None by default.""" - @property @abc.abstractmethod def dtype(self): @@ -94,7 +90,11 @@ def ndim(self): @abc.abstractmethod def nbytes(self): # type: () -> int - """The number of bytes needed to store this object in memory.""" + """The number of bytes needed to store this object in memory. + + If this is expensive to compute, return an approximate lower bound + on the number of bytes needed. + """ # ------------------------------------------------------------------------ # Additional Methods @@ -127,6 +127,8 @@ def take(self, indexer, allow_fill=True, fill_value=None): Notes ----- This should follow pandas' semantics where -1 indicates missing values. + Positions where indexer is ``-1`` should be filled with the missing + value for this type. This is called by ``Series.__getitem__``, ``.loc``, ``iloc``, when the indexer is a sequence of values. diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index c6e465999622d..57e96b83a28c4 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -94,8 +94,8 @@ def is_dtype(cls, dtype): ----- The default implementation is True if - 1. 'dtype' is a string that returns true for - ``cls.construct_from_string`` + 1. ``cls.construct_from_string(dtype)`` is an instance + of ``cls``. 2. 'dtype' is ``cls`` or a subclass of ``cls``. 
""" if isinstance(dtype, str): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 1bd4e10a1f5f9..17e588a85a6db 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -563,7 +563,7 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, Returns ------- - IntervalArray + Block """ errors_legal_values = ('raise', 'ignore') From 6736b0ff59a33929cb59639564cb9bf38fac0ff9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 21 Jan 2018 08:15:07 -0600 Subject: [PATCH 10/40] API: Remove unused __iter__ and get_values --- pandas/core/arrays/base.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index dd4db74ba3cc7..54e8030df2640 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -56,11 +56,6 @@ def __setitem__(self, key, value): type(self), '__setitem__') ) - @abc.abstractmethod - def __iter__(self): - # type: () -> Iterator - pass - @abc.abstractmethod def __len__(self): # type: () -> int @@ -169,12 +164,6 @@ def _concat_same_type(cls, to_concat): ExtensionArray """ - @abc.abstractmethod - def get_values(self): - # type: () -> np.ndarray - """A NumPy array representing your data. - """ - def _can_hold_na(self): # type: () -> bool """Whether your array can hold missing values. True by default. From e4acb598dc9ed1ef342b9898e02cd9f69e577273 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 21 Jan 2018 08:15:21 -0600 Subject: [PATCH 11/40] API: Implement repr and str --- pandas/core/dtypes/base.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 57e96b83a28c4..1bc46641ee6ef 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -8,6 +8,13 @@ class ExtensionDtype(object): """A custom data type for your array. 
""" + + def __repr__(self): + return str(self) + + def __str__(self): + return self.name + @property @abc.abstractmethod def type(self): From df68f3bbf33edbe47d3b7beda99b24d187a2ed7b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 26 Jan 2018 18:46:19 +0100 Subject: [PATCH 12/40] Remove default value_counts for now --- pandas/core/arrays/base.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 54e8030df2640..68783b86dbe68 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -173,20 +173,3 @@ def _can_hold_na(self): Setting this to false will optimize some operations like fillna. """ return True - - def value_counts(self, dropna=True): - """Optional method for computing the histogram of the counts. - - Parameters - ---------- - dropna : bool, default True - whether to exclude missing values from the computation - - Returns - ------- - counts : Series - """ - from pandas.core.algorithms import value_counts - mask = ~np.asarray(self.isna()) - values = self[mask] - return value_counts(np.asarray(values), dropna=dropna) From 2746a433ada4510aef44d784d2590da53954993e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 27 Jan 2018 11:11:24 +0100 Subject: [PATCH 13/40] Fixed merge conflicts --- pandas/core/internals.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 9b3f827b97af3..27f99469538bd 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1688,8 +1688,7 @@ def __init__(self, values, placement, ndim=None): else: ndim = 2 super(NonConsolidatableMixIn, self).__init__(values, placement, - ndim=ndim, - fastpath=fastpath) + ndim=ndim) @property def shape(self): @@ -1901,7 +1900,7 @@ def _slice(self, slicer): # return same dims as we currently have if isinstance(slicer, tuple) and len(slicer) == 2: - if not is_null_slice(slicer[0]): + if not 
com.is_null_slice(slicer[0]): raise AssertionError("invalid slicing for a 1-ndim " "categorical") slicer = slicer[1] @@ -2447,7 +2446,7 @@ def __init__(self, values, placement): # coerce to categorical if we can super(CategoricalBlock, self).__init__(_maybe_to_categorical(values), - placement=placement, ndim=ndim) + placement=placement) @property def is_view(self): From 34d2b99f20a1a487fa16cc2da0adf7f131544435 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 27 Jan 2018 11:20:40 +0100 Subject: [PATCH 14/40] Remove implementation of construct_from_string --- pandas/core/dtypes/base.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 1bc46641ee6ef..27e3f736d211d 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -57,6 +57,7 @@ def names(self): return None @classmethod + @abc.abstractmethod def construct_from_string(cls, string): """Attempt to construct this type from a string. @@ -71,19 +72,26 @@ def construct_from_string(cls, string): Raises ------ TypeError + If a class cannot be constructed from this 'string'. Notes ----- The default implementation checks if 'string' matches your type's name. If so, it calls your class with no arguments. + + Examples + -------- + If the extension dtype can be constructed without any arguments, + the following may be an adequate implementation. + + >>> @classmethod + ... def construct_from_string(cls, string) + ... if string == cls.name: + ... return cls() + ... else: + ... raise TypeError("Cannot construct a '{}' from " + ... "'{}'".format(cls, string)) """ - if string == cls.name: - # XXX: Better to mandate a ``.from_empty`` classmethod - # rather than imposing this on the constructor? 
- return cls() - else: - raise TypeError("Cannot construct a '{}' from " - "'{}'".format(cls, string)) @classmethod def is_dtype(cls, dtype): From a484d615fce72483f5ef82c60b90042b917671b3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 27 Jan 2018 11:21:04 +0100 Subject: [PATCH 15/40] Example implementation of take --- pandas/core/arrays/base.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 68783b86dbe68..402e65180c322 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -37,8 +37,9 @@ def __getitem__(self, item): ``item`` may be one of * A scalar integer position - * A slice object - * A boolean mask the same length as 'self' + * A slice object, where 'start', 'stop', and 'step' are + integers or None + * A 1-d boolean NumPy ndarray the same length as 'self' For scalar ``item``, return a scalar value suitable for the array's type. This should be an instance of ``self.dtype.type``. @@ -105,7 +106,7 @@ def isna(self): @abc.abstractmethod def take(self, indexer, allow_fill=True, fill_value=None): # type: (Sequence[int], bool, Optional[Any]) -> ExtensionArray - """Take elements from an array + """Take elements from an array. Parameters ---------- @@ -117,7 +118,8 @@ def take(self, indexer, allow_fill=True, fill_value=None): will be done. This short-circuits computation of a mask. Result is undefined if allow_fill == False and -1 is present in indexer. fill_value : any, default None - Fill value to replace -1 values with + Fill value to replace -1 values with. By default, this uses + the missing value sentinel for this type, ``self._fill_value``. Notes ----- @@ -127,6 +129,20 @@ def take(self, indexer, allow_fill=True, fill_value=None): This is called by ``Series.__getitem__``, ``.loc``, ``iloc``, when the indexer is a sequence of values. 
+ + Examples + -------- + Suppose the extension array is actually a NumPy structured array with + two fields, and that the underlying structured array is stored as + ``self.data``. ``take`` may be written as + + >>> def take(self, indexer, allow_fill=True, fill_value=None): + ... mask = indexer == -1 + ... result = self.data.take(indexer) + ... result[mask] = self._fill_value + ... return type(self)(result) + + We ignore the 'allow_fill' and 'fill_value' arguments. """ @abc.abstractmethod From 04b2e723281fa5c4b1b2a8ad6b3ef3a98839880b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 27 Jan 2018 17:46:28 +0100 Subject: [PATCH 16/40] Cleanup ExtensionBlock --- pandas/core/internals.py | 41 +++++----------------------------------- 1 file changed, 5 insertions(+), 36 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 27f99469538bd..5000f7336eb02 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1805,45 +1805,14 @@ class ExtensionBlock(NonConsolidatableMixIn, Block): This is the holds all 3rd-party extension types. It's also the immediate parent class for our internal extension types' blocks, CategoricalBlock. - All extension arrays *must* be 1-D, which simplifies things a bit. + ExtensionArrays are limited to 1-D. """ - # Some questions / notes as comments, will be removed. - # - # Currently inherited from NCB. We'll keep it around until SparseBlock - # and DatetimeTZBlock are refactored. - # - set - # - iget - # - should_store - # - putmask - # - _slice - # - _try_cast_result - # - unstack - - # Think about overriding these methods from Block - # - _maybe_downcast: (never downcast) - - # Methods we can (probably) ignore and just use Block's: - - # * replace / replace_single - # Categorical got Object, but was hopefully unnescessary. - # DatetimeTZ, Sparse got Block - # * is_view - # Categorical overrides to say that it is not. 
- # DatetimeTZ, Sparse inherits Base anyway - is_extension = True - - # XXX - # is_bool is is a change for CategoricalBlock. Used to inherit - # from Object to infer from values. If this matters, we should - # override it directly in CategoricalBlock so that we infer from - # the categories, not the codes. is_bool = False - def __init__(self, values, placement, ndim=None, fastpath=False): + def __init__(self, values, placement, ndim=None): self._holder = type(values) - super(ExtensionBlock, self).__init__(values, placement, ndim=ndim, - fastpath=fastpath) + super(ExtensionBlock, self).__init__(values, placement, ndim=ndim) def get_values(self, dtype=None): # ExtensionArrays must be iterable, so this works. @@ -1857,7 +1826,7 @@ def to_dense(self): def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): """ - Take values according to indexer and return them as a block.bb + Take values according to indexer and return them as a block. """ if fill_tuple is None: fill_value = None @@ -2441,7 +2410,7 @@ class CategoricalBlock(ExtensionBlock): _holder = Categorical _concatenator = staticmethod(_concat._concat_categorical) - def __init__(self, values, placement): + def __init__(self, values, placement, ndim=None): from pandas.core.arrays.categorical import _maybe_to_categorical # coerce to categorical if we can From e77805318b3a031d79e83ed33eccd7bfec6e82be Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 27 Jan 2018 18:00:04 +0100 Subject: [PATCH 17/40] Pass through ndim --- pandas/core/internals.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 5000f7336eb02..259696c23cf58 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2415,7 +2415,8 @@ def __init__(self, values, placement, ndim=None): # coerce to categorical if we can super(CategoricalBlock, self).__init__(_maybe_to_categorical(values), - placement=placement) + placement=placement, + ndim=ndim) 
@property def is_view(self): From d15a7227078539941442bbaccd1a34bac2466057 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 27 Jan 2018 19:08:37 +0100 Subject: [PATCH 18/40] Use series._values --- pandas/core/dtypes/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 2e4d0d884bf95..bae9a2f866904 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1713,7 +1713,7 @@ def is_extension_array_dtype(arr_or_dtype): # we want to unpack series, anything else? if isinstance(arr_or_dtype, ABCSeries): - arr_or_dtype = arr_or_dtype.values + arr_or_dtype = arr_or_dtype._values return isinstance(arr_or_dtype, (ExtensionDtype, ExtensionArray)) From b5f736da3181f2f3004cad167f8101517977bacd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 27 Jan 2018 19:35:24 +0100 Subject: [PATCH 19/40] Removed repr, updated take doc --- pandas/core/arrays/base.py | 18 +++++++++--------- pandas/core/dtypes/base.py | 3 --- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 402e65180c322..7381fc004d2f5 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -132,17 +132,17 @@ def take(self, indexer, allow_fill=True, fill_value=None): Examples -------- - Suppose the extension array is actually a NumPy structured array with - two fields, and that the underlying structured array is stored as - ``self.data``. ``take`` may be written as + Suppose the extension array is backed by a NumPy structured array + and that the underlying structured array is stored as ``self.data``. + Then ``take`` may be written as - >>> def take(self, indexer, allow_fill=True, fill_value=None): - ... mask = indexer == -1 - ... result = self.data.take(indexer) - ... result[mask] = self._fill_value - ... return type(self)(result) + .. 
code-block:: python - We ignore the 'allow_fill' and 'fill_value' arguments. + def take(self, indexer, allow_fill=True, fill_value=None): + mask = indexer == -1 + result = self.data.take(indexer) + result[mask] = self._fill_value + return type(self)(result) """ @abc.abstractmethod diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 27e3f736d211d..ab0cde5431214 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -9,9 +9,6 @@ class ExtensionDtype(object): """A custom data type for your array. """ - def __repr__(self): - return str(self) - def __str__(self): return self.name From 240e8f6f1b44fd401217961486efd079137616cb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 28 Jan 2018 22:03:43 +0100 Subject: [PATCH 20/40] Various cleanups --- pandas/core/arrays/base.py | 7 ++++--- pandas/core/arrays/categorical.py | 3 +-- pandas/core/dtypes/base.py | 13 ++++--------- pandas/core/dtypes/common.py | 5 +---- pandas/core/internals.py | 9 ++------- 5 files changed, 12 insertions(+), 25 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 7381fc004d2f5..a3bed7af63220 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1,8 +1,6 @@ """An interface for extending pandas with custom arrays.""" import abc -import numpy as np - from pandas.compat import add_metaclass @@ -11,7 +9,7 @@ @add_metaclass(abc.ABCMeta) class ExtensionArray(object): - """Abstract base class for custom array types + """Abstract base class for custom array types. Notes ----- @@ -23,6 +21,9 @@ class ExtensionArray(object): * Extension arrays should be able to be constructed with instances of the class, i.e. ``ExtensionArray(extension_array)`` should return an instance, not error. + + Additionally, certain methods and interfaces are required for + this array to be properly stored inside a ``DataFrame`` or ``Series``. 
""" # ------------------------------------------------------------------------ # Must be a Sequence diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 43829d0917d81..40987cfe0f484 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2132,8 +2132,7 @@ def repeat(self, repeats, *args, **kwargs): return self._constructor(values=codes, categories=self.categories, ordered=self.ordered, fastpath=True) - # Interface things - # can_hold_na, concat_same_type, formatting_values + # ExtensionArray Interface things @property def _can_hold_na(self): return True diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index ab0cde5431214..b5257c444e6d3 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -6,7 +6,7 @@ @add_metaclass(abc.ABCMeta) class ExtensionDtype(object): - """A custom data type for your array. + """A custom data type, to be paired with an ExtensionArray. """ def __str__(self): @@ -16,7 +16,7 @@ def __str__(self): @abc.abstractmethod def type(self): # type: () -> type - """The scalar type for your array, e.g. ``int`` + """The scalar type for the array, e.g. ``int`` It's expected ``ExtensionArray[item]`` returns an instance of ``ExtensionDtype.type`` for scalar ``item``. @@ -27,9 +27,9 @@ def kind(self): # type () -> str """A character code (one of 'biufcmMOSUV'), default 'O' - This should match the NumPy dtype used when your array is + This should match the NumPy dtype used when the array is converted to an ndarray, which is probably 'O' for object if - your extension type cannot be represented as a built-in NumPy + the extension type cannot be represented as a built-in NumPy type. See Also @@ -71,11 +71,6 @@ def construct_from_string(cls, string): TypeError If a class cannot be constructed from this 'string'. - Notes - ----- - The default implementation checks if 'string' matches your - type's name. If so, it calls your class with no arguments. 
- Examples -------- If the extension dtype can be constructed without any arguments, diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index bae9a2f866904..c66e7fcfc6978 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1686,7 +1686,7 @@ def is_extension_type(arr): def is_extension_array_dtype(arr_or_dtype): - """Check if an object is a pandas extension array type + """Check if an object is a pandas extension array type. Parameters ---------- @@ -1702,9 +1702,6 @@ def is_extension_array_dtype(arr_or_dtype): array interface. In pandas, this includes: * Categorical - * PeriodArray - * IntervalArray - * SparseArray Third-party libraries may implement arrays or types satisfying this interface as well. diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 259696c23cf58..9afb412f93781 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1848,13 +1848,8 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): return self.make_block_same_class(new_values, new_mgr_locs) def _can_hold_element(self, element): - # XXX: - # Not defined on NCM. - # Categorical got True from ObjectBlock - # DatetimeTZ gets DatetimeBlock - # Sparse gets Block - # Let's just assume yes for now, but we can maybe push - # this onto the array. + # XXX: We may need to think about pushing this onto the array. + # We're doing the same as CategoricalBlock here. 
return True def convert(self, copy=True, **kwargs): From f9b0b49b20fcb8096049fe44de40309c6a03f758 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 29 Jan 2018 00:56:51 +0100 Subject: [PATCH 21/40] Handle get_values, to_dense, is_view --- pandas/core/internals.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 9afb412f93781..1f7d7f47f83eb 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1814,6 +1814,11 @@ def __init__(self, values, placement, ndim=None): self._holder = type(values) super(ExtensionBlock, self).__init__(values, placement, ndim=ndim) + @property + def is_view(self): + """Extension arrays are never treated as views.""" + return False + def get_values(self, dtype=None): # ExtensionArrays must be iterable, so this works. values = np.asarray(self.values) @@ -1822,7 +1827,7 @@ def get_values(self, dtype=None): return values def to_dense(self): - return self.values.get_values() + return np.asarray(self.values) def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): """ @@ -2412,12 +2417,6 @@ def __init__(self, values, placement, ndim=None): super(CategoricalBlock, self).__init__(_maybe_to_categorical(values), placement=placement, ndim=ndim) - - @property - def is_view(self): - """ I am never a view """ - return False - @property def array_dtype(self): """ the dtype to return if I want to construct this block as an @@ -2461,6 +2460,12 @@ def shift(self, periods, axis=0, mgr=None): return self.make_block_same_class(values=self.values.shift(periods), placement=self.mgr_locs) + def to_dense(self): + # Categorical.get_values returns a DatetimeIndex for datetime + # categories, so we can't simply use `np.asarray(self.values)` like + # other types. 
+ return self.values.get_values() + def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ From 79131861d17c40ab843ee3e00a4afed6e0c39b01 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 30 Jan 2018 09:07:59 -0600 Subject: [PATCH 22/40] Docs --- pandas/core/arrays/base.py | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index a3bed7af63220..f6796eb1cedcf 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -33,15 +33,22 @@ def __getitem__(self, item): # type (Any) -> Any """Select a subset of self. - Notes - ----- - ``item`` may be one of + Parameters + ---------- + item : int, slice, or ndarray + * int: The position in 'self' to get. - * A scalar integer position - * A slice object, where 'start', 'stop', and 'step' are + * slice: A slice object, where 'start', 'stop', and 'step' are integers or None - * A 1-d boolean NumPy ndarray the same length as 'self' + * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + + Returns + ------- + item : scalar or ExtensionArray + + Notes + ----- For scalar ``item``, return a scalar value suitable for the array's type. This should be an instance of ``self.dtype.type``. @@ -60,6 +67,12 @@ def __setitem__(self, key, value): @abc.abstractmethod def __len__(self): + """Length of this array + + Returns + ------- + length : int + """ # type: () -> int pass @@ -149,7 +162,17 @@ def take(self, indexer, allow_fill=True, fill_value=None): @abc.abstractmethod def copy(self, deep=False): # type: (bool) -> ExtensionArray - """Return a copy of the array.""" + """Return a copy of the array. + + Parameters + ---------- + deep : bool, default False + Also copy the underlying data backing this array. 
+ + Returns + ------- + ExtensionArray + """ # ------------------------------------------------------------------------ # Block-related methods From df18c3b95de1ab75dfa9a4cc56a647a2a4c7f6ca Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 30 Jan 2018 09:08:05 -0600 Subject: [PATCH 23/40] Remove is_extension, is_bool Remove inherited convert --- pandas/core/internals.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 1f7d7f47f83eb..c725fa6103c45 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -98,7 +98,6 @@ class Block(PandasObject): is_object = False is_categorical = False is_sparse = False - is_extension = False _box_to_block_values = True _can_hold_na = False _can_consolidate = True @@ -1807,9 +1806,6 @@ class ExtensionBlock(NonConsolidatableMixIn, Block): ExtensionArrays are limited to 1-D. """ - is_extension = True - is_bool = False - def __init__(self, values, placement, ndim=None): self._holder = type(values) super(ExtensionBlock, self).__init__(values, placement, ndim=ndim) @@ -1857,11 +1853,6 @@ def _can_hold_element(self, element): # We're doing the same as CategoricalBlock here. return True - def convert(self, copy=True, **kwargs): - # We're dedicated to a type, we don't convert. - # Taken from CategoricalBlock / Block. 
- return self.copy() if copy else self - def _slice(self, slicer): """ return a slice of my values """ @@ -2417,6 +2408,7 @@ def __init__(self, values, placement, ndim=None): super(CategoricalBlock, self).__init__(_maybe_to_categorical(values), placement=placement, ndim=ndim) + @property def array_dtype(self): """ the dtype to return if I want to construct this block as an From ab2f0457839fece3b3ef067f29994b42908bd037 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 30 Jan 2018 10:02:49 -0600 Subject: [PATCH 24/40] Sparse formatter --- pandas/io/formats/format.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 2293032ebb8a1..c5805fa3b6c46 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -21,6 +21,7 @@ is_integer, is_float, is_scalar, + is_sparse, is_numeric_dtype, is_datetime64_dtype, is_timedelta64_dtype, @@ -1803,6 +1804,8 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', fmt_klass = CategoricalArrayFormatter elif is_interval_dtype(values): fmt_klass = IntervalArrayFormatter + elif is_sparse(values): + fmt_klass = SparseArrayFormatter elif is_float_dtype(values.dtype): fmt_klass = FloatArrayFormatter elif is_period_arraylike(values): @@ -2115,6 +2118,15 @@ def _format_strings(self): return fmt_values +class SparseArrayFormatter(GenericArrayFormatter): + + def _format_strings(self): + return format_array(self.values.get_values(), self.formatter, + float_format=self.float_format, + na_rep=self.na_rep, digits=self.digits, + space=self.space, justify=self.justify) + + def format_percentiles(percentiles): """ Outputs rounded and formatted percentiles. From 520876f73650232c30e8ececd869d24e9ec28a60 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 30 Jan 2018 12:47:57 -0600 Subject: [PATCH 25/40] Revert "Sparse formatter" This reverts commit ab2f0457839fece3b3ef067f29994b42908bd037. 
--- pandas/io/formats/format.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index c5805fa3b6c46..2293032ebb8a1 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -21,7 +21,6 @@ is_integer, is_float, is_scalar, - is_sparse, is_numeric_dtype, is_datetime64_dtype, is_timedelta64_dtype, @@ -1804,8 +1803,6 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', fmt_klass = CategoricalArrayFormatter elif is_interval_dtype(values): fmt_klass = IntervalArrayFormatter - elif is_sparse(values): - fmt_klass = SparseArrayFormatter elif is_float_dtype(values.dtype): fmt_klass = FloatArrayFormatter elif is_period_arraylike(values): @@ -2118,15 +2115,6 @@ def _format_strings(self): return fmt_values -class SparseArrayFormatter(GenericArrayFormatter): - - def _format_strings(self): - return format_array(self.values.get_values(), self.formatter, - float_format=self.float_format, - na_rep=self.na_rep, digits=self.digits, - space=self.space, justify=self.justify) - - def format_percentiles(percentiles): """ Outputs rounded and formatted percentiles. From 4dfa39ca239d409b8bbb02b253dced775098ca9a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 30 Jan 2018 12:49:12 -0600 Subject: [PATCH 26/40] Unbox SparseSeries --- pandas/core/internals.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index c725fa6103c45..09a5877005dc3 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2792,6 +2792,13 @@ class SparseBlock(NonConsolidatableMixIn, Block): _holder = SparseArray _concatenator = staticmethod(_concat._concat_sparse) + def __init__(self, values, placement, ndim=None): + # Ensure that we have the underlying SparseArray here... 
+ if isinstance(values, ABCSeries): + values = values.values + assert isinstance(values, SparseArray) + super(SparseBlock, self).__init__(values, placement, ndim=ndim) + @property def shape(self): return (len(self.mgr_locs), self.sp_index.length) From e252103266b5e303ba54b092777b0e07f83baee6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 30 Jan 2018 13:03:47 -0600 Subject: [PATCH 27/40] Added test for sparse consolidation --- pandas/tests/sparse/frame/test_frame.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 2b589ebd4735e..4d49b82e67946 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -574,6 +574,15 @@ def test_setitem_array(self): self.frame['F'].reindex(index), check_names=False) + def test_setitem_chained_no_consolidate(self): + # https://github.com/pandas-dev/pandas/pull/19268 + # issuecomment-361696418 + # chained setitem used to cause consolidation + sdf = pd.SparseDataFrame([[np.nan, 1], [2, np.nan]]) + with pd.option_context('mode.chained_assignment', None): + sdf[0][1] = 2 + assert len(sdf._data.blocks) == 2 + def test_delitem(self): A = self.frame['A'] C = self.frame['C'] From 7110b2a78d759174e6df811ae64f93806120805d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 30 Jan 2018 13:31:18 -0600 Subject: [PATCH 28/40] Docs --- pandas/core/arrays/base.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f6796eb1cedcf..e219652334cd5 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -9,18 +9,27 @@ @add_metaclass(abc.ABCMeta) class ExtensionArray(object): - """Abstract base class for custom array types. + """Abstract base class for custom 1-D array types. 
Notes ----- pandas will recognize instances of this class as proper arrays with a custom type and will not attempt to coerce them to objects. - **Restrictions on your class constructor** + ExtensionArrays are limited to 1 dimension. - * Extension arrays should be able to be constructed with instances of - the class, i.e. ``ExtensionArray(extension_array)`` should return - an instance, not error. + They may be backed by none, one, or many NumPy arrays. For example, + ``pandas.Categorical`` is an extension array backed by two arrays, + one for codes and one for categories. An array of IPv6 addresses may + be backed by a NumPy structured array with two fields, one for the + lower 64 bits and one for the upper 64 bits. Or they may be backed + by some other storage type, like Python lists. Pandas makes no + assumptions on how the data are stored, just that it can be converted + to a NumPy array. + + Extension arrays should be able to be constructed with instances of + the class, i.e. ``ExtensionArray(extension_array)`` should return + an instance, not error. Additionally, certain methods and interfaces are required for proper this array to be properly stored inside a ``DataFrame`` or ``Series``. 
From fc688a56e20e62e6b7b806aedc4a4a760af079b1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 31 Jan 2018 11:45:24 -0600 Subject: [PATCH 29/40] Moved to errors --- pandas/core/common.py | 16 ++-------------- pandas/errors/__init__.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index e606be3cc2a23..6748db825acf0 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -25,7 +25,8 @@ # compat from pandas.errors import ( # noqa - PerformanceWarning, UnsupportedFunctionCall, UnsortedIndexError) + PerformanceWarning, UnsupportedFunctionCall, UnsortedIndexError, + AbstractMethodError) # back-compat of public API # deprecate these functions @@ -88,19 +89,6 @@ class SettingWithCopyWarning(Warning): pass -class AbstractMethodError(NotImplementedError): - """Raise this error instead of NotImplementedError for abstract methods - while keeping compatibility with Python 2 and Python 3. - """ - - def __init__(self, class_instance): - self.class_instance = class_instance - - def __str__(self): - msg = "This method must be defined in the concrete class of {name}" - return (msg.format(name=self.class_instance.__class__.__name__)) - - def flatten(l): """Flatten an arbitrarily nested sequence. diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 22b6d33be9d38..cfdcada801b9d 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -77,3 +77,17 @@ class NullFrequencyError(ValueError): class AccessorRegistrationWarning(Warning): """Warning for attribute conflicts in accessor registration.""" + + +class AbstractMethodError(NotImplementedError): + """Raise this error instead of NotImplementedError for abstract methods + while keeping compatibility with Python 2 and Python 3. 
+ """ + + def __init__(self, class_instance): + self.class_instance = class_instance + + def __str__(self): + msg = "This method must be defined in the concrete class of {name}" + return (msg.format(name=self.class_instance.__class__.__name__)) + From fbc846644be3e97314b203dd3eab9a36ef50a274 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 31 Jan 2018 11:50:36 -0600 Subject: [PATCH 30/40] Handle classmethods, properties --- pandas/errors/__init__.py | 16 +++++++++++++--- pandas/tests/test_errors.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index cfdcada801b9d..c1e855732f915 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -84,10 +84,20 @@ class AbstractMethodError(NotImplementedError): while keeping compatibility with Python 2 and Python 3. """ - def __init__(self, class_instance): + def __init__(self, class_instance, methodtype='method'): + types = {'method', 'classmethod', 'staticmethod', 'property'} + if methodtype not in types: + msg = 'methodtype must be one of {}, got {} instead.'.format( + methodtype, types) + raise ValueError(msg) + self.methodtype = methodtype self.class_instance = class_instance def __str__(self): - msg = "This method must be defined in the concrete class of {name}" - return (msg.format(name=self.class_instance.__class__.__name__)) + if self.methodtype == 'classmethod': + name = self.class_instance.__name__ + else: + name = self.class_instance.__class__.__name__ + msg = "This {methodtype} must be defined in the concrete class {name}" + return (msg.format(methodtype=self.methodtype, name=name)) diff --git a/pandas/tests/test_errors.py b/pandas/tests/test_errors.py index babf88ef1df8d..e2a142366a89e 100644 --- a/pandas/tests/test_errors.py +++ b/pandas/tests/test_errors.py @@ -4,6 +4,8 @@ from warnings import catch_warnings import pandas # noqa import pandas as pd +from pandas.errors import 
AbstractMethodError +import pandas.util.testing as tm @pytest.mark.parametrize( @@ -50,3 +52,30 @@ def test_error_rename(): raise ParserError() except pd.parser.CParserError: pass + + +class Foo: + @classmethod + def classmethod(cls): + raise AbstractMethodError(cls, methodtype='classmethod') + + @property + def property(self): + raise AbstractMethodError(self, methodtype='property') + + def method(self): + raise AbstractMethodError(self) + + +def test_AbstractMethodError_classmethod(): + xpr = "This classmethod must be defined in the concrete class Foo" + with tm.assert_raises_regex(AbstractMethodError, xpr): + Foo.classmethod() + + xpr = "This property must be defined in the concrete class Foo" + with tm.assert_raises_regex(AbstractMethodError, xpr): + Foo().property + + xpr = "This method must be defined in the concrete class Foo" + with tm.assert_raises_regex(AbstractMethodError, xpr): + Foo().method() From 030bb194e523b24442db77c8cbabe16315880c73 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 31 Jan 2018 11:50:45 -0600 Subject: [PATCH 31/40] Use our AbstractMethodError --- pandas/core/arrays/base.py | 48 +++++++++++++++++++----------- pandas/core/dtypes/base.py | 25 +++++++++++----- pandas/tests/dtypes/test_dtypes.py | 8 ++--- 3 files changed, 51 insertions(+), 30 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index e219652334cd5..62d7b685163b7 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1,20 +1,35 @@ """An interface for extending pandas with custom arrays.""" -import abc - -from pandas.compat import add_metaclass - +from pandas.errors import AbstractMethodError _not_implemented_message = "{} does not implement {}." -@add_metaclass(abc.ABCMeta) class ExtensionArray(object): """Abstract base class for custom 1-D array types. + pandas will recognize instances of this class as proper arrays + with a custom type and will not attempt to coerce them to objects. 
They + may be stored directly inside a :class:`DataFrame` or :class:`Series`. + Notes ----- - pandas will recognize instances of this class as proper arrays - with a custom type and will not attempt to coerce them to objects. + The interface includes the following abstract methods that must be + implemented by subclasses: + + * __getitem__ + * __len__ + * dtype + * nbytes + * isna + * take + * copy + * _formatting_values + * _concat_same_type + + This class does not inherit from 'abc.ABCMeta' for performance reasons. + Methods and properties required by the interface raise + ``pandas.errors.AbstractMethodError`` and no ``register`` method is + provided for registering virtual subclasses. ExtensionArrays are limited to 1 dimension. @@ -37,7 +52,6 @@ class ExtensionArray(object): # ------------------------------------------------------------------------ # Must be a Sequence # ------------------------------------------------------------------------ - @abc.abstractmethod def __getitem__(self, item): # type (Any) -> Any """Select a subset of self. @@ -67,6 +81,7 @@ def __getitem__(self, item): For a boolean mask, return an instance of ``ExtensionArray``, filtered to the values where ``item`` is True. 
""" + raise AbstractMethodError(self) def __setitem__(self, key, value): # type: (Any, Any) -> None @@ -74,7 +89,6 @@ def __setitem__(self, key, value): type(self), '__setitem__') ) - @abc.abstractmethod def __len__(self): """Length of this array @@ -83,16 +97,16 @@ def __len__(self): length : int """ # type: () -> int - pass + raise AbstractMethodError(self) # ------------------------------------------------------------------------ # Required attributes # ------------------------------------------------------------------------ @property - @abc.abstractmethod def dtype(self): # type: () -> ExtensionDtype """An instance of 'ExtensionDtype'.""" + raise AbstractMethodError(self) @property def shape(self): @@ -106,7 +120,6 @@ def ndim(self): return 1 @property - @abc.abstractmethod def nbytes(self): # type: () -> int """The number of bytes needed to store this object in memory. @@ -114,19 +127,19 @@ def nbytes(self): If this is expensive to compute, return an approximate lower bound on the number of bytes needed. """ + raise AbstractMethodError(self) # ------------------------------------------------------------------------ # Additional Methods # ------------------------------------------------------------------------ - @abc.abstractmethod def isna(self): # type: () -> np.ndarray """Boolean NumPy array indicating if each value is missing.""" + raise AbstractMethodError(self) # ------------------------------------------------------------------------ # Indexing methods # ------------------------------------------------------------------------ - @abc.abstractmethod def take(self, indexer, allow_fill=True, fill_value=None): # type: (Sequence[int], bool, Optional[Any]) -> ExtensionArray """Take elements from an array. 
@@ -167,8 +180,8 @@ def take(self, indexer, allow_fill=True, fill_value=None): result[mask] = self._fill_value return type(self)(result) """ + raise AbstractMethodError(self) - @abc.abstractmethod def copy(self, deep=False): # type: (bool) -> ExtensionArray """Return a copy of the array. @@ -182,6 +195,7 @@ def copy(self, deep=False): ------- ExtensionArray """ + raise AbstractMethodError(self) # ------------------------------------------------------------------------ # Block-related methods @@ -192,14 +206,13 @@ def _fill_value(self): """The missing value for this type, e.g. np.nan""" return None - @abc.abstractmethod def _formatting_values(self): # type: () -> np.ndarray # At the moment, this has to be an array since we use result.dtype """An array of values to be printed in, e.g. the Series repr""" + raise AbstractMethodError(self) @classmethod - @abc.abstractmethod def _concat_same_type(cls, to_concat): # type: (Sequence[ExtensionArray]) -> ExtensionArray """Concatenate multiple array @@ -212,6 +225,7 @@ def _concat_same_type(cls, to_concat): ------- ExtensionArray """ + raise AbstractMethodError(cls) def _can_hold_na(self): # type: () -> bool diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index b5257c444e6d3..17171e3bcb25a 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -1,19 +1,29 @@ """Extend pandas with custom array types""" -import abc +from pandas.errors import AbstractMethodError -from pandas.compat import add_metaclass - -@add_metaclass(abc.ABCMeta) class ExtensionDtype(object): """A custom data type, to be paired with an ExtensionArray. + + Notes + ----- + The interface includes the following abstract methods that must + be implemented by subclasses: + + * type + * name + * construct_from_string + + This class does not inherit from 'abc.ABCMeta' for performance reasons. 
+ Methods and properties required by the interface raise + ``pandas.errors.AbstractMethodError`` and no ``register`` method is + provided for registering virtual subclasses. """ def __str__(self): return self.name @property - @abc.abstractmethod def type(self): # type: () -> type """The scalar type for the array, e.g. ``int`` @@ -21,6 +31,7 @@ def type(self): It's expected ``ExtensionArray[item]`` returns an instance of ``ExtensionDtype.type`` for scalar ``item``. """ + raise AbstractMethodError(self) @property def kind(self): @@ -39,13 +50,13 @@ def kind(self): return 'O' @property - @abc.abstractmethod def name(self): # type: () -> str """A string identifying the data type. Will be used for display in, e.g. ``Series.dtype`` """ + raise AbstractMethodError(self) @property def names(self): @@ -54,7 +65,6 @@ def names(self): return None @classmethod - @abc.abstractmethod def construct_from_string(cls, string): """Attempt to construct this type from a string. @@ -84,6 +94,7 @@ def construct_from_string(cls, string): ... raise TypeError("Cannot construct a '{}' from " ... 
"'{}'".format(cls, string)) """ + raise AbstractMethodError(cls) @classmethod def is_dtype(cls, dtype): diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 3423e22a4c64e..eca4dd4cf2106 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -746,18 +746,14 @@ def test_categorical_categories(self): tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) -class DummyArray(object): +class DummyArray(ExtensionArray): pass -class DummyDtype(object): +class DummyDtype(ExtensionDtype): pass -ExtensionArray.register(DummyArray) -ExtensionDtype.register(DummyDtype) - - class TestExtensionArrayDtype(object): @pytest.mark.parametrize('values', [ From 0f4c2d797b7e7a791f0c96b1320bc8fe425a9a72 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 31 Jan 2018 13:22:17 -0600 Subject: [PATCH 32/40] Lint --- pandas/errors/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index c1e855732f915..af4e83f506257 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -100,4 +100,3 @@ def __str__(self): name = self.class_instance.__class__.__name__ msg = "This {methodtype} must be defined in the concrete class {name}" return (msg.format(methodtype=self.methodtype, name=name)) - From f9316e0b7d8521746015811ae8d8d55a0266da57 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 31 Jan 2018 21:06:53 -0600 Subject: [PATCH 33/40] Cleanup --- pandas/core/arrays/base.py | 11 ++++++++++- pandas/core/arrays/categorical.py | 2 +- pandas/core/dtypes/base.py | 6 +++++- pandas/core/internals.py | 9 ++++++++- 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 62d7b685163b7..1556b653819a6 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -26,6 +26,12 @@ class ExtensionArray(object): * _formatting_values * _concat_same_type + Some additional 
methods are required to satisfy pandas' internal, private + block API. + + * _concat_same_type + * _can_hold_na + This class does not inherit from 'abc.ABCMeta' for performance reasons. Methods and properties required by the interface raise ``pandas.errors.AbstractMethodError`` and no ``register`` method is @@ -134,7 +140,10 @@ def nbytes(self): # ------------------------------------------------------------------------ def isna(self): # type: () -> np.ndarray - """Boolean NumPy array indicating if each value is missing.""" + """Boolean NumPy array indicating if each value is missing. + + This should return a 1-D array the same length as 'self'. + """ raise AbstractMethodError(self) # ------------------------------------------------------------------------ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 40987cfe0f484..62c6a6b16cbe9 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2132,7 +2132,7 @@ def repeat(self, repeats, *args, **kwargs): return self._constructor(values=codes, categories=self.categories, ordered=self.ordered, fastpath=True) - # ExtensionArray Interface things + # Implement the ExtensionArray interface @property def _can_hold_na(self): return True diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 17171e3bcb25a..c7c5378801f02 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -61,7 +61,11 @@ def name(self): @property def names(self): # type: () -> Optional[List[str]] - """Ordered list of field names, or None if there are no fields.""" + """Ordered list of field names, or None if there are no fields. + + This is for compatibility with NumPy arrays, and may be removed in the + future. 
+ """ return None @classmethod diff --git a/pandas/core/internals.py b/pandas/core/internals.py index bb5057bc412ad..9e2bd21c665f7 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1680,6 +1680,13 @@ class NonConsolidatableMixIn(object): _holder = None def __init__(self, values, placement, ndim=None): + """Initialize a non-consolidatable block. + + 'ndim' may be inferred from 'placement'. + + This will continue to call __init__ for the other base + classes mixed in with this Mixin. + """ # Placement must be converted to BlockPlacement so that we can check # its length if not isinstance(placement, BlockPlacement): @@ -1806,7 +1813,7 @@ class ExtensionBlock(NonConsolidatableMixIn, Block): Notes ----- - This is the holds all 3rd-party extension types. It's also the immediate + This holds all 3rd-party extension array types. It's also the immediate parent class for our internal extension types' blocks, CategoricalBlock. ExtensionArrays are limited to 1-D. From 9c06b13d2c8ecf53d678a0ec613acea927bb3955 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 31 Jan 2018 21:07:04 -0600 Subject: [PATCH 34/40] Move ndim validation to a method. 
--- pandas/core/internals.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 9e2bd21c665f7..fdaebf57a7aa2 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -110,10 +110,10 @@ class Block(PandasObject): def __init__(self, values, placement, ndim=None): if ndim is None: ndim = values.ndim - elif self._validate_ndim and values.ndim != ndim: - raise ValueError('Wrong number of dimensions') - self.ndim = ndim + self._maybe_validate_ndim(values, ndim) + + self.ndim = ndim self.mgr_locs = placement self.values = values @@ -123,6 +123,18 @@ def __init__(self, values, placement, ndim=None): 'Wrong number of items passed {val}, placement implies ' '{mgr}'.format(val=len(self.values), mgr=len(self.mgr_locs))) + def _maybe_validate_ndim(self, values, ndim): + """Maybe check that ``values.ndim`` matches ``ndim``. + + This is not checked if ``self._validate_ndim`` is False. + + Raises + ------ + ValueError : the number of dimensions do not match + """ + if self._validate_ndim and values.ndim != ndim: + raise ValueError('Wrong number of dimensions') + @property def _consolidate_key(self): return (self._can_consolidate, self.dtype.name) From 7d2cf9cdfdfbe736c949a05eb7a81bfe15db25a1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 1 Feb 2018 07:12:08 -0600 Subject: [PATCH 35/40] Try this --- pandas/core/internals.py | 53 +++++++++++++++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index fdaebf57a7aa2..1841501c1b601 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2523,13 +2523,29 @@ class DatetimeBlock(DatetimeLikeBlockMixin, Block): _can_hold_na = True def __init__(self, values, placement, ndim=None): - if values.dtype != _NS_DTYPE and values.dtype.base != _NS_DTYPE: - # not datetime64 or datetime64tz - values = 
conversion.ensure_datetime64ns(values) - super(DatetimeBlock, self).__init__(values, placement=placement, ndim=ndim) + def _maybe_coerce_values(self, values): + """Input validation for values passed to __init__. Ensure that + we have datetime64ns, coercing if necessary. + + Parameters + ---------- + values : array-like + Must be convertible to datetime64 + + Returns + ------- + values : ndarray[datetime64ns] + + Overridden by DatetimeTZBlock. + """ + if values.dtype != _NS_DTYPE: + values = conversion.ensure_datetime64ns(values) + return values + def _astype(self, dtype, mgr=None, **kwargs): """ these automatically copy, so copy=True has no effect @@ -2660,7 +2676,33 @@ class DatetimeTZBlock(NonConsolidatableMixIn, DatetimeBlock): is_datetimetz = True def __init__(self, values, placement, ndim=2, dtype=None): + # XXX: This will end up calling _maybe_coerce_values twice + # when dtype is not None. It's relatively cheap (just an isinstance) + # but it'd be nice to avoid. + # + # If we can remove dtype from __init__, and push that conversion + # onto the callers, then we can remove this entire __init__ + # and just use DatetimeBlock's. + if dtype is not None: + values = self._maybe_coerce_values(values, dtype=dtype) + super(DatetimeTZBlock, self).__init__(values, placement=placement, + ndim=ndim) + + def _maybe_coerce_values(self, values, dtype=None): + """Input validation for values passed to __init__. Ensure that + we have datetime64TZ, coercing if necessary. 
+ + Parameters + ---------- + values : array-like + Must be convertible to datetime64 + dtype : string or DatetimeTZDtype, optional + Does a shallow copy to this tz + Returns + ------- + values : ndarray[datetime64ns] + """ if not isinstance(values, self._holder): values = self._holder(values) @@ -2672,8 +2714,7 @@ def __init__(self, values, placement, ndim=2, dtype=None): if values.tz is None: raise ValueError("cannot create a DatetimeTZBlock without a tz") - super(DatetimeTZBlock, self).__init__(values, placement=placement, - ndim=ndim) + return values def copy(self, deep=True, mgr=None): """ copy constructor """ From afae8ae9563142ee1c3b29158269f8f38e3f9e1c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 1 Feb 2018 08:06:06 -0600 Subject: [PATCH 36/40] Make ExtensionBlock._holder a property Removed ExtensionBlock.__init__ --- pandas/core/internals.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 1841501c1b601..7e66c6e04a010 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1830,9 +1830,11 @@ class ExtensionBlock(NonConsolidatableMixIn, Block): ExtensionArrays are limited to 1-D. """ - def __init__(self, values, placement, ndim=None): - self._holder = type(values) - super(ExtensionBlock, self).__init__(values, placement, ndim=ndim) + @property + def _holder(self): + # For extension blocks, the holder is values-dependent so we + # use a property. 
+ return type(self.values) @property def is_view(self): From cd0997e354121bc0414ef1675cb5b8241944a9b0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 1 Feb 2018 08:27:53 -0600 Subject: [PATCH 37/40] Make _holder a property for all --- pandas/core/internals.py | 35 +++++++++++++++++++----- pandas/tests/internals/test_internals.py | 14 +++++++++- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 7e66c6e04a010..767c890d7a63b 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -64,6 +64,7 @@ from pandas.core.indexing import maybe_convert_indices, length_of_indexer from pandas.core.arrays import Categorical from pandas.core.indexes.datetimes import DatetimeIndex +from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.io.formats.printing import pprint_thing import pandas.core.missing as missing @@ -104,7 +105,6 @@ class Block(PandasObject): _verify_integrity = True _validate_ndim = True _ftype = 'dense' - _holder = None _concatenator = staticmethod(np.concatenate) def __init__(self, values, placement, ndim=None): @@ -135,6 +135,15 @@ def _maybe_validate_ndim(self, values, ndim): if self._validate_ndim and values.ndim != ndim: raise ValueError('Wrong number of dimensions') + @property + def _holder(self): + """The array-like that can hold the underlying values. + + None for 'Block', overridden by subclasses that don't + use an ndarray. + """ + return None + @property def _consolidate_key(self): return (self._can_consolidate, self.dtype.name) @@ -1689,7 +1698,6 @@ class NonConsolidatableMixIn(object): _can_consolidate = False _verify_integrity = False _validate_ndim = False - _holder = None def __init__(self, values, placement, ndim=None): """Initialize a non-consolidatable block. 
@@ -1832,8 +1840,7 @@ class ExtensionBlock(NonConsolidatableMixIn, Block): """ @property def _holder(self): - # For extension blocks, the holder is values-dependent so we - # use a property. + # For extension blocks, the holder is values-dependent. return type(self.values) @property @@ -2012,6 +2019,11 @@ def should_store(self, value): class DatetimeLikeBlockMixin(object): + """Mixin class for DatetimeBlock and DatetimeTZBlock.""" + + @property + def _holder(self): + return DatetimeIndex @property def _na_value(self): @@ -2044,6 +2056,10 @@ def __init__(self, values, placement, ndim=None): super(TimeDeltaBlock, self).__init__(values, placement=placement, ndim=ndim) + @property + def _holder(self): + return TimedeltaIndex + @property def _box_func(self): return lambda x: tslib.Timedelta(x, unit='ns') @@ -2424,7 +2440,6 @@ class CategoricalBlock(ExtensionBlock): is_categorical = True _verify_integrity = True _can_hold_na = True - _holder = Categorical _concatenator = staticmethod(_concat._concat_categorical) def __init__(self, values, placement, ndim=None): @@ -2435,6 +2450,10 @@ def __init__(self, values, placement, ndim=None): placement=placement, ndim=ndim) + @property + def _holder(self): + return Categorical + @property def array_dtype(self): """ the dtype to return if I want to construct this block as an @@ -2673,7 +2692,6 @@ def set(self, locs, values, check=False): class DatetimeTZBlock(NonConsolidatableMixIn, DatetimeBlock): """ implement a datetime64 block with a tz attribute """ __slots__ = () - _holder = DatetimeIndex _concatenator = staticmethod(_concat._concat_datetime) is_datetimetz = True @@ -2856,7 +2874,6 @@ class SparseBlock(NonConsolidatableMixIn, Block): _box_to_block_values = False _can_hold_na = True _ftype = 'sparse' - _holder = SparseArray _concatenator = staticmethod(_concat._concat_sparse) def __init__(self, values, placement, ndim=None): @@ -2866,6 +2883,10 @@ def __init__(self, values, placement, ndim=None): assert isinstance(values, 
SparseArray) super(SparseBlock, self).__init__(values, placement, ndim=ndim) + @property + def _holder(self): + return SparseArray + @property def shape(self): return (len(self.mgr_locs), self.sp_index.length) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index e3490f465b24a..c8ccf23ebcf66 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -11,7 +11,7 @@ from distutils.version import LooseVersion import itertools from pandas import (Index, MultiIndex, DataFrame, DatetimeIndex, - Series, Categorical) + Series, Categorical, TimedeltaIndex, SparseArray) from pandas.compat import OrderedDict, lrange from pandas.core.sparse.array import SparseArray from pandas.core.internals import (BlockPlacement, SingleBlockManager, @@ -1263,6 +1263,18 @@ def test_binop_other(self, op, value, dtype): assert_series_equal(result, expected) +@pytest.mark.parametrize('typestr, holder', [ + ('category', Categorical), + ('M8[ns]', DatetimeIndex), + ('M8[ns, US/Central]', DatetimeIndex), + ('m8[ns]', TimedeltaIndex), + ('sparse', SparseArray), +]) +def test_holder(typestr, holder): + blk = create_block(typestr, [1]) + assert blk._holder is holder + + def test_deprecated_fastpath(): # GH#19265 values = np.random.rand(3, 3) From 1d6eb049d0ad7546b1193c718cea773d8799e7cd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 1 Feb 2018 10:47:45 -0600 Subject: [PATCH 38/40] Refactored validate_ndim --- pandas/core/internals.py | 33 +++++++++++++++++------- pandas/tests/internals/test_internals.py | 9 +++++++ 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 767c890d7a63b..a271114274fc6 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -108,12 +108,7 @@ class Block(PandasObject): _concatenator = staticmethod(np.concatenate) def __init__(self, values, placement, ndim=None): - if ndim is None: - ndim = 
values.ndim - - self._maybe_validate_ndim(values, ndim) - - self.ndim = ndim + self.ndim = self._check_ndim(values, ndim) self.mgr_locs = placement self.values = values @@ -123,17 +118,35 @@ def __init__(self, values, placement, ndim=None): 'Wrong number of items passed {val}, placement implies ' '{mgr}'.format(val=len(self.values), mgr=len(self.mgr_locs))) - def _maybe_validate_ndim(self, values, ndim): - """Maybe check that ``values.ndim`` matches ``ndim``. + def _check_ndim(self, values, ndim): + """ndim inference and validation. - This is not checked if ``self._validate_ndim`` is False. + Infers ndim from 'values' if not provided to __init__. + Validates that values.ndim and ndim are consistent if and only if + the class variable '_validate_ndim' is True. + + Parameters + ---------- + values : array-like + ndim : int or None + + Returns + ------- + ndim : int Raises ------ ValueError : the number of dimensions do not match """ + if ndim is None: + ndim = values.ndim + if self._validate_ndim and values.ndim != ndim: - raise ValueError('Wrong number of dimensions') + msg = ("Wrong number of dimensions. values.ndim != ndim " + "[{} != {}]") + raise ValueError(msg.format(values.ndim, ndim)) + + return ndim @property def _holder(self): diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index c8ccf23ebcf66..a45b5cb48d914 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1281,3 +1281,12 @@ def test_deprecated_fastpath(): with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): make_block(values, placement=np.arange(3), fastpath=True) + + +def test_validate_ndim(): + values = np.array([1.0, 2.0]) + placement = slice(2) + msg = "Wrong number of dimensions. 
values.ndim != ndim \[1 != 2\]" + + with tm.assert_raises_regex(ValueError, msg): + make_block(values, placement, ndim=2) From 92aed49f4c49b89ae09045ee563f6b0de5f2e6bd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 1 Feb 2018 11:21:16 -0600 Subject: [PATCH 39/40] fixup! Refactored validate_ndim --- pandas/core/internals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index a271114274fc6..cef5b776eff66 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -112,7 +112,7 @@ def __init__(self, values, placement, ndim=None): self.mgr_locs = placement self.values = values - if (self._validate_ndim and ndim and + if (self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values)): raise ValueError( 'Wrong number of items passed {val}, placement implies ' From 34134f2f9633cca26b4efd382475927c1eb3fe5a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 1 Feb 2018 14:54:53 -0600 Subject: [PATCH 40/40] lint --- pandas/tests/internals/test_internals.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index a45b5cb48d914..9338aba90d7cb 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -13,7 +13,6 @@ from pandas import (Index, MultiIndex, DataFrame, DatetimeIndex, Series, Categorical, TimedeltaIndex, SparseArray) from pandas.compat import OrderedDict, lrange -from pandas.core.sparse.array import SparseArray from pandas.core.internals import (BlockPlacement, SingleBlockManager, make_block, BlockManager) import pandas.core.algorithms as algos