From e239ab308ac6e08edb6a458b83acd52d281e568b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 28 Dec 2018 14:59:32 -0800 Subject: [PATCH] searchsorted, repeat broken off from #24024 (#24461) --- pandas/core/arrays/datetimelike.py | 135 +++++++++++++++++++++++ pandas/core/arrays/datetimes.py | 21 ++++ pandas/core/arrays/period.py | 22 ++++ pandas/core/arrays/timedeltas.py | 17 +++ pandas/core/indexes/datetimelike.py | 1 + pandas/tests/arrays/test_datetimelike.py | 53 +++++++++ 6 files changed, 249 insertions(+) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 2acb08b696506..df2b5977bbe7c 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -12,6 +12,7 @@ from pandas._libs.tslibs.timestamps import ( RoundTo, maybe_integer_op_deprecated, round_nsint64) import pandas.compat as compat +from pandas.compat.numpy import function as nv from pandas.errors import ( AbstractMethodError, NullFrequencyError, PerformanceWarning) from pandas.util._decorators import Appender, Substitution, deprecate_kwarg @@ -82,6 +83,79 @@ def _get_attributes_dict(self): """ return {k: getattr(self, k, None) for k in self._attributes} + @property + def _scalar_type(self): + # type: () -> Union[type, Tuple[type]] + """The scalar associated with this datelike + + * PeriodArray : Period + * DatetimeArray : Timestamp + * TimedeltaArray : Timedelta + """ + raise AbstractMethodError(self) + + def _scalar_from_string(self, value): + # type: (str) -> Union[Period, Timestamp, Timedelta, NaTType] + """ + Construct a scalar type from a string. + + Parameters + ---------- + value : str + + Returns + ------- + Period, Timestamp, or Timedelta, or NaT + Whatever the type of ``self._scalar_type`` is. + + Notes + ----- + This should call ``self._check_compatible_with`` before + unboxing the result. 
+ """ + raise AbstractMethodError(self) + + def _unbox_scalar(self, value): + # type: (Union[Period, Timestamp, Timedelta, NaTType]) -> int + """ + Unbox the integer value of a scalar `value`. + + Parameters + ---------- + value : Union[Period, Timestamp, Timedelta] + + Returns + ------- + int + + Examples + -------- + >>> self._unbox_scalar(Timedelta('10s')) # doctest: +SKIP + 10000000000 + """ + raise AbstractMethodError(self) + + def _check_compatible_with(self, other): + # type: (Union[Period, Timestamp, Timedelta, NaTType]) -> None + """ + Verify that `self` and `other` are compatible. + + * DatetimeArray verifies that the timezones (if any) match + * PeriodArray verifies that the freq matches + * TimedeltaArray has no verification + + In each case, NaT is considered compatible. + + Parameters + ---------- + other + + Raises + ------ + Exception + """ + raise AbstractMethodError(self) + class DatelikeOps(object): """ @@ -515,6 +589,67 @@ def _values_for_factorize(self): def _from_factorized(cls, values, original): return cls(values, dtype=original.dtype) + def _values_for_argsort(self): + return self._data + + # ------------------------------------------------------------------ + # Additional array methods + # These are not part of the EA API, but we implement them because + # pandas assumes they're there. + + def searchsorted(self, value, side='left', sorter=None): + """ + Find indices where elements should be inserted to maintain order. + + Find the indices into a sorted array `self` such that, if the + corresponding elements in `value` were inserted before the indices, + the order of `self` would be preserved. + + Parameters + ---------- + value : array_like + Values to insert into `self`. + side : {'left', 'right'}, optional + If 'left', the index of the first suitable location found is given. + If 'right', return the last such index. If there is no suitable + index, return either 0 or N (where N is the length of `self`). 
+ sorter : 1-D array_like, optional + Optional array of integer indices that sort `self` into ascending + order. They are typically the result of ``np.argsort``. + + Returns + ------- + indices : array of ints + Array of insertion points with the same shape as `value`. + """ + if isinstance(value, compat.string_types): + value = self._scalar_from_string(value) + + if not (isinstance(value, (self._scalar_type, type(self))) + or isna(value)): + raise ValueError("Unexpected type for 'value': {valtype}" + .format(valtype=type(value))) + + self._check_compatible_with(value) + if isinstance(value, type(self)): + value = value.asi8 + else: + value = self._unbox_scalar(value) + + return self.asi8.searchsorted(value, side=side, sorter=sorter) + + def repeat(self, repeats, *args, **kwargs): + """ + Repeat elements of an array. + + See Also + -------- + numpy.ndarray.repeat + """ + nv.validate_repeat(args, kwargs) + values = self._data.repeat(repeats) + return type(self)(values, dtype=self.dtype) + # ------------------------------------------------------------------ # Null Handling diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 966511d048421..79dcc677973cc 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -171,6 +171,7 @@ class DatetimeArrayMixin(dtl.DatetimeLikeArrayMixin, _data """ _typ = "datetimearray" + _scalar_type = Timestamp # define my properties & methods for delegation _bool_ops = ['is_month_start', 'is_month_end', @@ -347,6 +348,26 @@ def _generate_range(cls, start, end, periods, freq, tz=None, return cls._simple_new(index.asi8, freq=freq, tz=tz) + # ----------------------------------------------------------------- + # DatetimeLike Interface + + def _unbox_scalar(self, value): + if not isinstance(value, self._scalar_type) and value is not NaT: + raise ValueError("'value' should be a Timestamp.") + if not isna(value): + self._check_compatible_with(value) + return value.value + + def 
_scalar_from_string(self, value): + return Timestamp(value, tz=self.tz) + + def _check_compatible_with(self, other): + if other is NaT: + return + if not timezones.tz_compare(self.tz, other.tz): + raise ValueError("Timezones don't match. '{own} != {other}'" + .format(own=self.tz, other=other.tz)) + # ----------------------------------------------------------------- # Descriptive Properties diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 2c7ee5b277a90..2a7422aedb8a3 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -137,6 +137,7 @@ class PeriodArray(dtl.DatetimeLikeArrayMixin, __array_priority__ = 1000 _attributes = ["freq"] _typ = "periodarray" # ABCPeriodArray + _scalar_type = Period # Names others delegate to us _other_ops = [] @@ -240,7 +241,28 @@ def _generate_range(cls, start, end, periods, freq, fields): return subarr, freq + # ----------------------------------------------------------------- + # DatetimeLike Interface + + def _unbox_scalar(self, value): + # type: (Union[Period, NaTType]) -> int + if value is NaT: + return value.value + elif isinstance(value, self._scalar_type): + if not isna(value): + self._check_compatible_with(value) + return value.ordinal + else: + raise ValueError("'value' should be a Period. Got '{val}' instead." 
+ .format(val=value)) + + def _scalar_from_string(self, value): + # type: (str) -> Period + return Period(value, freq=self.freq) + def _check_compatible_with(self, other): + if other is NaT: + return if self.freqstr != other.freqstr: _raise_on_incompatible(self, other) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 2c7187f85517f..376c99df080d8 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -116,6 +116,7 @@ def wrapper(self, other): class TimedeltaArrayMixin(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): _typ = "timedeltaarray" + _scalar_type = Timedelta __array_priority__ = 1000 # define my properties & methods for delegation _other_ops = [] @@ -221,6 +222,22 @@ def _generate_range(cls, start, end, periods, freq, closed=None): return cls._simple_new(index, freq=freq) + # ---------------------------------------------------------------- + # DatetimeLike Interface + + def _unbox_scalar(self, value): + if not isinstance(value, self._scalar_type) and value is not NaT: + raise ValueError("'value' should be a Timedelta.") + self._check_compatible_with(value) + return value.value + + def _scalar_from_string(self, value): + return Timedelta(value) + + def _check_compatible_with(self, other): + # we don't have anything to validate. 
+ pass + # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 8178f7e9c6469..0fe8f73977e6b 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -477,6 +477,7 @@ def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) freq = self.freq if is_period_dtype(self) else None return self._shallow_copy(self.asi8.repeat(repeats), freq=freq) + # TODO: dispatch to _eadata @Appender(_index_shared_docs['where'] % _index_doc_kwargs) def where(self, cond, other=None): diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index ebe84232d7f6d..483f25513775e 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -2,6 +2,8 @@ import numpy as np import pytest +import pandas.compat as compat + import pandas as pd from pandas.core.arrays import ( DatetimeArrayMixin as DatetimeArray, PeriodArray, @@ -129,6 +131,57 @@ def test_concat_same_type(self): tm.assert_index_equal(self.index_cls(result), expected) + def test_unbox_scalar(self): + data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 + arr = self.array_cls(data, freq='D') + result = arr._unbox_scalar(arr[0]) + assert isinstance(result, (int, compat.long)) + + result = arr._unbox_scalar(pd.NaT) + assert isinstance(result, (int, compat.long)) + + with pytest.raises(ValueError): + arr._unbox_scalar('foo') + + def test_check_compatible_with(self): + data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 + arr = self.array_cls(data, freq='D') + + arr._check_compatible_with(arr[0]) + arr._check_compatible_with(arr[:1]) + arr._check_compatible_with(pd.NaT) + + def test_scalar_from_string(self): + data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 + arr = self.array_cls(data, freq='D') + result = arr._scalar_from_string(str(arr[0])) + assert 
result == arr[0] + + def test_searchsorted(self): + data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 + arr = self.array_cls(data, freq='D') + + # scalar + result = arr.searchsorted(arr[1]) + assert result == 1 + + result = arr.searchsorted(arr[2], side="right") + assert result == 3 + + # own-type + result = arr.searchsorted(arr[1:3]) + expected = np.array([1, 2], dtype=np.int64) + tm.assert_numpy_array_equal(result, expected) + + result = arr.searchsorted(arr[1:3], side="right") + expected = np.array([2, 3], dtype=np.int64) + tm.assert_numpy_array_equal(result, expected) + + # Following numpy convention, NaT goes at the beginning + # (unlike NaN which goes at the end) + result = arr.searchsorted(pd.NaT) + assert result == 0 + class TestDatetimeArray(SharedTests): index_cls = pd.DatetimeIndex