Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Fix IntervalIndex constructor inconsistencies #18424

Merged
merged 1 commit into from
Nov 24, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.22.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ Other API Changes
- `tseries.frequencies.get_freq_group()` and `tseries.frequencies.DAYS` are removed from the public API (:issue:`18034`)
- :func:`Series.truncate` and :func:`DataFrame.truncate` will raise a ``ValueError`` if the index is not sorted instead of an unhelpful ``KeyError`` (:issue:`17935`)
- :func:`Dataframe.unstack` will now default to filling with ``np.nan`` for ``object`` columns. (:issue:`12815`)
- :class:`IntervalIndex` constructor will raise if the ``closed`` parameter conflicts with how the input data is inferred to be closed (:issue:`18421`)


.. _whatsnew_0220.deprecations:
Expand Down Expand Up @@ -137,6 +138,7 @@ Indexing
- Bug in :func:`Series.truncate` which raises ``TypeError`` with a monotonic ``PeriodIndex`` (:issue:`17717`)
- Bug in :func:`DataFrame.groupby` where tuples were interpreted as lists of keys rather than as keys (:issue:`17979`, :issue:`18249`)
- Bug in :func:`MultiIndex.remove_unused_levels`` which would fill nan values (:issue:`18417`)
- Bug in :class:`IntervalIndex` where empty and purely NA data was constructed inconsistently depending on the construction method (:issue:`18421`)
-

I/O
Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/interval.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -211,8 +211,8 @@ cpdef intervals_to_interval_bounds(ndarray intervals):
int64_t n = len(intervals)
ndarray left, right

left = np.empty(n, dtype=object)
right = np.empty(n, dtype=object)
left = np.empty(n, dtype=intervals.dtype)
right = np.empty(n, dtype=intervals.dtype)

for i in range(len(intervals)):
interval = intervals[i]
Expand Down
57 changes: 39 additions & 18 deletions pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pandas.core.dtypes.missing import notna, isna
from pandas.core.dtypes.generic import ABCPeriodIndex
from pandas.core.dtypes.dtypes import IntervalDtype
from pandas.core.dtypes.cast import maybe_convert_platform
from pandas.core.dtypes.common import (
_ensure_platform_int,
is_list_like,
Expand All @@ -31,7 +32,9 @@
from pandas.core.indexes.timedeltas import timedelta_range
from pandas.core.indexes.multi import MultiIndex
from pandas.compat.numpy import function as nv
from pandas.core import common as com
from pandas.core.common import (
_all_not_none, _any_none, _asarray_tuplesafe, _count_not_none,
is_bool_indexer, _maybe_box_datetimelike, _not_none)
from pandas.util._decorators import cache_readonly, Appender
from pandas.core.config import get_option
from pandas.tseries.frequencies import to_offset
Expand Down Expand Up @@ -176,7 +179,7 @@ class IntervalIndex(IntervalMixin, Index):

_mask = None

def __new__(cls, data, closed='right',
def __new__(cls, data, closed=None,
name=None, copy=False, dtype=None,
fastpath=False, verify_integrity=True):

Expand All @@ -197,8 +200,17 @@ def __new__(cls, data, closed='right',
if is_scalar(data):
cls._scalar_data_error(data)

data = IntervalIndex.from_intervals(data, name=name)
left, right, closed = data.left, data.right, data.closed
data = maybe_convert_platform(data)
left, right, infer_closed = intervals_to_interval_bounds(data)

if _all_not_none(closed, infer_closed) and closed != infer_closed:
# GH 18421
msg = ("conflicting values for closed: constructor got "
"'{closed}', inferred from data '{infer_closed}'"
.format(closed=closed, infer_closed=infer_closed))
raise ValueError(msg)

closed = closed or infer_closed

return cls._simple_new(left, right, closed, name,
copy=copy, verify_integrity=verify_integrity)
Expand Down Expand Up @@ -376,7 +388,8 @@ def from_breaks(cls, breaks, closed='right', name=None, copy=False):
IntervalIndex.from_tuples : Construct an IntervalIndex from a
list/array of tuples
"""
breaks = np.asarray(breaks)
breaks = maybe_convert_platform(breaks)

return cls.from_arrays(breaks[:-1], breaks[1:], closed,
name=name, copy=copy)

Expand Down Expand Up @@ -416,8 +429,9 @@ def from_arrays(cls, left, right, closed='right', name=None, copy=False):
IntervalIndex.from_tuples : Construct an IntervalIndex from a
list/array of tuples
"""
left = np.asarray(left)
right = np.asarray(right)
left = maybe_convert_platform(left)
right = maybe_convert_platform(right)

return cls._simple_new(left, right, closed, name=name,
copy=copy, verify_integrity=True)

Expand Down Expand Up @@ -460,8 +474,12 @@ def from_intervals(cls, data, name=None, copy=False):
IntervalIndex.from_tuples : Construct an IntervalIndex from a
list/array of tuples
"""
data = np.asarray(data)
left, right, closed = intervals_to_interval_bounds(data)
if isinstance(data, IntervalIndex):
left, right, closed = data.left, data.right, data.closed
name = name or data.name
else:
data = maybe_convert_platform(data)
left, right, closed = intervals_to_interval_bounds(data)
return cls.from_arrays(left, right, closed, name=name, copy=False)

@classmethod
Expand Down Expand Up @@ -497,8 +515,11 @@ def from_tuples(cls, data, closed='right', name=None, copy=False):
IntervalIndex.from_intervals : Construct an IntervalIndex from an array
of Interval objects
"""
left = []
right = []
if len(data):
left, right = [], []
else:
left = right = data

for d in data:

if isna(d):
Expand All @@ -517,7 +538,7 @@ def from_tuples(cls, data, closed='right', name=None, copy=False):
return cls.from_arrays(left, right, closed, name=name, copy=False)

def to_tuples(self):
return Index(com._asarray_tuplesafe(zip(self.left, self.right)))
return Index(_asarray_tuplesafe(zip(self.left, self.right)))

@cache_readonly
def _multiindex(self):
Expand Down Expand Up @@ -838,7 +859,7 @@ def get_loc(self, key, method=None):
return self._engine.get_loc(key)

def get_value(self, series, key):
if com.is_bool_indexer(key):
if is_bool_indexer(key):
loc = key
elif is_list_like(key):
loc = self.get_indexer(key)
Expand Down Expand Up @@ -1166,7 +1187,7 @@ def _is_type_compatible(a, b):
return ((is_number(a) and is_number(b)) or
(is_ts_compat(a) and is_ts_compat(b)) or
(is_td_compat(a) and is_td_compat(b)) or
com._any_none(a, b))
_any_none(a, b))


def interval_range(start=None, end=None, periods=None, freq=None,
Expand Down Expand Up @@ -1244,13 +1265,13 @@ def interval_range(start=None, end=None, periods=None, freq=None,
--------
IntervalIndex : an Index of intervals that are all closed on the same side.
"""
if com._count_not_none(start, end, periods) != 2:
if _count_not_none(start, end, periods) != 2:
raise ValueError('Of the three parameters: start, end, and periods, '
'exactly two must be specified')

start = com._maybe_box_datetimelike(start)
end = com._maybe_box_datetimelike(end)
endpoint = next(com._not_none(start, end))
start = _maybe_box_datetimelike(start)
end = _maybe_box_datetimelike(end)
endpoint = next(_not_none(start, end))

if not _is_valid_endpoint(start):
msg = 'start must be numeric or datetime-like, got {start}'
Expand Down
126 changes: 90 additions & 36 deletions pandas/tests/indexes/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pandas import (Interval, IntervalIndex, Index, isna,
interval_range, Timestamp, Timedelta,
compat, date_range, timedelta_range, DateOffset)
from pandas.compat import zip
from pandas.compat import lzip
from pandas.tseries.offsets import Day
from pandas._libs.interval import IntervalTree
from pandas.tests.indexes.common import Base
Expand Down Expand Up @@ -38,7 +38,7 @@ def create_index_with_nan(self, closed='right'):
@pytest.mark.parametrize('name', [None, 'foo'])
def test_constructors(self, closed, name):
left, right = Index([0, 1, 2, 3]), Index([1, 2, 3, 4])
ivs = [Interval(l, r, closed=closed) for l, r in zip(left, right)]
ivs = [Interval(l, r, closed=closed) for l, r in lzip(left, right)]
expected = IntervalIndex._simple_new(
left=left, right=right, closed=closed, name=name)

Expand All @@ -57,7 +57,7 @@ def test_constructors(self, closed, name):
tm.assert_index_equal(result, expected)

result = IntervalIndex.from_tuples(
zip(left, right), closed=closed, name=name)
lzip(left, right), closed=closed, name=name)
tm.assert_index_equal(result, expected)

result = Index(ivs, name=name)
Expand All @@ -68,6 +68,9 @@ def test_constructors(self, closed, name):
tm.assert_index_equal(Index(expected), expected)
tm.assert_index_equal(IntervalIndex(expected), expected)

result = IntervalIndex.from_intervals(expected)
tm.assert_index_equal(result, expected)

result = IntervalIndex.from_intervals(
expected.values, name=expected.name)
tm.assert_index_equal(result, expected)
Expand All @@ -86,63 +89,118 @@ def test_constructors(self, closed, name):
breaks, closed=expected.closed, name=expected.name)
tm.assert_index_equal(result, expected)

def test_constructors_other(self):

# all-nan
result = IntervalIndex.from_intervals([np.nan])
expected = np.array([np.nan], dtype=object)
tm.assert_numpy_array_equal(result.values, expected)

# empty
result = IntervalIndex.from_intervals([])
expected = np.array([], dtype=object)
tm.assert_numpy_array_equal(result.values, expected)
@pytest.mark.parametrize('data', [[np.nan], [np.nan] * 2, [np.nan] * 50])
def test_constructors_nan(self, closed, data):
# GH 18421
expected_values = np.array(data, dtype=object)
expected_idx = IntervalIndex(data, closed=closed)

# validate the expected index
assert expected_idx.closed == closed
tm.assert_numpy_array_equal(expected_idx.values, expected_values)

result = IntervalIndex.from_tuples(data, closed=closed)
tm.assert_index_equal(result, expected_idx)
tm.assert_numpy_array_equal(result.values, expected_values)

result = IntervalIndex.from_breaks([np.nan] + data, closed=closed)
tm.assert_index_equal(result, expected_idx)
tm.assert_numpy_array_equal(result.values, expected_values)

result = IntervalIndex.from_arrays(data, data, closed=closed)
tm.assert_index_equal(result, expected_idx)
tm.assert_numpy_array_equal(result.values, expected_values)

if closed == 'right':
# Can't specify closed for IntervalIndex.from_intervals
result = IntervalIndex.from_intervals(data)
tm.assert_index_equal(result, expected_idx)
tm.assert_numpy_array_equal(result.values, expected_values)

@pytest.mark.parametrize('data', [
[],
np.array([], dtype='int64'),
np.array([], dtype='float64'),
np.array([], dtype=object)])
def test_constructors_empty(self, data, closed):
# GH 18421
expected_dtype = data.dtype if isinstance(data, np.ndarray) else object
expected_values = np.array([], dtype=object)
expected_index = IntervalIndex(data, closed=closed)

# validate the expected index
assert expected_index.empty
assert expected_index.closed == closed
assert expected_index.dtype.subtype == expected_dtype
tm.assert_numpy_array_equal(expected_index.values, expected_values)

result = IntervalIndex.from_tuples(data, closed=closed)
tm.assert_index_equal(result, expected_index)
tm.assert_numpy_array_equal(result.values, expected_values)

result = IntervalIndex.from_breaks(data, closed=closed)
tm.assert_index_equal(result, expected_index)
tm.assert_numpy_array_equal(result.values, expected_values)

result = IntervalIndex.from_arrays(data, data, closed=closed)
tm.assert_index_equal(result, expected_index)
tm.assert_numpy_array_equal(result.values, expected_values)

if closed == 'right':
# Can't specify closed for IntervalIndex.from_intervals
result = IntervalIndex.from_intervals(data)
tm.assert_index_equal(result, expected_index)
tm.assert_numpy_array_equal(result.values, expected_values)

def test_constructors_errors(self):

# scalar
msg = ('IntervalIndex(...) must be called with a collection of '
msg = ('IntervalIndex\(...\) must be called with a collection of '
'some kind, 5 was passed')
with pytest.raises(TypeError, message=msg):
with tm.assert_raises_regex(TypeError, msg):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For some reason pytest.raises didn't appear to actually be checking the message, so I switched to tm.assert_raises_regex.

IntervalIndex(5)

# not an interval
msg = "type <class 'numpy.int32'> with value 0 is not an interval"
with pytest.raises(TypeError, message=msg):
msg = ("type <(class|type) 'numpy.int64'> with value 0 "
"is not an interval")
with tm.assert_raises_regex(TypeError, msg):
IntervalIndex([0, 1])

with pytest.raises(TypeError, message=msg):
with tm.assert_raises_regex(TypeError, msg):
IntervalIndex.from_intervals([0, 1])

# invalid closed
msg = "invalid options for 'closed': invalid"
with pytest.raises(ValueError, message=msg):
with tm.assert_raises_regex(ValueError, msg):
IntervalIndex.from_arrays([0, 1], [1, 2], closed='invalid')

# mismatched closed
# mismatched closed within intervals
msg = 'intervals must all be closed on the same side'
with pytest.raises(ValueError, message=msg):
with tm.assert_raises_regex(ValueError, msg):
IntervalIndex.from_intervals([Interval(0, 1),
Interval(1, 2, closed='left')])

with pytest.raises(ValueError, message=msg):
IntervalIndex.from_arrays([0, 10], [3, 5])
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test was in the wrong section; raised a ValueError but should have been under the "decreasing breaks/arrays" commented section. Already have a relevant test there, so deleted.


with pytest.raises(ValueError, message=msg):
with tm.assert_raises_regex(ValueError, msg):
Index([Interval(0, 1), Interval(2, 3, closed='left')])

# mismatched closed inferred from intervals vs constructor.
msg = 'conflicting values for closed'
with tm.assert_raises_regex(ValueError, msg):
iv = [Interval(0, 1, closed='both'), Interval(1, 2, closed='both')]
IntervalIndex(iv, closed='neither')

# no point in nesting periods in an IntervalIndex
msg = 'Period dtypes are not supported, use a PeriodIndex instead'
with pytest.raises(ValueError, message=msg):
with tm.assert_raises_regex(ValueError, msg):
IntervalIndex.from_breaks(
pd.period_range('2000-01-01', periods=3))

# decreasing breaks/arrays
msg = 'left side of interval must be <= right side'
with pytest.raises(ValueError, message=msg):
with tm.assert_raises_regex(ValueError, msg):
IntervalIndex.from_breaks(range(10, -1, -1))

with pytest.raises(ValueError, message=msg):
with tm.assert_raises_regex(ValueError, msg):
IntervalIndex.from_arrays(range(10, -1, -1), range(9, -2, -1))

def test_constructors_datetimelike(self, closed):
Expand Down Expand Up @@ -865,23 +923,23 @@ def test_is_non_overlapping_monotonic(self, closed):
idx = IntervalIndex.from_tuples(tpls, closed=closed)
assert idx.is_non_overlapping_monotonic is True

idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed)
idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed)
assert idx.is_non_overlapping_monotonic is True

# Should be False in all cases (overlapping)
tpls = [(0, 2), (1, 3), (4, 5), (6, 7)]
idx = IntervalIndex.from_tuples(tpls, closed=closed)
assert idx.is_non_overlapping_monotonic is False

idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed)
idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed)
assert idx.is_non_overlapping_monotonic is False

# Should be False in all cases (non-monotonic)
tpls = [(0, 1), (2, 3), (6, 7), (4, 5)]
idx = IntervalIndex.from_tuples(tpls, closed=closed)
assert idx.is_non_overlapping_monotonic is False

idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed)
idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed)
assert idx.is_non_overlapping_monotonic is False

# Should be False for closed='both', overwise True (GH16560)
Expand Down Expand Up @@ -1054,10 +1112,6 @@ def test_constructor_coverage(self):
end=end.to_pydatetime())
tm.assert_index_equal(result, expected)

result = pd.interval_range(start=start.tz_localize('UTC'),
end=end.tz_localize('UTC'))
tm.assert_index_equal(result, expected)
Copy link
Member Author

@jschendel jschendel Nov 22, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Deleted this check because it's invalid; just didn't appear as so until these fixes went in and fixed an unrelated issue. Here expected doesn't have any tz related info, but start.tz_localize('UTC')/end.tz_localize('UTC') adds tz info, so the dtype for result is tz aware, and not the same as expected.

Copy link
Member Author

@jschendel jschendel Nov 22, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thinking about it now, I should add a tz aware related test, but will wait until there are other comments on this PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep this is broken

In [3]: pd.interval_range(start=start.tz_localize('UTC'),
   ...:                   end=end.tz_localize('UTC'))
   ...: 
Out[3]: 
IntervalIndex([(2017-01-01, 2017-01-02], (2017-01-02, 2017-01-03], (2017-01-03, 2017-01-04], (2017-01-04, 2017-01-05], (2017-01-05, 2017-01-06] ... (2017-01-10, 2017-01-11], (2017-01-11, 2017-01-12], (2017-01-12, 2017-01-13], (2017-01-13, 2017-01-14], (2017-01-14, 2017-01-15]]
              closed='right',
              dtype='interval[datetime64[ns]]')

In [4]: pd.interval_range(start=start.tz_localize('UTC'),
   ...:                   end=end.tz_localize('UTC')).left
   ...:                   
Out[4]: 
DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06', '2017-01-07', '2017-01-08',
               '2017-01-09', '2017-01-10', '2017-01-11', '2017-01-12',
               '2017-01-13', '2017-01-14'],
              dtype='datetime64[ns]', freq=None)

also need to validate that left/right have the same tz (if constructed that way)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The changes in this commit seem to resolve tz related issues. Not super well versed on the tz aware code, but looks right to me:

In [2]: dt1 = pd.Timestamp('2017-01-01', tz='US/Eastern')

In [3]: dt2 = pd.Timestamp('2017-01-04', tz='US/Eastern')

In [4]: dt2_bad = pd.Timestamp('2017-01-04', tz='UTC')

In [5]: pd.interval_range(dt1, dt2)
Out[5]:
IntervalIndex([(2017-01-01, 2017-01-02], (2017-01-02, 2017-01-03], (2017-01-03, 2017-01-04]]
              closed='right',
              dtype='interval[datetime64[ns, US/Eastern]]')

In [6]: pd.interval_range(dt1, dt2).left
Out[6]:
DatetimeIndex(['2017-01-01 00:00:00-05:00', '2017-01-02 00:00:00-05:00',
               '2017-01-03 00:00:00-05:00'],
              dtype='datetime64[ns, US/Eastern]', freq='D')

In [7]: pd.interval_range(dt1, dt2_bad)
---------------------------------------------------------------------------
AssertionError: Inputs must both have the same timezone, US/Eastern != UTC

During handling of the above exception, another exception occurred:

TypeError: Start and end cannot both be tz-aware with different timezones

I think it was just a matter of making sure the dates were handled properly, and the validation follows from deferring to date_range as part of the creation process.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmmm...the only issue appears to be an inconsistency when one date is tz aware but the other isn't:

In [2]: dt1_tz = pd.Timestamp('2017-01-01', tz='US/Eastern')

In [3]: dt1_no_tz = pd.Timestamp('2017-01-01')

In [4]: dt2_tz = pd.Timestamp('2017-01-04', tz='US/Eastern')

In [5]: dt2_no_tz = pd.Timestamp('2017-01-04')

In [6]: pd.interval_range(dt1_tz, dt2_no_tz)
Out[6]:
IntervalIndex([(2017-01-01, 2017-01-02], (2017-01-02, 2017-01-03], (2017-01-03, 2017-01-04]]
              closed='right',
              dtype='interval[datetime64[ns, US/Eastern]]')

In [7]: pd.interval_range(dt1_no_tz, dt2_tz)
---------------------------------------------------------------------------
AssertionError: Inputs must both have the same timezone, None != US/Eastern

During handling of the above exception, another exception occurred:

TypeError: Start and end cannot both be tz-aware with different timezones

Again, seems to be an underlying issue with date_range and DatetimeIndex though:

In [8]: pd.date_range(dt1_tz, dt2_no_tz)
Out[8]:
DatetimeIndex(['2017-01-01 00:00:00-05:00', '2017-01-02 00:00:00-05:00',
               '2017-01-03 00:00:00-05:00', '2017-01-04 00:00:00-05:00'],
              dtype='datetime64[ns, US/Eastern]', freq='D')

In [9]: pd.DatetimeIndex(start=dt1_tz, end=dt2_no_tz, freq='D')
Out[9]:
DatetimeIndex(['2017-01-01 00:00:00-05:00', '2017-01-02 00:00:00-05:00',
               '2017-01-03 00:00:00-05:00', '2017-01-04 00:00:00-05:00'],
              dtype='datetime64[ns, US/Eastern]', freq='D')

In [10]: pd.date_range(dt1_no_tz, dt2_tz)
---------------------------------------------------------------------------
AssertionError: Inputs must both have the same timezone, None != US/Eastern

During handling of the above exception, another exception occurred:

TypeError: Start and end cannot both be tz-aware with different timezones

In [11]: pd.DatetimeIndex(start=dt1_no_tz, end=dt2_tz, freq='D')
---------------------------------------------------------------------------
AssertionError: Inputs must both have the same timezone, None != US/Eastern

During handling of the above exception, another exception occurred:

TypeError: Start and end cannot both be tz-aware with different timezones

Will open a new issue for this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes


result = pd.interval_range(start=start.asm8, end=end.asm8)
tm.assert_index_equal(result, expected)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexing/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def test_getitem_with_scalar(self):
def test_nonoverlapping_monotonic(self, direction, closed):
tpls = [(0, 1), (2, 3), (4, 5)]
if direction == 'decreasing':
tpls = reversed(tpls)
tpls = tpls[::-1]

idx = IntervalIndex.from_tuples(tpls, closed=closed)
s = Series(list('abc'), idx)
Expand Down