From 4e9c0d1f2156c656df5da4ac3f00190f0da5828b Mon Sep 17 00:00:00 2001 From: jschendel Date: Sat, 19 Aug 2017 10:51:05 -0600 Subject: [PATCH 001/188] CLN: replace %s syntax with .format in pandas.tseries (#17290) --- pandas/tseries/frequencies.py | 38 +++++----- pandas/tseries/holiday.py | 14 ++-- pandas/tseries/offsets.py | 137 +++++++++++++++++++--------------- 3 files changed, 105 insertions(+), 84 deletions(-) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index aa33a3849acb3d..7f34bcaf52926e 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -409,16 +409,17 @@ def _get_freq_str(base, mult=1): need_suffix = ['QS', 'BQ', 'BQS', 'YS', 'AS', 'BY', 'BA', 'BYS', 'BAS'] for __prefix in need_suffix: for _m in tslib._MONTHS: - _offset_to_period_map['%s-%s' % (__prefix, _m)] = \ - _offset_to_period_map[__prefix] + _alias = '{prefix}-{month}'.format(prefix=__prefix, month=_m) + _offset_to_period_map[_alias] = _offset_to_period_map[__prefix] for __prefix in ['A', 'Q']: for _m in tslib._MONTHS: - _alias = '%s-%s' % (__prefix, _m) + _alias = '{prefix}-{month}'.format(prefix=__prefix, month=_m) _offset_to_period_map[_alias] = _alias _days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] for _d in _days: - _offset_to_period_map['W-%s' % _d] = 'W-%s' % _d + _alias = 'W-{day}'.format(day=_d) + _offset_to_period_map[_alias] = _alias def get_period_alias(offset_str): @@ -587,7 +588,7 @@ def _base_and_stride(freqstr): groups = opattern.match(freqstr) if not groups: - raise ValueError("Could not evaluate %s" % freqstr) + raise ValueError("Could not evaluate {freq}".format(freq=freqstr)) stride = groups.group(1) @@ -775,8 +776,8 @@ def infer_freq(index, warn=True): if not (is_datetime64_dtype(values) or is_timedelta64_dtype(values) or values.dtype == object): - raise TypeError("cannot infer freq from a non-convertible " - "dtype on a Series of {0}".format(index.dtype)) + raise TypeError("cannot infer freq from a non-convertible dtype " + "on a Series of {dtype}".format(dtype=index.dtype)) index = values if is_period_arraylike(index): @@ -789,7 +790,7 @@ def infer_freq(index, warn=True): if isinstance(index, pd.Index) and not isinstance(index, pd.DatetimeIndex): if isinstance(index, (pd.Int64Index, pd.Float64Index)): raise TypeError("cannot infer freq from a non-convertible index " - "type {0}".format(type(index))) + "type {type}".format(type=type(index))) index = index.values if not isinstance(index, pd.DatetimeIndex): @@ -956,15 +957,17 @@ def _infer_daily_rule(self): if annual_rule: nyears = self.ydiffs[0] month = _month_aliases[self.rep_stamp.month] - return _maybe_add_count('%s-%s' % (annual_rule, month), nyears) + alias = '{prefix}-{month}'.format(prefix=annual_rule, month=month) + return _maybe_add_count(alias, nyears) quarterly_rule = self._get_quarterly_rule() if quarterly_rule: nquarters = self.mdiffs[0] / 3 mod_dict = {0: 12, 2: 11, 1: 10} month = _month_aliases[mod_dict[self.rep_stamp.month % 3]] - return _maybe_add_count('%s-%s' % (quarterly_rule, month), - nquarters) + alias = '{prefix}-{month}'.format(prefix=quarterly_rule, + month=month) + return _maybe_add_count(alias, nquarters) monthly_rule = self._get_monthly_rule() if monthly_rule: @@ -974,8 +977,8 @@ def _infer_daily_rule(self): days = self.deltas[0] / _ONE_DAY if days % 7 == 0: # Weekly - alias = _weekday_rule_aliases[self.rep_stamp.weekday()] - return _maybe_add_count('W-%s' % alias, days / 7) + day = _weekday_rule_aliases[self.rep_stamp.weekday()] + return 
_maybe_add_count('W-{day}'.format(day=day), days / 7) else: return _maybe_add_count('D', days) @@ -1048,7 +1051,7 @@ def _get_wom_rule(self): week = week_of_months[0] + 1 wd = _weekday_rule_aliases[weekdays[0]] - return 'WOM-%d%s' % (week, wd) + return 'WOM-{week}{weekday}'.format(week=week, weekday=wd) class _TimedeltaFrequencyInferer(_FrequencyInferer): @@ -1058,15 +1061,16 @@ def _infer_daily_rule(self): days = self.deltas[0] / _ONE_DAY if days % 7 == 0: # Weekly - alias = _weekday_rule_aliases[self.rep_stamp.weekday()] - return _maybe_add_count('W-%s' % alias, days / 7) + wd = _weekday_rule_aliases[self.rep_stamp.weekday()] + alias = 'W-{weekday}'.format(weekday=wd) + return _maybe_add_count(alias, days / 7) else: return _maybe_add_count('D', days) def _maybe_add_count(base, count): if count != 1: - return '%d%s' % (count, base) + return '{count}{base}'.format(count=int(count), base=base) else: return base diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 9acb52ebe0e9f2..d8bfa3013f8f79 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -174,16 +174,16 @@ class from pandas.tseries.offsets def __repr__(self): info = '' if self.year is not None: - info += 'year=%s, ' % self.year - info += 'month=%s, day=%s, ' % (self.month, self.day) + info += 'year={year}, '.format(year=self.year) + info += 'month={mon}, day={day}, '.format(mon=self.month, day=self.day) if self.offset is not None: - info += 'offset=%s' % self.offset + info += 'offset={offset}'.format(offset=self.offset) if self.observance is not None: - info += 'observance=%s' % self.observance + info += 'observance={obs}'.format(obs=self.observance) - repr = 'Holiday: %s (%s)' % (self.name, info) + repr = 'Holiday: {name} ({info})'.format(name=self.name, info=info) return repr def dates(self, start_date, end_date, return_name=False): @@ -374,8 +374,8 @@ def holidays(self, start=None, end=None, return_name=False): DatetimeIndex of holidays """ if self.rules is None: - raise Exception('Holiday Calendar %s does not have any ' - 'rules specified' % self.name) + raise Exception('Holiday Calendar {name} does not have any ' + 'rules specified'.format(name=self.name)) if start is None: start = AbstractHolidayCalendar.start_date diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 56ef703e67ca08..29cdda55488965 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -261,10 +261,10 @@ def apply_index(self, i): """ if not type(self) is DateOffset: - raise NotImplementedError("DateOffset subclass %s " + raise NotImplementedError("DateOffset subclass {name} " "does not have a vectorized " - "implementation" - % (self.__class__.__name__,)) + "implementation".format( + name=self.__class__.__name__)) relativedelta_fast = set(['years', 'months', 'weeks', 'days', 'hours', 'minutes', 'seconds', 'microseconds']) @@ -295,10 +295,10 @@ def apply_index(self, i): return i + (self._offset * self.n) else: # relativedelta with other keywords + kwd = set(self.kwds) - relativedelta_fast raise NotImplementedError("DateOffset with relativedelta " - "keyword(s) %s not able to be " - "applied vectorized" % - (set(self.kwds) - relativedelta_fast),) + "keyword(s) {kwd} not able to be " + "applied vectorized".format(kwd=kwd)) def isAnchored(self): return (self.n == 1) @@ -339,19 +339,20 @@ def __repr__(self): if attr not in exclude: attrs.append('='.join((attr, repr(getattr(self, attr))))) + plural = '' if abs(self.n) != 1: plural = 's' - else: - plural = '' - n_str = "" + n_str = '' if 
self.n != 1: - n_str = "%s * " % self.n + n_str = '{n} * '.format(n=self.n) - out = '<%s' % n_str + className + plural + attrs_str = '' if attrs: - out += ': ' + ', '.join(attrs) - out += '>' + attrs_str = ': ' + ', '.join(attrs) + + repr_content = ''.join([n_str, className, plural, attrs_str]) + out = '<{content}>'.format(content=repr_content) return out @property @@ -501,7 +502,7 @@ def freqstr(self): return repr(self) if self.n != 1: - fstr = '%d%s' % (self.n, code) + fstr = '{n}{code}'.format(n=self.n, code=code) else: fstr = code @@ -509,7 +510,7 @@ def freqstr(self): @property def nanos(self): - raise ValueError("{0} is a non-fixed frequency".format(self)) + raise ValueError("{name} is a non-fixed frequency".format(name=self)) class SingleConstructorOffset(DateOffset): @@ -518,7 +519,7 @@ class SingleConstructorOffset(DateOffset): def _from_name(cls, suffix=None): # default _from_name calls cls with no args if suffix: - raise ValueError("Bad freq suffix %s" % suffix) + raise ValueError("Bad freq suffix {suffix}".format(suffix=suffix)) return cls() @@ -531,21 +532,21 @@ class BusinessMixin(object): def __repr__(self): className = getattr(self, '_outputName', self.__class__.__name__) + plural = '' if abs(self.n) != 1: plural = 's' - else: - plural = '' - n_str = "" + n_str = '' if self.n != 1: - n_str = "%s * " % self.n + n_str = '{n} * '.format(n=self.n) - out = '<%s' % n_str + className + plural + self._repr_attrs() + '>' + repr_content = ''.join([n_str, className, plural, self._repr_attrs()]) + out = '<{content}>'.format(content=repr_content) return out def _repr_attrs(self): if self.offset: - attrs = ['offset=%s' % repr(self.offset)] + attrs = ['offset={offset!r}'.format(offset=self.offset)] else: attrs = None out = '' @@ -601,7 +602,7 @@ def freqstr(self): return repr(self) if self.n != 1: - fstr = '%d%s' % (self.n, code) + fstr = '{n}{code}'.format(n=self.n, code=code) else: fstr = code @@ -1109,7 +1110,8 @@ def name(self): if self.isAnchored: return self.rule_code else: - return "%s-%s" % (self.rule_code, _int_to_month[self.n]) + return "{code}-{month}".format(code=self.rule_code, + month=_int_to_month[self.n]) class MonthEnd(MonthOffset): @@ -1176,9 +1178,9 @@ def __init__(self, n=1, day_of_month=None, normalize=False, **kwds): else: self.day_of_month = int(day_of_month) if not self._min_day_of_month <= self.day_of_month <= 27: - raise ValueError('day_of_month must be ' - '{}<=day_of_month<=27, got {}'.format( - self._min_day_of_month, self.day_of_month)) + msg = 'day_of_month must be {min}<=day_of_month<=27, got {day}' + raise ValueError(msg.format(min=self._min_day_of_month, + day=self.day_of_month)) self.n = int(n) self.normalize = normalize self.kwds = kwds @@ -1190,7 +1192,7 @@ def _from_name(cls, suffix=None): @property def rule_code(self): - suffix = '-{}'.format(self.day_of_month) + suffix = '-{day_of_month}'.format(day_of_month=self.day_of_month) return self._prefix + suffix @apply_wraps @@ -1576,8 +1578,8 @@ def __init__(self, n=1, normalize=False, **kwds): if self.weekday is not None: if self.weekday < 0 or self.weekday > 6: - raise ValueError('Day must be 0<=day<=6, got %d' % - self.weekday) + raise ValueError('Day must be 0<=day<=6, got {day}' + .format(day=self.weekday)) self._inc = timedelta(weeks=1) self.kwds = kwds @@ -1630,7 +1632,7 @@ def onOffset(self, dt): def rule_code(self): suffix = '' if self.weekday is not None: - suffix = '-%s' % (_int_to_weekday[self.weekday]) + suffix = '-{weekday}'.format(weekday=_int_to_weekday[self.weekday]) return self._prefix + 
suffix @classmethod @@ -1696,11 +1698,11 @@ def __init__(self, n=1, normalize=False, **kwds): raise ValueError('N cannot be 0') if self.weekday < 0 or self.weekday > 6: - raise ValueError('Day must be 0<=day<=6, got %d' % - self.weekday) + raise ValueError('Day must be 0<=day<=6, got {day}' + .format(day=self.weekday)) if self.week < 0 or self.week > 3: - raise ValueError('Week must be 0<=day<=3, got %d' % - self.week) + raise ValueError('Week must be 0<=week<=3, got {week}' + .format(week=self.week)) self.kwds = kwds @@ -1746,15 +1748,18 @@ def onOffset(self, dt): @property def rule_code(self): - return '%s-%d%s' % (self._prefix, self.week + 1, - _int_to_weekday.get(self.weekday, '')) + weekday = _int_to_weekday.get(self.weekday, '') + return '{prefix}-{week}{weekday}'.format(prefix=self._prefix, + week=self.week + 1, + weekday=weekday) _prefix = 'WOM' @classmethod def _from_name(cls, suffix=None): if not suffix: - raise ValueError("Prefix %r requires a suffix." % (cls._prefix)) + raise ValueError("Prefix {prefix!r} requires a suffix." + .format(prefix=cls._prefix)) # TODO: handle n here... # only one digit weeks (1 --> week 0, 2 --> week 1, etc.) week = int(suffix[0]) - 1 @@ -1789,8 +1794,8 @@ def __init__(self, n=1, normalize=False, **kwds): raise ValueError('N cannot be 0') if self.weekday < 0 or self.weekday > 6: - raise ValueError('Day must be 0<=day<=6, got %d' % - self.weekday) + raise ValueError('Day must be 0<=day<=6, got {day}' + .format(day=self.weekday)) self.kwds = kwds @@ -1829,14 +1834,17 @@ def onOffset(self, dt): @property def rule_code(self): - return '%s-%s' % (self._prefix, _int_to_weekday.get(self.weekday, '')) + weekday = _int_to_weekday.get(self.weekday, '') + return '{prefix}-{weekday}'.format(prefix=self._prefix, + weekday=weekday) _prefix = 'LWOM' @classmethod def _from_name(cls, suffix=None): if not suffix: - raise ValueError("Prefix %r requires a suffix." % (cls._prefix)) + raise ValueError("Prefix {prefix!r} requires a suffix." + .format(prefix=cls._prefix)) # TODO: handle n here... 
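        # e.g. a "LWOM-SAT" alias reaches here with suffix == "SAT", which
        # _weekday_to_int maps to weekday 5 (days are numbered MON=0 .. SUN=6)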
weekday = _weekday_to_int[suffix] return cls(weekday=weekday) @@ -1876,7 +1884,8 @@ def _from_name(cls, suffix=None): @property def rule_code(self): - return '%s-%s' % (self._prefix, _int_to_month[self.startingMonth]) + month = _int_to_month[self.startingMonth] + return '{prefix}-{month}'.format(prefix=self._prefix, month=month) class BQuarterEnd(QuarterOffset): @@ -2045,8 +2054,7 @@ def apply(self, other): @apply_index_wraps def apply_index(self, i): freq_month = 12 if self.startingMonth == 1 else self.startingMonth - 1 - # freq_month = self.startingMonth - freqstr = 'Q-%s' % (_int_to_month[freq_month],) + freqstr = 'Q-{month}'.format(month=_int_to_month[freq_month]) return self._beg_apply_index(i, freqstr) @@ -2071,7 +2079,8 @@ def _from_name(cls, suffix=None): @property def rule_code(self): - return '%s-%s' % (self._prefix, _int_to_month[self.month]) + month = _int_to_month[self.month] + return '{prefix}-{month}'.format(prefix=self._prefix, month=month) class BYearEnd(YearOffset): @@ -2246,7 +2255,7 @@ def _rollf(date): @apply_index_wraps def apply_index(self, i): freq_month = 12 if self.month == 1 else self.month - 1 - freqstr = 'A-%s' % (_int_to_month[freq_month],) + freqstr = 'A-{month}'.format(month=_int_to_month[freq_month]) return self._beg_apply_index(i, freqstr) def onOffset(self, dt): @@ -2312,7 +2321,8 @@ def __init__(self, n=1, normalize=False, **kwds): raise ValueError('N cannot be 0') if self.variation not in ["nearest", "last"]: - raise ValueError('%s is not a valid variation' % self.variation) + raise ValueError('{variation} is not a valid variation' + .format(variation=self.variation)) if self.variation == "nearest": weekday_offset = weekday(self.weekday) @@ -2438,8 +2448,9 @@ def _get_year_end_last(self, dt): @property def rule_code(self): + prefix = self._get_prefix() suffix = self.get_rule_code_suffix() - return "%s-%s" % (self._get_prefix(), suffix) + return "{prefix}-{suffix}".format(prefix=prefix, suffix=suffix) def _get_prefix(self): return self._prefix @@ -2451,9 +2462,11 @@ def _get_suffix_prefix(self): return self._suffix_prefix_last def get_rule_code_suffix(self): - return '%s-%s-%s' % (self._get_suffix_prefix(), - _int_to_month[self.startingMonth], - _int_to_weekday[self.weekday]) + prefix = self._get_suffix_prefix() + month = _int_to_month[self.startingMonth] + weekday = _int_to_weekday[self.weekday] + return '{prefix}-{month}-{weekday}'.format(prefix=prefix, month=month, + weekday=weekday) @classmethod def _parse_suffix(cls, varion_code, startingMonth_code, weekday_code): @@ -2463,7 +2476,7 @@ def _parse_suffix(cls, varion_code, startingMonth_code, weekday_code): variation = "last" else: raise ValueError( - "Unable to parse varion_code: %s" % (varion_code,)) + "Unable to parse varion_code: {code}".format(code=varion_code)) startingMonth = _month_to_int[startingMonth_code] weekday = _weekday_to_int[weekday_code] @@ -2628,8 +2641,9 @@ def onOffset(self, dt): @property def rule_code(self): suffix = self._offset.get_rule_code_suffix() - return "%s-%s" % (self._prefix, - "%s-%d" % (suffix, self.qtr_with_extra_week)) + qtr = self.qtr_with_extra_week + return "{prefix}-{suffix}-{qtr}".format(prefix=self._prefix, + suffix=suffix, qtr=qtr) @classmethod def _from_name(cls, *args): @@ -2712,8 +2726,8 @@ def __add__(self, other): except ApplyTypeError: return NotImplemented except OverflowError: - raise OverflowError("the add operation between {} and {} " - "will overflow".format(self, other)) + raise OverflowError("the add operation between {self} and {other} " + 
"will overflow".format(self=self, other=other)) def __eq__(self, other): if isinstance(other, compat.string_types): @@ -2771,7 +2785,8 @@ def apply(self, other): elif isinstance(other, type(self)): return type(self)(self.n + other.n) - raise ApplyTypeError('Unhandled type: %s' % type(other).__name__) + raise ApplyTypeError('Unhandled type: {type_str}' + .format(type_str=type(other).__name__)) _prefix = 'undefined' @@ -2921,7 +2936,8 @@ def generate_range(start=None, end=None, periods=None, # faster than cur + offset next_date = offset.apply(cur) if next_date <= cur: - raise ValueError('Offset %s did not increment date' % offset) + raise ValueError('Offset {offset} did not increment date' + .format(offset=offset)) cur = next_date else: while cur >= end: @@ -2930,7 +2946,8 @@ def generate_range(start=None, end=None, periods=None, # faster than cur + offset next_date = offset.apply(cur) if next_date >= cur: - raise ValueError('Offset %s did not decrement date' % offset) + raise ValueError('Offset {offset} did not decrement date' + .format(offset=offset)) cur = next_date From ab32c0a3e2033456ede23dbfeffc6adc8c4ea190 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 19 Aug 2017 17:55:34 -0400 Subject: [PATCH 002/188] TST: parameterize consistency tests for rolling/expanding windows (#17292) --- pandas/tests/test_window.py | 403 ++++++++++++++++++------------------ 1 file changed, 203 insertions(+), 200 deletions(-) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 21a9b05d481262..1cc0ad8bb40416 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -2009,6 +2009,15 @@ def no_nans(x): _consistency_data = _create_consistency_data() +def _rolling_consistency_cases(): + for window in [1, 2, 3, 10, 20]: + for min_periods in set([0, 1, 2, 3, 4, window]): + if min_periods and (min_periods > window): + continue + for center in [False, True]: + yield window, min_periods, center + + class TestMomentsConsistency(Base): base_functions = [ (lambda v: Series(v).count(), None, 'count'), @@ -2177,7 +2186,11 @@ def _non_null_values(x): (mean_x * mean_y)) @pytest.mark.slow - def test_ewm_consistency(self): + @pytest.mark.parametrize( + 'min_periods, adjust, ignore_na', product([0, 1, 2, 3, 4], + [True, False], + [False, True])) + def test_ewm_consistency(self, min_periods, adjust, ignore_na): def _weights(s, com, adjust, ignore_na): if isinstance(s, DataFrame): if not len(s.columns): @@ -2231,52 +2244,51 @@ def _ewma(s, com, min_periods, adjust, ignore_na): return result com = 3. 
- for min_periods, adjust, ignore_na in product([0, 1, 2, 3, 4], - [True, False], - [False, True]): - # test consistency between different ewm* moments - self._test_moments_consistency( - min_periods=min_periods, - count=lambda x: x.expanding().count(), - mean=lambda x: x.ewm(com=com, min_periods=min_periods, - adjust=adjust, - ignore_na=ignore_na).mean(), - mock_mean=lambda x: _ewma(x, com=com, - min_periods=min_periods, - adjust=adjust, - ignore_na=ignore_na), - corr=lambda x, y: x.ewm(com=com, min_periods=min_periods, - adjust=adjust, - ignore_na=ignore_na).corr(y), - var_unbiased=lambda x: ( - x.ewm(com=com, min_periods=min_periods, - adjust=adjust, - ignore_na=ignore_na).var(bias=False)), - std_unbiased=lambda x: ( - x.ewm(com=com, min_periods=min_periods, - adjust=adjust, ignore_na=ignore_na) - .std(bias=False)), - cov_unbiased=lambda x, y: ( - x.ewm(com=com, min_periods=min_periods, - adjust=adjust, ignore_na=ignore_na) - .cov(y, bias=False)), - var_biased=lambda x: ( - x.ewm(com=com, min_periods=min_periods, - adjust=adjust, ignore_na=ignore_na) - .var(bias=True)), - std_biased=lambda x: x.ewm(com=com, min_periods=min_periods, - adjust=adjust, - ignore_na=ignore_na).std(bias=True), - cov_biased=lambda x, y: ( - x.ewm(com=com, min_periods=min_periods, - adjust=adjust, ignore_na=ignore_na) - .cov(y, bias=True)), - var_debiasing_factors=lambda x: ( - _variance_debiasing_factors(x, com=com, adjust=adjust, - ignore_na=ignore_na))) + # test consistency between different ewm* moments + self._test_moments_consistency( + min_periods=min_periods, + count=lambda x: x.expanding().count(), + mean=lambda x: x.ewm(com=com, min_periods=min_periods, + adjust=adjust, + ignore_na=ignore_na).mean(), + mock_mean=lambda x: _ewma(x, com=com, + min_periods=min_periods, + adjust=adjust, + ignore_na=ignore_na), + corr=lambda x, y: x.ewm(com=com, min_periods=min_periods, + adjust=adjust, + ignore_na=ignore_na).corr(y), + var_unbiased=lambda x: ( + x.ewm(com=com, min_periods=min_periods, + adjust=adjust, + ignore_na=ignore_na).var(bias=False)), + std_unbiased=lambda x: ( + x.ewm(com=com, min_periods=min_periods, + adjust=adjust, ignore_na=ignore_na) + .std(bias=False)), + cov_unbiased=lambda x, y: ( + x.ewm(com=com, min_periods=min_periods, + adjust=adjust, ignore_na=ignore_na) + .cov(y, bias=False)), + var_biased=lambda x: ( + x.ewm(com=com, min_periods=min_periods, + adjust=adjust, ignore_na=ignore_na) + .var(bias=True)), + std_biased=lambda x: x.ewm(com=com, min_periods=min_periods, + adjust=adjust, + ignore_na=ignore_na).std(bias=True), + cov_biased=lambda x, y: ( + x.ewm(com=com, min_periods=min_periods, + adjust=adjust, ignore_na=ignore_na) + .cov(y, bias=True)), + var_debiasing_factors=lambda x: ( + _variance_debiasing_factors(x, com=com, adjust=adjust, + ignore_na=ignore_na))) @pytest.mark.slow - def test_expanding_consistency(self): + @pytest.mark.parametrize( + 'min_periods', [0, 1, 2, 3, 4]) + def test_expanding_consistency(self, min_periods): # suppress warnings about empty slices, as we are deliberately testing # with empty/0-length Series/DataFrames @@ -2285,72 +2297,72 @@ def test_expanding_consistency(self): message=".*(empty slice|0 for slice).*", category=RuntimeWarning) - for min_periods in [0, 1, 2, 3, 4]: - - # test consistency between different expanding_* moments - self._test_moments_consistency( - min_periods=min_periods, - count=lambda x: x.expanding().count(), - mean=lambda x: x.expanding( - min_periods=min_periods).mean(), - mock_mean=lambda x: x.expanding( - 
min_periods=min_periods).sum() / x.expanding().count(), - corr=lambda x, y: x.expanding( - min_periods=min_periods).corr(y), - var_unbiased=lambda x: x.expanding( - min_periods=min_periods).var(), - std_unbiased=lambda x: x.expanding( - min_periods=min_periods).std(), - cov_unbiased=lambda x, y: x.expanding( - min_periods=min_periods).cov(y), - var_biased=lambda x: x.expanding( - min_periods=min_periods).var(ddof=0), - std_biased=lambda x: x.expanding( - min_periods=min_periods).std(ddof=0), - cov_biased=lambda x, y: x.expanding( - min_periods=min_periods).cov(y, ddof=0), - var_debiasing_factors=lambda x: ( - x.expanding().count() / - (x.expanding().count() - 1.) - .replace(0., np.nan))) - - # test consistency between expanding_xyz() and either (a) - # expanding_apply of Series.xyz(), or (b) expanding_apply of - # np.nanxyz() - for (x, is_constant, no_nans) in self.data: - functions = self.base_functions - - # GH 8269 - if no_nans: - functions = self.base_functions + self.no_nan_functions - for (f, require_min_periods, name) in functions: - expanding_f = getattr( - x.expanding(min_periods=min_periods), name) - - if (require_min_periods and - (min_periods is not None) and - (min_periods < require_min_periods)): - continue - - if name == 'count': - expanding_f_result = expanding_f() - expanding_apply_f_result = x.expanding( - min_periods=0).apply(func=f) + # test consistency between different expanding_* moments + self._test_moments_consistency( + min_periods=min_periods, + count=lambda x: x.expanding().count(), + mean=lambda x: x.expanding( + min_periods=min_periods).mean(), + mock_mean=lambda x: x.expanding( + min_periods=min_periods).sum() / x.expanding().count(), + corr=lambda x, y: x.expanding( + min_periods=min_periods).corr(y), + var_unbiased=lambda x: x.expanding( + min_periods=min_periods).var(), + std_unbiased=lambda x: x.expanding( + min_periods=min_periods).std(), + cov_unbiased=lambda x, y: x.expanding( + min_periods=min_periods).cov(y), + var_biased=lambda x: x.expanding( + min_periods=min_periods).var(ddof=0), + std_biased=lambda x: x.expanding( + min_periods=min_periods).std(ddof=0), + cov_biased=lambda x, y: x.expanding( + min_periods=min_periods).cov(y, ddof=0), + var_debiasing_factors=lambda x: ( + x.expanding().count() / + (x.expanding().count() - 1.) 
+ .replace(0., np.nan))) + + # test consistency between expanding_xyz() and either (a) + # expanding_apply of Series.xyz(), or (b) expanding_apply of + # np.nanxyz() + for (x, is_constant, no_nans) in self.data: + functions = self.base_functions + + # GH 8269 + if no_nans: + functions = self.base_functions + self.no_nan_functions + for (f, require_min_periods, name) in functions: + expanding_f = getattr( + x.expanding(min_periods=min_periods), name) + + if (require_min_periods and + (min_periods is not None) and + (min_periods < require_min_periods)): + continue + + if name == 'count': + expanding_f_result = expanding_f() + expanding_apply_f_result = x.expanding( + min_periods=0).apply(func=f) + else: + if name in ['cov', 'corr']: + expanding_f_result = expanding_f( + pairwise=False) else: - if name in ['cov', 'corr']: - expanding_f_result = expanding_f( - pairwise=False) - else: - expanding_f_result = expanding_f() - expanding_apply_f_result = x.expanding( - min_periods=min_periods).apply(func=f) - - if not tm._incompat_bottleneck_version(name): - assert_equal(expanding_f_result, - expanding_apply_f_result) + expanding_f_result = expanding_f() + expanding_apply_f_result = x.expanding( + min_periods=min_periods).apply(func=f) + + if not tm._incompat_bottleneck_version(name): + assert_equal(expanding_f_result, + expanding_apply_f_result) @pytest.mark.slow - def test_rolling_consistency(self): + @pytest.mark.parametrize( + 'window,min_periods,center', list(_rolling_consistency_cases())) + def test_rolling_consistency(self, window, min_periods, center): # suppress warnings about empty slices, as we are deliberately testing # with empty/0-length Series/DataFrames @@ -2359,100 +2371,91 @@ def test_rolling_consistency(self): message=".*(empty slice|0 for slice).*", category=RuntimeWarning) - def cases(): - for window in [1, 2, 3, 10, 20]: - for min_periods in set([0, 1, 2, 3, 4, window]): - if min_periods and (min_periods > window): - continue - for center in [False, True]: - yield window, min_periods, center - - for window, min_periods, center in cases(): - # test consistency between different rolling_* moments - self._test_moments_consistency( - min_periods=min_periods, - count=lambda x: ( - x.rolling(window=window, center=center) - .count()), - mean=lambda x: ( - x.rolling(window=window, min_periods=min_periods, - center=center).mean()), - mock_mean=lambda x: ( - x.rolling(window=window, - min_periods=min_periods, - center=center).sum() - .divide(x.rolling(window=window, - min_periods=min_periods, - center=center).count())), - corr=lambda x, y: ( - x.rolling(window=window, min_periods=min_periods, - center=center).corr(y)), - - var_unbiased=lambda x: ( - x.rolling(window=window, min_periods=min_periods, - center=center).var()), - - std_unbiased=lambda x: ( - x.rolling(window=window, min_periods=min_periods, - center=center).std()), - - cov_unbiased=lambda x, y: ( - x.rolling(window=window, min_periods=min_periods, - center=center).cov(y)), - - var_biased=lambda x: ( - x.rolling(window=window, min_periods=min_periods, - center=center).var(ddof=0)), - - std_biased=lambda x: ( - x.rolling(window=window, min_periods=min_periods, - center=center).std(ddof=0)), - - cov_biased=lambda x, y: ( - x.rolling(window=window, min_periods=min_periods, - center=center).cov(y, ddof=0)), - var_debiasing_factors=lambda x: ( - x.rolling(window=window, center=center).count() - .divide((x.rolling(window=window, center=center) - .count() - 1.) 
- .replace(0., np.nan)))) - - # test consistency between rolling_xyz() and either (a) - # rolling_apply of Series.xyz(), or (b) rolling_apply of - # np.nanxyz() - for (x, is_constant, no_nans) in self.data: - functions = self.base_functions - - # GH 8269 - if no_nans: - functions = self.base_functions + self.no_nan_functions - for (f, require_min_periods, name) in functions: - rolling_f = getattr( - x.rolling(window=window, center=center, - min_periods=min_periods), name) - - if require_min_periods and ( - min_periods is not None) and ( - min_periods < require_min_periods): - continue + # test consistency between different rolling_* moments + self._test_moments_consistency( + min_periods=min_periods, + count=lambda x: ( + x.rolling(window=window, center=center) + .count()), + mean=lambda x: ( + x.rolling(window=window, min_periods=min_periods, + center=center).mean()), + mock_mean=lambda x: ( + x.rolling(window=window, + min_periods=min_periods, + center=center).sum() + .divide(x.rolling(window=window, + min_periods=min_periods, + center=center).count())), + corr=lambda x, y: ( + x.rolling(window=window, min_periods=min_periods, + center=center).corr(y)), - if name == 'count': - rolling_f_result = rolling_f() - rolling_apply_f_result = x.rolling( - window=window, min_periods=0, - center=center).apply(func=f) + var_unbiased=lambda x: ( + x.rolling(window=window, min_periods=min_periods, + center=center).var()), + + std_unbiased=lambda x: ( + x.rolling(window=window, min_periods=min_periods, + center=center).std()), + + cov_unbiased=lambda x, y: ( + x.rolling(window=window, min_periods=min_periods, + center=center).cov(y)), + + var_biased=lambda x: ( + x.rolling(window=window, min_periods=min_periods, + center=center).var(ddof=0)), + + std_biased=lambda x: ( + x.rolling(window=window, min_periods=min_periods, + center=center).std(ddof=0)), + + cov_biased=lambda x, y: ( + x.rolling(window=window, min_periods=min_periods, + center=center).cov(y, ddof=0)), + var_debiasing_factors=lambda x: ( + x.rolling(window=window, center=center).count() + .divide((x.rolling(window=window, center=center) + .count() - 1.) 
+ .replace(0., np.nan)))) + + # test consistency between rolling_xyz() and either (a) + # rolling_apply of Series.xyz(), or (b) rolling_apply of + # np.nanxyz() + for (x, is_constant, no_nans) in self.data: + functions = self.base_functions + + # GH 8269 + if no_nans: + functions = self.base_functions + self.no_nan_functions + for (f, require_min_periods, name) in functions: + rolling_f = getattr( + x.rolling(window=window, center=center, + min_periods=min_periods), name) + + if require_min_periods and ( + min_periods is not None) and ( + min_periods < require_min_periods): + continue + + if name == 'count': + rolling_f_result = rolling_f() + rolling_apply_f_result = x.rolling( + window=window, min_periods=0, + center=center).apply(func=f) + else: + if name in ['cov', 'corr']: + rolling_f_result = rolling_f( + pairwise=False) else: - if name in ['cov', 'corr']: - rolling_f_result = rolling_f( - pairwise=False) - else: - rolling_f_result = rolling_f() - rolling_apply_f_result = x.rolling( - window=window, min_periods=min_periods, - center=center).apply(func=f) - if not tm._incompat_bottleneck_version(name): - assert_equal(rolling_f_result, - rolling_apply_f_result) + rolling_f_result = rolling_f() + rolling_apply_f_result = x.rolling( + window=window, min_periods=min_periods, + center=center).apply(func=f) + if not tm._incompat_bottleneck_version(name): + assert_equal(rolling_f_result, + rolling_apply_f_result) # binary moments def test_rolling_cov(self): From 3b02e73b856a6f8d53382bf3908f04447bf90e03 Mon Sep 17 00:00:00 2001 From: Thomas A Caswell Date: Sat, 19 Aug 2017 17:59:19 -0400 Subject: [PATCH 003/188] FIX: define `DataFrame.items` for all versions of python (#17214) --- doc/source/whatsnew/v0.21.0.txt | 4 ++++ pandas/core/frame.py | 3 +-- pandas/core/series.py | 3 +-- pandas/tests/frame/test_api.py | 11 ++++++++++- pandas/tests/series/test_api.py | 10 ++++++++++ 5 files changed, 26 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 6008ea5d4cbcd2..c5fe89282bf52d 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -128,6 +128,10 @@ Other Enhancements - :func:`DataFrame.add_prefix` and :func:`DataFrame.add_suffix` now accept strings containing the '%' character. (:issue:`17151`) - `read_*` methods can now infer compression from non-string paths, such as ``pathlib.Path`` objects (:issue:`17206`). - :func:`pd.read_sas()` now recognizes much more of the most frequently used date (datetime) formats in SAS7BDAT files (:issue:`15871`). +- :func:`DataFrame.items` and :func:`Series.items` is now present in both Python 2 and 3 and is lazy in all cases (:issue:`13918`, :issue:`17213`) + + + .. 
_whatsnew_0210.api_breaking: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 467ef52de234e8..b5b3df64d24c0b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -802,8 +802,7 @@ def itertuples(self, index=True, name="Pandas"): # fallback to regular tuples return zip(*arrays) - if compat.PY3: # pragma: no cover - items = iteritems + items = iteritems def __len__(self): """Returns length of info axis, but here we use the index """ diff --git a/pandas/core/series.py b/pandas/core/series.py index c8282450b77a9e..75dc3d6403650c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1110,8 +1110,7 @@ def iteritems(self): """ return zip(iter(self.index), iter(self)) - if compat.PY3: # pragma: no cover - items = iteritems + items = iteritems # ---------------------------------------------------------------------- # Misc public methods diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 53a1b9525a0dd1..a62fcb506a34bc 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -171,7 +171,16 @@ def test_nonzero(self): def test_iteritems(self): df = self.klass([[1, 2, 3], [4, 5, 6]], columns=['a', 'a', 'b']) for k, v in compat.iteritems(df): - assert type(v) == self.klass._constructor_sliced + assert isinstance(v, self.klass._constructor_sliced) + + def test_items(self): + # issue #17213, #13918 + cols = ['a', 'b', 'c'] + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=cols) + for c, (k, v) in zip(cols, df.items()): + assert c == k + assert isinstance(v, Series) + assert (df[k] == v).all() def test_iter(self): assert tm.equalContents(list(self.frame), self.frame.columns) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 8e22dd38030ee2..b7fbe803f8d3b9 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -301,6 +301,16 @@ def test_iteritems(self): # assert is lazy (genrators don't define reverse, lists do) assert not hasattr(self.series.iteritems(), 'reverse') + def test_items(self): + for idx, val in self.series.items(): + assert val == self.series[idx] + + for idx, val in self.ts.items(): + assert val == self.ts[idx] + + # assert is lazy (genrators don't define reverse, lists do) + assert not hasattr(self.series.items(), 'reverse') + def test_raise_on_info(self): s = Series(np.random.randn(10)) with pytest.raises(AttributeError): From 58d872903449b8a29237288ade6227cdb280fe18 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 20 Aug 2017 16:25:43 -0500 Subject: [PATCH 004/188] PERF: Update ASV publish config (#17293) Stricter cutoffs for considering regressions [ci skip] --- asv_bench/asv.conf.json | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 59c05400d06b0b..ced4f2b12445f3 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -117,8 +117,10 @@ // with results. If the commit is `null`, regression detection is // skipped for the matching benchmark. 
// - // "regressions_first_commits": { - // "some_benchmark": "352cdf", // Consider regressions only after this commit - // "another_benchmark": null, // Skip regression detection altogether - // } + "regressions_first_commits": { + "*": "v0.20.0" + }, + "regression_thresholds": { + "*": 0.05 + } } From e14431f897c7c0afd76d627ba933c07c277f8deb Mon Sep 17 00:00:00 2001 From: Yosuke Nakabayashi Date: Mon, 21 Aug 2017 09:50:44 +0200 Subject: [PATCH 005/188] DOC: Expand docstrings for head / tail methods (#16941) --- pandas/core/generic.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5a7f37bba91aa2..d9d75c870b20c1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2978,14 +2978,36 @@ def filter(self, items=None, like=None, regex=None, axis=None): def head(self, n=5): """ - Returns first n rows + Return the first n rows. + + Parameters + ---------- + n : int, default 5 + Number of rows to select. + + Returns + ------- + obj_head : type of caller + The first n rows of the caller object. """ + return self.iloc[:n] def tail(self, n=5): """ - Returns last n rows + Return the last n rows. + + Parameters + ---------- + n : int, default 5 + Number of rows to select. + + Returns + ------- + obj_tail : type of caller + The last n rows of the caller object. """ + if n == 0: return self.iloc[0:0] return self.iloc[-n:] From 8354a1dfa9073eab1b120d39be31103fc29394bb Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 21 Aug 2017 00:56:39 -0700 Subject: [PATCH 006/188] MAINT: Use set literal for unsupported + depr args Initializes unsupported and deprecated argument sets with set literals instead of the set constructor in pandas/io/parsers.py, as the former is slightly faster than the latter. --- pandas/io/parsers.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 05a04f268f72b5..a9821be3fa5e2d 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -487,18 +487,18 @@ def _read(filepath_or_buffer, kwds): 'widths': None, } -_c_unsupported = set(['skipfooter']) -_python_unsupported = set([ +_c_unsupported = {'skipfooter'} +_python_unsupported = { 'low_memory', 'buffer_lines', 'float_precision', -]) -_deprecated_args = set([ +} +_deprecated_args = { 'as_recarray', 'buffer_lines', 'compact_ints', 'use_unsigned', -]) +} def _make_parser_function(name, sep=','): From 91245a758ee32658c66bdecd9556f7054cd99901 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 21 Aug 2017 01:14:50 -0700 Subject: [PATCH 007/188] DOC: Add proper docstring to maybe_convert_indices Patches several spelling errors and expands current doc to a proper doc-string. --- pandas/core/indexing.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 109183827de4e8..929c2346ba5b0d 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1985,9 +1985,31 @@ def get_indexer(_i, _idx): def maybe_convert_indices(indices, n): - """ if we have negative indicies, translate to postive here - if have indicies that are out-of-bounds, raise an IndexError """ + Attempt to convert indices into valid, positive indices. + + If we have negative indices, translate to positive here. + If we have indices that are out-of-bounds, raise an IndexError. + + Parameters + ---------- + indices : array-like + The array of indices that we are to convert. 
+ n : int + The number of elements in the array that we are indexing. + + Returns + ------- + valid_indices : array-like + An array-like of positive indices that correspond to the ones + that were passed in initially to this function. + + Raises + ------ + IndexError : one of the converted indices either exceeded the number + of elements (specified by `n`) OR was still negative. + """ + if isinstance(indices, list): indices = np.array(indices) if len(indices) == 0: From d0d28fec180ee61de17921fe5068ecde95adae8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?agust=C3=ADn=20m=C3=A9ndez?= Date: Mon, 21 Aug 2017 10:27:24 +0200 Subject: [PATCH 008/188] DOC: Improving docstring of take method (#16948) --- pandas/core/generic.py | 67 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d9d75c870b20c1..c83b1073afc8e3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2063,18 +2063,77 @@ def __delitem__(self, key): def take(self, indices, axis=0, convert=True, is_copy=True, **kwargs): """ - Analogous to ndarray.take + Return the elements in the given *positional* indices along an axis. + + This means that we are not indexing according to actual values in + the index attribute of the object. We are indexing according to the + actual position of the element in the object. Parameters ---------- - indices : list / array of ints + indices : array-like + An array of ints indicating which positions to take. axis : int, default 0 - convert : translate neg to pos indices (default) - is_copy : mark the returned frame as a copy + The axis on which to select elements. "0" means that we are + selecting rows, "1" means that we are selecting columns, etc. + convert : bool, default True + Whether to convert negative indices to positive ones, just as with + indexing into Python lists. For example, if `-1` was passed in, + this index would be converted ``n - 1``. + is_copy : bool, default True + Whether to return a copy of the original object or not. + + Examples + -------- + >>> df = pd.DataFrame([('falcon', 'bird', 389.0), + ('parrot', 'bird', 24.0), + ('lion', 'mammal', 80.5), + ('monkey', 'mammal', np.nan)], + columns=('name', 'class', 'max_speed'), + index=[0, 2, 3, 1]) + >>> df + name class max_speed + 0 falcon bird 389.0 + 2 parrot bird 24.0 + 3 lion mammal 80.5 + 1 monkey mammal NaN + + Take elements at positions 0 and 3 along the axis 0 (default). + + Note how the actual indices selected (0 and 1) do not correspond to + our selected indices 0 and 3. That's because we are selecting the 0th + and 3rd rows, not rows whose indices equal 0 and 3. + + >>> df.take([0, 3]) + 0 falcon bird 389.0 + 1 monkey mammal NaN + + Take elements at indices 1 and 2 along the axis 1 (column selection). + + >>> df.take([1, 2], axis=1) + class max_speed + 0 bird 389.0 + 2 bird 24.0 + 3 mammal 80.5 + 1 mammal NaN + + We may take elements using negative integers for positive indices, + starting from the end of the object, just like with Python lists. + + >>> df.take([-1, -2]) + name class max_speed + 1 monkey mammal NaN + 3 lion mammal 80.5 Returns ------- taken : type of caller + An array-like containing the elements taken from the object. 
+ + See Also + -------- + numpy.ndarray.take + numpy.take """ nv.validate_take(tuple(), kwargs) self._consolidate_inplace() From 91c2f1f6acde8e5f571d12716e72327747183247 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 21 Aug 2017 14:39:50 -0500 Subject: [PATCH 009/188] BUG: Fixed regex in asv.conf.json (#17300) In https://github.com/pandas-dev/pandas/pull/17293 I messed up the syntax. I used a glob instead of a regex. According to the docs at http://asv.readthedocs.io/en/latest/asv.conf.json.html#regressions-thresholds we want to use a regex. I've actually manually tested this change and verified that it works. [ci skip] --- asv_bench/asv.conf.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index ced4f2b12445f3..9c333f62810f46 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -118,9 +118,9 @@ // skipped for the matching benchmark. // "regressions_first_commits": { - "*": "v0.20.0" + ".*": "v0.20.0" }, "regression_thresholds": { - "*": 0.05 + ".*": 0.05 } } From eff1f889d26fb47467124b103cb70045f85fdf84 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 21 Aug 2017 16:49:17 -0700 Subject: [PATCH 010/188] Remove unnecessary usage of _TSObject (#17297) --- pandas/_libs/period.pyx | 20 -------------------- pandas/_libs/src/datetime.pxd | 32 -------------------------------- pandas/_libs/tslib.pyx | 35 ++++++----------------------------- 3 files changed, 6 insertions(+), 81 deletions(-) diff --git a/pandas/_libs/period.pyx b/pandas/_libs/period.pyx index e017d863e19075..6ba7ec0270f30a 100644 --- a/pandas/_libs/period.pyx +++ b/pandas/_libs/period.pyx @@ -120,26 +120,6 @@ initialize_daytime_conversion_factor_matrix() # Period logic #---------------------------------------------------------------------- -cdef inline int64_t apply_mult(int64_t period_ord, int64_t mult): - """ - Get freq+multiple ordinal value from corresponding freq-only ordinal value. - For example, 5min ordinal will be 1/5th the 1min ordinal (rounding down to - integer). - """ - if mult == 1: - return period_ord - - return (period_ord - 1) // mult - -cdef inline int64_t remove_mult(int64_t period_ord_w_mult, int64_t mult): - """ - Get freq-only ordinal value from corresponding freq+multiple ordinal. 
- """ - if mult == 1: - return period_ord_w_mult - - return period_ord_w_mult * mult + 1; - @cython.wraparound(False) @cython.boundscheck(False) diff --git a/pandas/_libs/src/datetime.pxd b/pandas/_libs/src/datetime.pxd index 2267c8282ec144..23620e790c1323 100644 --- a/pandas/_libs/src/datetime.pxd +++ b/pandas/_libs/src/datetime.pxd @@ -88,11 +88,6 @@ cdef extern from "datetime/np_datetime.h": int cmp_pandas_datetimestruct(pandas_datetimestruct *a, pandas_datetimestruct *b) - int convert_pydatetime_to_datetimestruct(PyObject *obj, - pandas_datetimestruct *out, - PANDAS_DATETIMEUNIT *out_bestunit, - int apply_tzinfo) - npy_datetime pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr, pandas_datetimestruct *d) nogil void pandas_datetime_to_datetimestruct(npy_datetime val, @@ -112,12 +107,6 @@ cdef extern from "datetime/np_datetime_strings.h": PANDAS_DATETIMEUNIT *out_bestunit, npy_bool *out_special) - int make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, - int local, PANDAS_DATETIMEUNIT base, int tzoffset, - NPY_CASTING casting) - - int get_datetime_iso_8601_strlen(int local, PANDAS_DATETIMEUNIT base) - # int parse_python_string(object obj, pandas_datetimestruct *out) except -1 @@ -152,16 +141,6 @@ cdef inline int _cstring_to_dts(char *val, int length, return result -cdef inline object _datetime64_to_datetime(int64_t val): - cdef pandas_datetimestruct dts - pandas_datetime_to_datetimestruct(val, PANDAS_FR_ns, &dts) - return _dts_to_pydatetime(&dts) - -cdef inline object _dts_to_pydatetime(pandas_datetimestruct *dts): - return PyDateTime_FromDateAndTime(dts.year, dts.month, - dts.day, dts.hour, - dts.min, dts.sec, dts.us) - cdef inline int64_t _pydatetime_to_dts(object val, pandas_datetimestruct *dts): dts.year = PyDateTime_GET_YEAR(val) dts.month = PyDateTime_GET_MONTH(val) @@ -173,17 +152,6 @@ cdef inline int64_t _pydatetime_to_dts(object val, pandas_datetimestruct *dts): dts.ps = dts.as = 0 return pandas_datetimestruct_to_datetime(PANDAS_FR_ns, dts) -cdef inline int64_t _dtlike_to_datetime64(object val, - pandas_datetimestruct *dts): - dts.year = val.year - dts.month = val.month - dts.day = val.day - dts.hour = val.hour - dts.min = val.minute - dts.sec = val.second - dts.us = val.microsecond - dts.ps = dts.as = 0 - return pandas_datetimestruct_to_datetime(PANDAS_FR_ns, dts) cdef inline int64_t _date_to_datetime64(object val, pandas_datetimestruct *dts): diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 32b8c92a50269d..c4a38ec660a4c3 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -705,7 +705,6 @@ class Timestamp(_Timestamp): pandas_datetimestruct dts int64_t value object _tzinfo, result, k, v - _TSObject ts # set to naive if needed _tzinfo = self.tzinfo @@ -1009,10 +1008,6 @@ def unique_deltas(ndarray[int64_t] arr): return result -cdef inline bint _is_multiple(int64_t us, int64_t mult): - return us % mult == 0 - - cdef inline bint _cmp_scalar(int64_t lhs, int64_t rhs, int op) except -1: if op == Py_EQ: return lhs == rhs @@ -4694,7 +4689,6 @@ def get_date_field(ndarray[int64_t] dtindex, object field): field and return an array of these values. """ cdef: - _TSObject ts Py_ssize_t i, count = 0 ndarray[int32_t] out ndarray[int32_t, ndim=2] _month_offset @@ -4876,7 +4870,6 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, (defined by frequency). 
""" cdef: - _TSObject ts Py_ssize_t i int count = 0 bint is_business = 0 @@ -4925,9 +4918,8 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, pandas_datetime_to_datetimestruct( dtindex[i], PANDAS_FR_ns, &dts) - ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) dom = dts.day - dow = ts_dayofweek(ts) + dow = dayofweek(dts.year, dts.month, dts.day) if (dom == 1 and dow < 5) or (dom <= 3 and dow == 0): out[i] = 1 @@ -4951,13 +4943,12 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, pandas_datetime_to_datetimestruct( dtindex[i], PANDAS_FR_ns, &dts) - ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) isleap = is_leapyear(dts.year) mo_off = _month_offset[isleap, dts.month - 1] dom = dts.day doy = mo_off + dom ldom = _month_offset[isleap, dts.month] - dow = ts_dayofweek(ts) + dow = dayofweek(dts.year, dts.month, dts.day) if (ldom == doy and dow < 5) or ( dow == 4 and (ldom - doy <= 2)): @@ -4986,9 +4977,8 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, pandas_datetime_to_datetimestruct( dtindex[i], PANDAS_FR_ns, &dts) - ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) dom = dts.day - dow = ts_dayofweek(ts) + dow = dayofweek(dts.year, dts.month, dts.day) if ((dts.month - start_month) % 3 == 0) and ( (dom == 1 and dow < 5) or (dom <= 3 and dow == 0)): @@ -5013,13 +5003,12 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, pandas_datetime_to_datetimestruct( dtindex[i], PANDAS_FR_ns, &dts) - ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) isleap = is_leapyear(dts.year) mo_off = _month_offset[isleap, dts.month - 1] dom = dts.day doy = mo_off + dom ldom = _month_offset[isleap, dts.month] - dow = ts_dayofweek(ts) + dow = dayofweek(dts.year, dts.month, dts.day) if ((dts.month - end_month) % 3 == 0) and ( (ldom == doy and dow < 5) or ( @@ -5049,9 +5038,8 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, pandas_datetime_to_datetimestruct( dtindex[i], PANDAS_FR_ns, &dts) - ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) dom = dts.day - dow = ts_dayofweek(ts) + dow = dayofweek(dts.year, dts.month, dts.day) if (dts.month == start_month) and ( (dom == 1 and dow < 5) or (dom <= 3 and dow == 0)): @@ -5076,12 +5064,11 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, pandas_datetime_to_datetimestruct( dtindex[i], PANDAS_FR_ns, &dts) - ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) isleap = is_leapyear(dts.year) dom = dts.day mo_off = _month_offset[isleap, dts.month - 1] doy = mo_off + dom - dow = ts_dayofweek(ts) + dow = dayofweek(dts.year, dts.month, dts.day) ldom = _month_offset[isleap, dts.month] if (dts.month == end_month) and ( @@ -5095,7 +5082,6 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, pandas_datetime_to_datetimestruct( dtindex[i], PANDAS_FR_ns, &dts) - ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) isleap = is_leapyear(dts.year) mo_off = _month_offset[isleap, dts.month - 1] dom = dts.day @@ -5117,7 +5103,6 @@ def get_date_name_field(ndarray[int64_t] dtindex, object field): name based on requested field (e.g. 
weekday_name) """ cdef: - _TSObject ts Py_ssize_t i, count = 0 ndarray[object] out pandas_datetimestruct dts @@ -5143,10 +5128,6 @@ def get_date_name_field(ndarray[int64_t] dtindex, object field): raise ValueError("Field %s not supported" % field) -cdef inline int m8_weekday(int64_t val): - ts = convert_to_tsobject(val, None, None, 0, 0) - return ts_dayofweek(ts) - cdef int64_t DAY_NS = 86400000000000LL @@ -5156,11 +5137,9 @@ def date_normalize(ndarray[int64_t] stamps, tz=None): cdef: Py_ssize_t i, n = len(stamps) pandas_datetimestruct dts - _TSObject tso ndarray[int64_t] result = np.empty(n, dtype=np.int64) if tz is not None: - tso = _TSObject() tz = maybe_get_tz(tz) result = _normalize_local(stamps, tz) else: @@ -5305,8 +5284,6 @@ def monthrange(int64_t year, int64_t month): return (dayofweek(year, month, 1), days) -cdef inline int64_t ts_dayofweek(_TSObject ts): - return dayofweek(ts.dts.year, ts.dts.month, ts.dts.day) cdef inline int days_in_month(pandas_datetimestruct dts) nogil: return days_per_month_table[is_leapyear(dts.year)][dts.month -1] From 910207ffe518413e84cfa95d772cb66d57a0d08e Mon Sep 17 00:00:00 2001 From: Michael Gasvoda Date: Mon, 21 Aug 2017 19:51:18 -0400 Subject: [PATCH 011/188] BUG: clip should handle null values closes #17276 Author: Michael Gasvoda Author: mgasvoda Closes #17288 from mgasvoda/master and squashes the following commits: a1dbdf293 [mgasvoda] Merge branch 'master' into master 9333952c2 [Michael Gasvoda] Checking output of tests 4e0464eaf [Michael Gasvoda] fixing whatsnew text c44204080 [Michael Gasvoda] formatting fixes 7e2367879 [Michael Gasvoda] formatting updates 781ea724a [Michael Gasvoda] whatsnew entry d9627fe4c [Michael Gasvoda] adding clip tests 9aa0159e9 [Michael Gasvoda] Treating na values as none for clips --- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/core/generic.py | 12 ++++++++---- pandas/tests/frame/test_analytics.py | 26 ++++++++++---------------- pandas/tests/series/test_analytics.py | 11 +++++++++++ 4 files changed, 30 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index c5fe89282bf52d..0d2c52c70b345e 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -132,7 +132,6 @@ Other Enhancements - .. _whatsnew_0210.api_breaking: Backwards incompatible API changes @@ -384,6 +383,7 @@ Reshaping Numeric ^^^^^^^ - Bug in ``.clip()`` with ``axis=1`` and a list-like for ``threshold`` is passed; previously this raised ``ValueError`` (:issue:`15390`) +- :func:`Series.clip()` and :func:`DataFrame.clip()` now treat NA values for upper and lower arguments as ``None`` instead of raising ``ValueError`` (:issue:`17276`). 
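A minimal sketch of the behavior this entry describes, mirroring the tests
added in this patch (illustrative only; ``s`` is an arbitrary example Series):

    import numpy as np
    import pandas as pd

    s = pd.Series([1, 2, 3])
    s.clip(np.nan)                      # NaN threshold treated as None -> unchanged
    s.clip(upper=[1, 1, np.nan])        # any NaN in a list-like bound disables that bound
    s.clip(lower=np.nan, upper=np.nan)  # both bounds ignored -> unchanged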
Categorical diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c83b1073afc8e3..5c9e1f22ddd200 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4741,9 +4741,6 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): if axis is not None: axis = self._get_axis_number(axis) - if np.any(isna(threshold)): - raise ValueError("Cannot use an NA value as a clip threshold") - # method is self.le for upper bound and self.ge for lower bound if is_scalar(threshold) and is_number(threshold): if method.__name__ == 'le': @@ -4823,6 +4820,14 @@ def clip(self, lower=None, upper=None, axis=None, inplace=False, axis = nv.validate_clip_with_axis(axis, args, kwargs) + # GH 17276 + # numpy doesn't like NaN as a clip value + # so ignore + if np.any(pd.isnull(lower)): + lower = None + if np.any(pd.isnull(upper)): + upper = None + # GH 2747 (arguments were reversed) if lower is not None and upper is not None: if is_scalar(lower) and is_scalar(upper): @@ -4839,7 +4844,6 @@ def clip(self, lower=None, upper=None, axis=None, inplace=False, if upper is not None: if inplace: result = self - result = result.clip_upper(upper, axis, inplace=inplace) return result diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 484a09f11b58a7..93514a8a422151 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1931,22 +1931,16 @@ def test_clip_against_frame(self, axis): tm.assert_frame_equal(clipped_df[ub_mask], ub[ub_mask]) tm.assert_frame_equal(clipped_df[mask], df[mask]) - def test_clip_na(self): - msg = "Cannot use an NA" - with tm.assert_raises_regex(ValueError, msg): - self.frame.clip(lower=np.nan) - - with tm.assert_raises_regex(ValueError, msg): - self.frame.clip(lower=[np.nan]) - - with tm.assert_raises_regex(ValueError, msg): - self.frame.clip(upper=np.nan) - - with tm.assert_raises_regex(ValueError, msg): - self.frame.clip(upper=[np.nan]) - - with tm.assert_raises_regex(ValueError, msg): - self.frame.clip(lower=np.nan, upper=np.nan) + def test_clip_with_na_args(self): + """Should process np.nan argument as None """ + # GH # 17276 + tm.assert_frame_equal(self.frame.clip(np.nan), self.frame) + tm.assert_frame_equal(self.frame.clip(upper=[1, 2, np.nan]), + self.frame) + tm.assert_frame_equal(self.frame.clip(lower=[1, np.nan, 3]), + self.frame) + tm.assert_frame_equal(self.frame.clip(upper=np.nan, lower=np.nan), + self.frame) # Matrix-like diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 44da0968d70243..f1d044f7a11325 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1000,6 +1000,17 @@ def test_clip_types_and_nulls(self): assert list(isna(s)) == list(isna(l)) assert list(isna(s)) == list(isna(u)) + def test_clip_with_na_args(self): + """Should process np.nan argument as None """ + # GH # 17276 + s = Series([1, 2, 3]) + + assert_series_equal(s.clip(np.nan), Series([1, 2, 3])) + assert_series_equal(s.clip(upper=[1, 1, np.nan]), Series([1, 2, 3])) + assert_series_equal(s.clip(lower=[1, np.nan, 1]), Series([1, 2, 3])) + assert_series_equal(s.clip(upper=np.nan, lower=np.nan), + Series([1, 2, 3])) + def test_clip_against_series(self): # GH #6966 From a4c4edeb2a7e5c84b5a82a9743a12a4b66e7bcf1 Mon Sep 17 00:00:00 2001 From: ante328 Date: Tue, 22 Aug 2017 01:55:10 +0200 Subject: [PATCH 012/188] BUG: fillna returns frame when inplace=True if value is a dict (#16156) (#17279) --- doc/source/whatsnew/v0.21.0.txt | 2 
+- pandas/core/generic.py | 3 ++- pandas/tests/frame/test_missing.py | 3 +++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 0d2c52c70b345e..dd06114f6abd31 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -318,7 +318,7 @@ Conversion - Fix :func:`DataFrame.memory_usage` to support PyPy. Objects on PyPy do not have a fixed size, so an approximation is used instead (:issue:`17228`) - Fixed the return type of ``IntervalIndex.is_non_overlapping_monotonic`` to be a Python ``bool`` for consistency with similar attributes/methods. Previously returned a ``numpy.bool_``. (:issue:`17237`) - Bug in ``IntervalIndex.is_non_overlapping_monotonic`` when intervals are closed on both sides and overlap at a point (:issue:`16560`) - +- Bug in :func:`Series.fillna` returns frame when ``inplace=True`` and ``value`` is dict (:issue:`16156`) Indexing ^^^^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5c9e1f22ddd200..e84e4eac3f34d6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4135,7 +4135,8 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, continue obj = result[k] obj.fillna(v, limit=limit, inplace=True, downcast=downcast) - return result + return result if not inplace else None + elif not is_list_like(value): new_data = self._data.fillna(value=value, limit=limit, inplace=inplace, diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 77f0357685cab8..ebd15b3180a337 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -407,6 +407,9 @@ def test_fillna_inplace(self): df.fillna(value=0, inplace=True) tm.assert_frame_equal(df, expected) + expected = df.fillna(value={0: 0}, inplace=True) + assert expected is None + df[1][:4] = np.nan df[3][-4:] = np.nan expected = df.fillna(method='ffill') From 2f00159da32c85c3b30b433f78a43e47677711a3 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Tue, 22 Aug 2017 10:11:10 +0200 Subject: [PATCH 013/188] CLN: Index.append() refactoring (#16236) --- pandas/core/dtypes/concat.py | 48 ++++++++++++++++++++++- pandas/core/indexes/base.py | 11 +++--- pandas/core/indexes/category.py | 6 ++- pandas/core/indexes/datetimelike.py | 2 +- pandas/core/indexes/interval.py | 4 +- pandas/core/indexes/range.py | 59 ++--------------------------- 6 files changed, 63 insertions(+), 67 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 292d5f608d4cb2..0ce45eea119ed2 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -19,7 +19,7 @@ _TD_DTYPE) from pandas.core.dtypes.generic import ( ABCDatetimeIndex, ABCTimedeltaIndex, - ABCPeriodIndex) + ABCPeriodIndex, ABCRangeIndex) def get_dtype_kinds(l): @@ -41,6 +41,8 @@ def get_dtype_kinds(l): typ = 'category' elif is_sparse(arr): typ = 'sparse' + elif isinstance(arr, ABCRangeIndex): + typ = 'range' elif is_datetimetz(arr): # if to_concat contains different tz, # the result must be object dtype @@ -559,3 +561,47 @@ def convert_sparse(x, axis): # coerce to object if needed result = result.astype('object') return result + + +def _concat_rangeindex_same_dtype(indexes): + """ + Concatenates multiple RangeIndex instances. All members of "indexes" must + be of type RangeIndex; result will be RangeIndex if possible, Int64Index + otherwise. 
E.g.: + indexes = [RangeIndex(3), RangeIndex(3, 6)] -> RangeIndex(6) + indexes = [RangeIndex(3), RangeIndex(4, 6)] -> Int64Index([0,1,2,4,5]) + """ + + start = step = next = None + + for obj in indexes: + if not len(obj): + continue + + if start is None: + # This is set by the first non-empty index + start = obj._start + if step is None and len(obj) > 1: + step = obj._step + elif step is None: + # First non-empty index had only one element + if obj._start == start: + return _concat_index_asobject(indexes) + step = obj._start - start + + non_consecutive = ((step != obj._step and len(obj) > 1) or + (next is not None and obj._start != next)) + if non_consecutive: + # Int64Index._append_same_dtype([ix.astype(int) for ix in indexes]) + # would be preferred... but it currently resorts to + # _concat_index_asobject anyway. + return _concat_index_asobject(indexes) + + if step is not None: + next = obj[-1] + step + + if start is None: + start = obj._start + step = obj._step + stop = obj._stop if next is None else next + return indexes[0].__class__(start, stop, step) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index de6221987a59aa..a21e6df3ffc93d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1745,18 +1745,17 @@ def append(self, other): names = set([obj.name for obj in to_concat]) name = None if len(names) > 1 else self.name - if self.is_categorical(): - # if calling index is category, don't check dtype of others - from pandas.core.indexes.category import CategoricalIndex - return CategoricalIndex._append_same_dtype(self, to_concat, name) + return self._concat(to_concat, name) + + def _concat(self, to_concat, name): typs = _concat.get_dtype_kinds(to_concat) if len(typs) == 1: - return self._append_same_dtype(to_concat, name=name) + return self._concat_same_dtype(to_concat, name=name) return _concat._concat_index_asobject(to_concat, name=name) - def _append_same_dtype(self, to_concat, name): + def _concat_same_dtype(self, to_concat, name): """ Concatenate to_concat which has the same class """ diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index ac4698b570d172..f22407308e0944 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -633,7 +633,11 @@ def insert(self, loc, item): codes = np.concatenate((codes[:loc], code, codes[loc:])) return self._create_from_codes(codes) - def _append_same_dtype(self, to_concat, name): + def _concat(self, to_concat, name): + # if calling index is category, don't check dtype of others + return CategoricalIndex._concat_same_dtype(self, to_concat, name) + + def _concat_same_dtype(self, to_concat, name): """ Concatenate to_concat which has the same class ValueError if other is not in the categories diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 845c71b6c41d8b..c3232627fce74c 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -837,7 +837,7 @@ def summary(self, name=None): result = result.replace("'", "") return result - def _append_same_dtype(self, to_concat, name): + def _concat_same_dtype(self, to_concat, name): """ Concatenate to_concat which has the same class """ diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index e90378184e3f3e..e0ed6c7ea35c0c 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -876,7 +876,7 @@ def _as_like_interval_index(self, other, error_msg): raise ValueError(error_msg) 
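# ---------------------------------------------------------------------
# Editor's illustration (not part of the patch): this refactor routes
# Index.append() through _concat_same_dtype, so the RangeIndex fast
# path now lives in _concat_rangeindex_same_dtype. A minimal sketch of
# the intended behavior, assuming pandas >= 0.21:
#
#     import pandas as pd
#
#     # consecutive pieces stay a RangeIndex
#     pd.RangeIndex(3).append(pd.RangeIndex(3, 6))
#     # -> RangeIndex(start=0, stop=6, step=1)
#
#     # a gap forces the Int64Index fallback
#     pd.RangeIndex(3).append(pd.RangeIndex(4, 6))
#     # -> Int64Index([0, 1, 2, 4, 5], dtype='int64')
# ---------------------------------------------------------------------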
return other - def _append_same_dtype(self, to_concat, name): + def _concat_same_dtype(self, to_concat, name): """ assert that we all have the same .closed we allow a 0-len index here as well @@ -885,7 +885,7 @@ def _append_same_dtype(self, to_concat, name): msg = ('can only append two IntervalIndex objects ' 'that are closed on the same side') raise ValueError(msg) - return super(IntervalIndex, self)._append_same_dtype(to_concat, name) + return super(IntervalIndex, self)._concat_same_dtype(to_concat, name) @Appender(_index_shared_docs['take'] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index ac4cc6986cace2..82412d3a7ef57a 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -14,6 +14,7 @@ from pandas.compat.numpy import function as nv from pandas.core.indexes.base import Index, _index_shared_docs from pandas.util._decorators import Appender, cache_readonly +import pandas.core.dtypes.concat as _concat import pandas.core.indexes.base as ibase from pandas.core.indexes.numeric import Int64Index @@ -447,62 +448,8 @@ def join(self, other, how='left', level=None, return_indexers=False, return super(RangeIndex, self).join(other, how, level, return_indexers, sort) - def append(self, other): - """ - Append a collection of Index options together - - Parameters - ---------- - other : Index or list/tuple of indices - - Returns - ------- - appended : RangeIndex if all indexes are consecutive RangeIndexes, - otherwise Int64Index or Index - """ - - to_concat = [self] - - if isinstance(other, (list, tuple)): - to_concat = to_concat + list(other) - else: - to_concat.append(other) - - if not all([isinstance(i, RangeIndex) for i in to_concat]): - return super(RangeIndex, self).append(other) - - start = step = next = None - - for obj in to_concat: - if not len(obj): - continue - - if start is None: - # This is set by the first non-empty index - start = obj._start - if step is None and len(obj) > 1: - step = obj._step - elif step is None: - # First non-empty index had only one element - if obj._start == start: - return super(RangeIndex, self).append(other) - step = obj._start - start - - non_consecutive = ((step != obj._step and len(obj) > 1) or - (next is not None and obj._start != next)) - if non_consecutive: - return super(RangeIndex, self).append(other) - - if step is not None: - next = obj[-1] + step - - if start is None: - start = obj._start - step = obj._step - stop = obj._stop if next is None else next - names = set([obj.name for obj in to_concat]) - name = None if len(names) > 1 else self.name - return RangeIndex(start, stop, step, name=name) + def _concat_same_dtype(self, indexes, name): + return _concat._concat_rangeindex_same_dtype(indexes).rename(name) def __len__(self): """ From 870b6a6d6415c76d051b287adcb180ac3020b6e8 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 22 Aug 2017 05:50:57 -0400 Subject: [PATCH 014/188] DEPS: set min versions (#17002) closes #15206, numpy >= 1.9 closes #15543, matplotlib >= 1.4.3 scipy >= 0.14.0 --- .travis.yml | 6 +- ci/install_travis.sh | 2 +- ci/requirements-2.7_COMPAT.build | 2 +- ci/requirements-2.7_COMPAT.run | 9 +- ci/requirements-2.7_LOCALE.build | 2 +- ci/requirements-2.7_LOCALE.run | 5 +- ci/requirements-2.7_SLOW.build | 2 +- ci/requirements-2.7_SLOW.run | 4 +- ci/script_multi.sh | 6 + ci/script_single.sh | 8 + doc/source/install.rst | 6 +- doc/source/whatsnew/v0.21.0.txt | 22 ++- pandas/_libs/sparse.pyx | 2 - 
pandas/compat/numpy/__init__.py | 14 +- pandas/core/algorithms.py | 7 +- pandas/core/generic.py | 5 +- pandas/core/groupby.py | 8 +- pandas/core/internals.py | 16 +- pandas/tests/frame/test_quantile.py | 42 ----- pandas/tests/frame/test_rank.py | 12 +- .../tests/indexes/datetimes/test_datetime.py | 8 +- pandas/tests/indexes/period/test_indexing.py | 34 ++-- .../indexes/timedeltas/test_timedelta.py | 8 +- pandas/tests/plotting/common.py | 3 +- pandas/tests/plotting/test_datetimelike.py | 2 + pandas/tests/plotting/test_frame.py | 163 ++++++++++-------- pandas/tests/plotting/test_misc.py | 45 +---- pandas/tests/plotting/test_series.py | 12 ++ pandas/tests/series/test_operators.py | 16 +- pandas/tests/series/test_quantile.py | 27 +-- pandas/tests/series/test_rank.py | 9 +- pandas/tests/sparse/test_array.py | 7 +- pandas/tests/test_nanops.py | 18 +- pandas/tests/test_resample.py | 2 +- pandas/tests/tools/test_numeric.py | 5 +- setup.py | 2 +- 36 files changed, 221 insertions(+), 320 deletions(-) diff --git a/.travis.yml b/.travis.yml index 897d31cf23a3b8..034e2a32bb75c7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -37,7 +37,7 @@ matrix: - JOB="3.5_OSX" TEST_ARGS="--skip-slow --skip-network" - dist: trusty env: - - JOB="2.7_LOCALE" TEST_ARGS="--only-slow --skip-network" LOCALE_OVERRIDE="zh_CN.UTF-8" + - JOB="2.7_LOCALE" LOCALE_OVERRIDE="zh_CN.UTF-8" SLOW=true addons: apt: packages: @@ -62,7 +62,7 @@ matrix: # In allow_failures - dist: trusty env: - - JOB="2.7_SLOW" TEST_ARGS="--only-slow --skip-network" + - JOB="2.7_SLOW" SLOW=true # In allow_failures - dist: trusty env: @@ -82,7 +82,7 @@ matrix: allow_failures: - dist: trusty env: - - JOB="2.7_SLOW" TEST_ARGS="--only-slow --skip-network" + - JOB="2.7_SLOW" SLOW=true - dist: trusty env: - JOB="2.7_BUILD_TEST" TEST_ARGS="--skip-slow" BUILD_TEST=true diff --git a/ci/install_travis.sh b/ci/install_travis.sh index ad8f0bdd8a5977..d26689f2e6b4bd 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -47,7 +47,7 @@ which conda echo echo "[update conda]" conda config --set ssl_verify false || exit 1 -conda config --set always_yes true --set changeps1 false || exit 1 +conda config --set quiet true --set always_yes true --set changeps1 false || exit 1 conda update -q conda echo diff --git a/ci/requirements-2.7_COMPAT.build b/ci/requirements-2.7_COMPAT.build index 0e1ccf9eac9bf1..d9c932daa110ba 100644 --- a/ci/requirements-2.7_COMPAT.build +++ b/ci/requirements-2.7_COMPAT.build @@ -1,5 +1,5 @@ python=2.7* -numpy=1.7.1 +numpy=1.9.2 cython=0.23 dateutil=1.5 pytz=2013b diff --git a/ci/requirements-2.7_COMPAT.run b/ci/requirements-2.7_COMPAT.run index b94f4ab7b27d1a..39bf7201407333 100644 --- a/ci/requirements-2.7_COMPAT.run +++ b/ci/requirements-2.7_COMPAT.run @@ -1,11 +1,12 @@ -numpy=1.7.1 +numpy=1.9.2 dateutil=1.5 pytz=2013b -scipy=0.11.0 +scipy=0.14.0 xlwt=0.7.5 xlrd=0.9.2 -numexpr=2.2.2 -pytables=3.0.0 +bottleneck=1.0.0 +numexpr=2.4.4 # we test that we correctly don't use an unsupported numexpr +pytables=3.2.2 psycopg2 pymysql=0.6.0 sqlalchemy=0.7.8 diff --git a/ci/requirements-2.7_LOCALE.build b/ci/requirements-2.7_LOCALE.build index 4a37ce8fbe1613..96cb184ec2665e 100644 --- a/ci/requirements-2.7_LOCALE.build +++ b/ci/requirements-2.7_LOCALE.build @@ -1,5 +1,5 @@ python=2.7* python-dateutil pytz=2013b -numpy=1.8.2 +numpy=1.9.2 cython=0.23 diff --git a/ci/requirements-2.7_LOCALE.run b/ci/requirements-2.7_LOCALE.run index 8e360cf74b081c..00006106f7009c 100644 --- a/ci/requirements-2.7_LOCALE.run +++ b/ci/requirements-2.7_LOCALE.run @@ -1,11 
+1,12 @@ python-dateutil pytz=2013b -numpy=1.8.2 +numpy=1.9.2 xlwt=0.7.5 openpyxl=1.6.2 xlsxwriter=0.5.2 xlrd=0.9.2 -matplotlib=1.3.1 +bottleneck=1.0.0 +matplotlib=1.4.3 sqlalchemy=0.8.1 lxml=3.2.1 scipy diff --git a/ci/requirements-2.7_SLOW.build b/ci/requirements-2.7_SLOW.build index 0f4a2c6792e6b1..a665ab9edd5850 100644 --- a/ci/requirements-2.7_SLOW.build +++ b/ci/requirements-2.7_SLOW.build @@ -1,5 +1,5 @@ python=2.7* python-dateutil pytz -numpy=1.8.2 +numpy=1.10* cython diff --git a/ci/requirements-2.7_SLOW.run b/ci/requirements-2.7_SLOW.run index 0a549554f5219e..f7708283ad04a0 100644 --- a/ci/requirements-2.7_SLOW.run +++ b/ci/requirements-2.7_SLOW.run @@ -1,7 +1,7 @@ python-dateutil pytz -numpy=1.8.2 -matplotlib=1.3.1 +numpy=1.10* +matplotlib=1.4.3 scipy patsy xlwt diff --git a/ci/script_multi.sh b/ci/script_multi.sh index d79fc43fbe175a..ee9fbcaad5ef5f 100755 --- a/ci/script_multi.sh +++ b/ci/script_multi.sh @@ -36,9 +36,15 @@ elif [ "$COVERAGE" ]; then echo pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml $TEST_ARGS pandas pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml $TEST_ARGS pandas +elif [ "$SLOW" ]; then + TEST_ARGS="--only-slow --skip-network" + echo pytest -r xX -m "not single and slow" -v --junitxml=/tmp/multiple.xml $TEST_ARGS pandas + pytest -r xX -m "not single and slow" -v --junitxml=/tmp/multiple.xml $TEST_ARGS pandas + else echo pytest -n 2 -r xX -m "not single" --junitxml=/tmp/multiple.xml $TEST_ARGS pandas pytest -n 2 -r xX -m "not single" --junitxml=/tmp/multiple.xml $TEST_ARGS pandas # TODO: doctest + fi RET="$?" diff --git a/ci/script_single.sh b/ci/script_single.sh index 245b4e6152c4d9..375e9879e950fd 100755 --- a/ci/script_single.sh +++ b/ci/script_single.sh @@ -12,16 +12,24 @@ if [ -n "$LOCALE_OVERRIDE" ]; then python -c "$pycmd" fi +if [ "$SLOW" ]; then + TEST_ARGS="--only-slow --skip-network" +fi + if [ "$BUILD_TEST" ]; then echo "We are not running pytest as this is a build test." + elif [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" + elif [ "$COVERAGE" ]; then echo pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas + else echo pytest -m "single" -r xX --junitxml=/tmp/single.xml $TEST_ARGS pandas pytest -m "single" -r xX --junitxml=/tmp/single.xml $TEST_ARGS pandas # TODO: doctest + fi RET="$?" diff --git a/doc/source/install.rst b/doc/source/install.rst index 99d299b75b59b2..f92c43839ee317 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -203,7 +203,7 @@ Dependencies ------------ * `setuptools `__ -* `NumPy `__: 1.7.1 or higher +* `NumPy `__: 1.9.0 or higher * `python-dateutil `__: 1.5 or higher * `pytz `__: Needed for time zone support @@ -233,7 +233,7 @@ Optional Dependencies * `Cython `__: Only necessary to build development version. Version 0.23 or higher. -* `SciPy `__: miscellaneous statistical functions +* `SciPy `__: miscellaneous statistical functions, Version 0.14.0 or higher * `xarray `__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended. * `PyTables `__: necessary for HDF5-based storage. Version 3.0.0 or higher required, Version 3.2.1 or higher highly recommended. 
* `Feather Format `__: necessary for feather-based storage, version 0.3.1 or higher.
@@ -244,7 +244,7 @@ Optional Dependencies
 * `pymysql `__: for MySQL.
 * `SQLite `__: for SQLite, this is included in Python's standard library by default.
-* `matplotlib `__: for plotting
+* `matplotlib `__: for plotting, Version 1.4.3 or higher.
 * For Excel I/O:
 * `xlrd/xlwt `__: Excel reading (xlrd) and writing (xlwt)
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index dd06114f6abd31..148fd0a8324021 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -137,6 +137,27 @@ Other Enhancements
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. _whatsnew_0210.api_breaking.deps:
+
+Dependencies have increased minimum versions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+We have updated our minimum supported versions of dependencies (:issue:`15206`, :issue:`15543`, :issue:`15214`).
+If installed, we now require:
+
+   +--------------+-----------------+----------+
+   | Package      | Minimum Version | Required |
+   +==============+=================+==========+
+   | Numpy        | 1.9.0           |    X     |
+   +--------------+-----------------+----------+
+   | Matplotlib   | 1.4.3           |          |
+   +--------------+-----------------+----------+
+   | Scipy        | 0.14.0          |          |
+   +--------------+-----------------+----------+
+   | Bottleneck   | 1.0.0           |          |
+   +--------------+-----------------+----------+
+
 .. _whatsnew_0210.api_breaking.pandas_eval:

 Improved error handling during item assignment in pd.eval
@@ -258,7 +279,6 @@ Other API Changes
 ^^^^^^^^^^^^^^^^^
 - Support has been dropped for Python 3.4 (:issue:`15251`)
-- Support has been dropped for bottleneck < 1.0.0 (:issue:`15214`)
 - The Categorical constructor no longer accepts a scalar for the ``categories`` keyword.
(:issue:`16022`) - Accessing a non-existent attribute on a closed :class:`~pandas.HDFStore` will now raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`) diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index 0c2e056ead7fac..1cc7f5ace95ea5 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -12,8 +12,6 @@ from distutils.version import LooseVersion # numpy versioning _np_version = np.version.short_version -_np_version_under1p8 = LooseVersion(_np_version) < '1.8' -_np_version_under1p9 = LooseVersion(_np_version) < '1.9' _np_version_under1p10 = LooseVersion(_np_version) < '1.10' _np_version_under1p11 = LooseVersion(_np_version) < '1.11' diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 2c5a18973afa8f..5112957b498751 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -9,19 +9,18 @@ # numpy versioning _np_version = np.__version__ _nlv = LooseVersion(_np_version) -_np_version_under1p8 = _nlv < '1.8' -_np_version_under1p9 = _nlv < '1.9' _np_version_under1p10 = _nlv < '1.10' _np_version_under1p11 = _nlv < '1.11' _np_version_under1p12 = _nlv < '1.12' _np_version_under1p13 = _nlv < '1.13' _np_version_under1p14 = _nlv < '1.14' +_np_version_under1p15 = _nlv < '1.15' -if _nlv < '1.7.0': +if _nlv < '1.9': raise ImportError('this version of pandas is incompatible with ' - 'numpy < 1.7.0\n' + 'numpy < 1.9.0\n' 'your numpy version is {0}.\n' - 'Please upgrade numpy to >= 1.7.0 to use ' + 'Please upgrade numpy to >= 1.9.0 to use ' 'this pandas version'.format(_np_version)) @@ -70,11 +69,10 @@ def np_array_datetime64_compat(arr, *args, **kwargs): __all__ = ['np', - '_np_version_under1p8', - '_np_version_under1p9', '_np_version_under1p10', '_np_version_under1p11', '_np_version_under1p12', '_np_version_under1p13', - '_np_version_under1p14' + '_np_version_under1p14', + '_np_version_under1p15' ] diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index f2359f3ff1a9db..ffd03096e2a27f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -6,7 +6,6 @@ from warnings import warn, catch_warnings import numpy as np -from pandas import compat, _np_version_under1p8 from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.generic import ( ABCSeries, ABCIndex, @@ -407,14 +406,12 @@ def isin(comps, values): comps, dtype, _ = _ensure_data(comps) values, _, _ = _ensure_data(values, dtype=dtype) - # GH11232 - # work-around for numpy < 1.8 and comparisions on py3 # faster for larger cases to use np.in1d f = lambda x, y: htable.ismember_object(x, values) + # GH16012 # Ensure np.in1d doesn't get object types or it *may* throw an exception - if ((_np_version_under1p8 and compat.PY3) or len(comps) > 1000000 and - not is_object_dtype(comps)): + if len(comps) > 1000000 and not is_object_dtype(comps): f = lambda x, y: np.in1d(x, y) elif is_integer_dtype(comps): try: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e84e4eac3f34d6..f8366c804e3e79 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1827,11 +1827,8 @@ def _box_item_values(self, key, values): def _maybe_cache_changed(self, item, value): """The object has called back to us saying maybe it has changed. 
- - numpy < 1.8 has an issue with object arrays and aliasing - GH6026 """ - self._data.set(item, value, check=pd._np_version_under1p8) + self._data.set(item, value, check=False) @property def _is_cached(self): diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index a388892e925b63..aa7c4517c0a016 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -13,7 +13,7 @@ ) from pandas import compat -from pandas.compat.numpy import function as nv, _np_version_under1p8 +from pandas.compat.numpy import function as nv from pandas.compat import set_function_name from pandas.core.dtypes.common import ( @@ -3257,11 +3257,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, d = np.diff(np.r_[idx, len(ids)]) if dropna: m = ids[lab == -1] - if _np_version_under1p8: - mi, ml = algorithms.factorize(m) - d[ml] = d[ml] - np.bincount(mi) - else: - np.add.at(d, m, -1) + np.add.at(d, m, -1) acc = rep(d)[mask] else: acc = rep(d) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index b616270e47aa6e..83b382ec0ed723 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -69,8 +69,7 @@ import pandas.core.computation.expressions as expressions from pandas.util._decorators import cache_readonly from pandas.util._validators import validate_bool_kwarg - -from pandas import compat, _np_version_under1p9 +from pandas import compat from pandas.compat import range, map, zip, u @@ -857,9 +856,6 @@ def _is_empty_indexer(indexer): # set else: - if _np_version_under1p9: - # Work around GH 6168 to support old numpy - indexer = getattr(indexer, 'values', indexer) values[indexer] = value # coerce and try to infer the dtypes of the result @@ -1482,15 +1478,7 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): tuple of (axis, block) """ - if _np_version_under1p9: - if interpolation != 'linear': - raise ValueError("Interpolation methods other than linear " - "are not supported in numpy < 1.9.") - - kw = {} - if not _np_version_under1p9: - kw.update({'interpolation': interpolation}) - + kw = {'interpolation': interpolation} values = self.get_values() values, _, _, _ = self._try_coerce_args(values, values) diff --git a/pandas/tests/frame/test_quantile.py b/pandas/tests/frame/test_quantile.py index 2482e493dbefdc..2f264874378bce 100644 --- a/pandas/tests/frame/test_quantile.py +++ b/pandas/tests/frame/test_quantile.py @@ -12,7 +12,6 @@ from pandas.util.testing import assert_series_equal, assert_frame_equal import pandas.util.testing as tm -from pandas import _np_version_under1p9 from pandas.tests.frame.common import TestData @@ -103,9 +102,6 @@ def test_quantile_axis_parameter(self): def test_quantile_interpolation(self): # see gh-10174 - if _np_version_under1p9: - pytest.skip("Numpy version under 1.9") - from numpy import percentile # interpolation = linear (default case) @@ -166,44 +162,6 @@ def test_quantile_interpolation(self): index=[.25, .5], columns=['a', 'b', 'c']) assert_frame_equal(result, expected) - def test_quantile_interpolation_np_lt_1p9(self): - # see gh-10174 - if not _np_version_under1p9: - pytest.skip("Numpy version is greater than 1.9") - - from numpy import percentile - - # interpolation = linear (default case) - q = self.tsframe.quantile(0.1, axis=0, interpolation='linear') - assert q['A'] == percentile(self.tsframe['A'], 10) - q = self.intframe.quantile(0.1) - assert q['A'] == percentile(self.intframe['A'], 10) - - # test with and without interpolation keyword - q1 = self.intframe.quantile(0.1) - assert q1['A'] == 
np.percentile(self.intframe['A'], 10) - assert_series_equal(q, q1) - - # interpolation method other than default linear - msg = "Interpolation methods other than linear" - df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) - with tm.assert_raises_regex(ValueError, msg): - df.quantile(.5, axis=1, interpolation='nearest') - - with tm.assert_raises_regex(ValueError, msg): - df.quantile([.5, .75], axis=1, interpolation='lower') - - # test degenerate case - df = DataFrame({'x': [], 'y': []}) - with tm.assert_raises_regex(ValueError, msg): - q = df.quantile(0.1, axis=0, interpolation='higher') - - # multi - df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], - columns=['a', 'b', 'c']) - with tm.assert_raises_regex(ValueError, msg): - df.quantile([.25, .5], interpolation='midpoint') - def test_quantile_multi(self): df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=['a', 'b', 'c']) diff --git a/pandas/tests/frame/test_rank.py b/pandas/tests/frame/test_rank.py index acf887d047c9e6..58f4d9b770173c 100644 --- a/pandas/tests/frame/test_rank.py +++ b/pandas/tests/frame/test_rank.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import pytest from datetime import timedelta, datetime from distutils.version import LooseVersion from numpy import nan @@ -26,8 +27,7 @@ class TestRank(TestData): } def test_rank(self): - tm._skip_if_no_scipy() - from scipy.stats import rankdata + rankdata = pytest.importorskip('scipy.stats.rankdata') self.frame['A'][::2] = np.nan self.frame['B'][::3] = np.nan @@ -120,8 +120,7 @@ def test_rank2(self): tm.assert_frame_equal(df.rank(), exp) def test_rank_na_option(self): - tm._skip_if_no_scipy() - from scipy.stats import rankdata + rankdata = pytest.importorskip('scipy.stats.rankdata') self.frame['A'][::2] = np.nan self.frame['B'][::3] = np.nan @@ -193,10 +192,9 @@ def test_rank_axis(self): tm.assert_frame_equal(df.rank(axis=1), df.rank(axis='columns')) def test_rank_methods_frame(self): - tm.skip_if_no_package('scipy', min_version='0.13', - app='scipy.stats.rankdata') + pytest.importorskip('scipy.stats.special') + rankdata = pytest.importorskip('scipy.stats.rankdata') import scipy - from scipy.stats import rankdata xs = np.random.randint(0, 21, (100, 26)) xs = (xs - 10.0) / 10.0 diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index f99dcee9e5c8ab..47f53f53cfd021 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -9,7 +9,7 @@ from pandas.compat import lrange from pandas.compat.numpy import np_datetime64_compat from pandas import (DatetimeIndex, Index, date_range, Series, DataFrame, - Timestamp, datetime, offsets, _np_version_under1p8) + Timestamp, datetime, offsets) from pandas.util.testing import assert_series_equal, assert_almost_equal @@ -276,11 +276,7 @@ def test_comparisons_nat(self): np_datetime64_compat('2014-06-01 00:00Z'), np_datetime64_compat('2014-07-01 00:00Z')]) - if _np_version_under1p8: - # cannot test array because np.datetime('nat') returns today's date - cases = [(fidx1, fidx2), (didx1, didx2)] - else: - cases = [(fidx1, fidx2), (didx1, didx2), (didx1, darr)] + cases = [(fidx1, fidx2), (didx1, didx2), (didx1, darr)] # Check pd.NaT is handles as the same as np.nan with tm.assert_produces_warning(None): diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index d4dac1cf88fffb..efc13a56cd77e4 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ 
b/pandas/tests/indexes/period/test_indexing.py @@ -8,7 +8,7 @@ from pandas.compat import lrange from pandas._libs import tslib from pandas import (PeriodIndex, Series, DatetimeIndex, - period_range, Period, _np_version_under1p9) + period_range, Period) class TestGetItem(object): @@ -149,16 +149,12 @@ def test_getitem_seconds(self): values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H', '2013/02/01 09:00'] for v in values: - if _np_version_under1p9: - with pytest.raises(ValueError): - idx[v] - else: - # GH7116 - # these show deprecations as we are trying - # to slice with non-integer indexers - # with pytest.raises(IndexError): - # idx[v] - continue + # GH7116 + # these show deprecations as we are trying + # to slice with non-integer indexers + # with pytest.raises(IndexError): + # idx[v] + continue s = Series(np.random.rand(len(idx)), index=idx) tm.assert_series_equal(s['2013/01/01 10:00'], s[3600:3660]) @@ -178,16 +174,12 @@ def test_getitem_day(self): '2013/02/01 09:00'] for v in values: - if _np_version_under1p9: - with pytest.raises(ValueError): - idx[v] - else: - # GH7116 - # these show deprecations as we are trying - # to slice with non-integer indexers - # with pytest.raises(IndexError): - # idx[v] - continue + # GH7116 + # these show deprecations as we are trying + # to slice with non-integer indexers + # with pytest.raises(IndexError): + # idx[v] + continue s = Series(np.random.rand(len(idx)), index=idx) tm.assert_series_equal(s['2013/01'], s[0:31]) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 59e4b1432b8bc1..0b3bd0b03bccfd 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -7,7 +7,7 @@ import pandas.util.testing as tm from pandas import (timedelta_range, date_range, Series, Timedelta, DatetimeIndex, TimedeltaIndex, Index, DataFrame, - Int64Index, _np_version_under1p8) + Int64Index) from pandas.util.testing import (assert_almost_equal, assert_series_equal, assert_index_equal) @@ -379,11 +379,7 @@ def test_comparisons_nat(self): np.timedelta64(1, 'D') + np.timedelta64(2, 's'), np.timedelta64(5, 'D') + np.timedelta64(3, 's')]) - if _np_version_under1p8: - # cannot test array because np.datetime('nat') returns today's date - cases = [(tdidx1, tdidx2)] - else: - cases = [(tdidx1, tdidx2), (tdidx1, tdarr)] + cases = [(tdidx1, tdidx2), (tdidx1, tdarr)] # Check pd.NaT is handles as the same as np.nan for idx1, idx2 in cases: diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index 3ab443b223f207..dfab539e9474c5 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -39,7 +39,8 @@ def _ok_for_gaussian_kde(kind): from scipy.stats import gaussian_kde # noqa except ImportError: return False - return True + + return plotting._compat._mpl_ge_1_5_0() class TestPlotBase(object): diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index e9c7d806fd65df..cff0c1c0b424e5 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -610,6 +610,8 @@ def test_secondary_y_ts(self): @pytest.mark.slow def test_secondary_kde(self): + if not self.mpl_ge_1_5_0: + pytest.skip("mpl is not supported") tm._skip_if_no_scipy() _skip_if_no_scipy_gaussian_kde() diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 6d813ac76cc4e2..67098529a01119 100644 --- 
a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -473,7 +473,6 @@ def test_subplots_multiple_axes(self): # TestDataFrameGroupByPlots.test_grouped_box_multiple_axes fig, axes = self.plt.subplots(2, 2) with warnings.catch_warnings(): - warnings.simplefilter('ignore') df = DataFrame(np.random.rand(10, 4), index=list(string.ascii_letters[:10])) @@ -1290,6 +1289,9 @@ def test_boxplot_subplots_return_type(self): def test_kde_df(self): tm._skip_if_no_scipy() _skip_if_no_scipy_gaussian_kde() + if not self.mpl_ge_1_5_0: + pytest.skip("mpl is not supported") + df = DataFrame(randn(100, 4)) ax = _check_plot_works(df.plot, kind='kde') expected = [pprint_thing(c) for c in df.columns] @@ -1311,6 +1313,9 @@ def test_kde_df(self): def test_kde_missing_vals(self): tm._skip_if_no_scipy() _skip_if_no_scipy_gaussian_kde() + if not self.mpl_ge_1_5_0: + pytest.skip("mpl is not supported") + df = DataFrame(np.random.uniform(size=(100, 4))) df.loc[0, 0] = np.nan _check_plot_works(df.plot, kind='kde') @@ -1835,6 +1840,8 @@ def test_hist_colors(self): def test_kde_colors(self): tm._skip_if_no_scipy() _skip_if_no_scipy_gaussian_kde() + if not self.mpl_ge_1_5_0: + pytest.skip("mpl is not supported") from matplotlib import cm @@ -1858,6 +1865,8 @@ def test_kde_colors(self): def test_kde_colors_and_styles_subplots(self): tm._skip_if_no_scipy() _skip_if_no_scipy_gaussian_kde() + if not self.mpl_ge_1_5_0: + pytest.skip("mpl is not supported") from matplotlib import cm default_colors = self._maybe_unpack_cycler(self.plt.rcParams) @@ -2160,71 +2169,74 @@ def test_pie_df_nan(self): @pytest.mark.slow def test_errorbar_plot(self): - d = {'x': np.arange(12), 'y': np.arange(12, 0, -1)} - df = DataFrame(d) - d_err = {'x': np.ones(12) * 0.2, 'y': np.ones(12) * 0.4} - df_err = DataFrame(d_err) - - # check line plots - ax = _check_plot_works(df.plot, yerr=df_err, logy=True) - self._check_has_errorbars(ax, xerr=0, yerr=2) - ax = _check_plot_works(df.plot, yerr=df_err, logx=True, logy=True) - self._check_has_errorbars(ax, xerr=0, yerr=2) - ax = _check_plot_works(df.plot, yerr=df_err, loglog=True) - self._check_has_errorbars(ax, xerr=0, yerr=2) + with warnings.catch_warnings(): + d = {'x': np.arange(12), 'y': np.arange(12, 0, -1)} + df = DataFrame(d) + d_err = {'x': np.ones(12) * 0.2, 'y': np.ones(12) * 0.4} + df_err = DataFrame(d_err) - kinds = ['line', 'bar', 'barh'] - for kind in kinds: - ax = _check_plot_works(df.plot, yerr=df_err['x'], kind=kind) + # check line plots + ax = _check_plot_works(df.plot, yerr=df_err, logy=True) self._check_has_errorbars(ax, xerr=0, yerr=2) - ax = _check_plot_works(df.plot, yerr=d_err, kind=kind) + ax = _check_plot_works(df.plot, yerr=df_err, logx=True, logy=True) self._check_has_errorbars(ax, xerr=0, yerr=2) - ax = _check_plot_works(df.plot, yerr=df_err, xerr=df_err, - kind=kind) - self._check_has_errorbars(ax, xerr=2, yerr=2) - ax = _check_plot_works(df.plot, yerr=df_err['x'], xerr=df_err['x'], - kind=kind) - self._check_has_errorbars(ax, xerr=2, yerr=2) - ax = _check_plot_works(df.plot, xerr=0.2, yerr=0.2, kind=kind) - self._check_has_errorbars(ax, xerr=2, yerr=2) - # _check_plot_works adds an ax so catch warning. 
see GH #13188 - with tm.assert_produces_warning(UserWarning): + ax = _check_plot_works(df.plot, yerr=df_err, loglog=True) + self._check_has_errorbars(ax, xerr=0, yerr=2) + + kinds = ['line', 'bar', 'barh'] + for kind in kinds: + ax = _check_plot_works(df.plot, yerr=df_err['x'], kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=2) + ax = _check_plot_works(df.plot, yerr=d_err, kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=2) + ax = _check_plot_works(df.plot, yerr=df_err, xerr=df_err, + kind=kind) + self._check_has_errorbars(ax, xerr=2, yerr=2) + ax = _check_plot_works(df.plot, yerr=df_err['x'], + xerr=df_err['x'], + kind=kind) + self._check_has_errorbars(ax, xerr=2, yerr=2) + ax = _check_plot_works(df.plot, xerr=0.2, yerr=0.2, kind=kind) + self._check_has_errorbars(ax, xerr=2, yerr=2) + + # _check_plot_works adds an ax so catch warning. see GH #13188 axes = _check_plot_works(df.plot, yerr=df_err, xerr=df_err, subplots=True, kind=kind) - self._check_has_errorbars(axes, xerr=1, yerr=1) - - ax = _check_plot_works((df + 1).plot, yerr=df_err, - xerr=df_err, kind='bar', log=True) - self._check_has_errorbars(ax, xerr=2, yerr=2) + self._check_has_errorbars(axes, xerr=1, yerr=1) - # yerr is raw error values - ax = _check_plot_works(df['y'].plot, yerr=np.ones(12) * 0.4) - self._check_has_errorbars(ax, xerr=0, yerr=1) - ax = _check_plot_works(df.plot, yerr=np.ones((2, 12)) * 0.4) - self._check_has_errorbars(ax, xerr=0, yerr=2) + ax = _check_plot_works((df + 1).plot, yerr=df_err, + xerr=df_err, kind='bar', log=True) + self._check_has_errorbars(ax, xerr=2, yerr=2) - # yerr is iterator - import itertools - ax = _check_plot_works(df.plot, yerr=itertools.repeat(0.1, len(df))) - self._check_has_errorbars(ax, xerr=0, yerr=2) + # yerr is raw error values + ax = _check_plot_works(df['y'].plot, yerr=np.ones(12) * 0.4) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(df.plot, yerr=np.ones((2, 12)) * 0.4) + self._check_has_errorbars(ax, xerr=0, yerr=2) - # yerr is column name - for yerr in ['yerr', u('誤差')]: - s_df = df.copy() - s_df[yerr] = np.ones(12) * 0.2 - ax = _check_plot_works(s_df.plot, yerr=yerr) + # yerr is iterator + import itertools + ax = _check_plot_works(df.plot, + yerr=itertools.repeat(0.1, len(df))) self._check_has_errorbars(ax, xerr=0, yerr=2) - ax = _check_plot_works(s_df.plot, y='y', x='x', yerr=yerr) - self._check_has_errorbars(ax, xerr=0, yerr=1) - with pytest.raises(ValueError): - df.plot(yerr=np.random.randn(11)) + # yerr is column name + for yerr in ['yerr', u('誤差')]: + s_df = df.copy() + s_df[yerr] = np.ones(12) * 0.2 + ax = _check_plot_works(s_df.plot, yerr=yerr) + self._check_has_errorbars(ax, xerr=0, yerr=2) + ax = _check_plot_works(s_df.plot, y='y', x='x', yerr=yerr) + self._check_has_errorbars(ax, xerr=0, yerr=1) - df_err = DataFrame({'x': ['zzz'] * 12, 'y': ['zzz'] * 12}) - with pytest.raises((ValueError, TypeError)): - df.plot(yerr=df_err) + with pytest.raises(ValueError): + df.plot(yerr=np.random.randn(11)) + + df_err = DataFrame({'x': ['zzz'] * 12, 'y': ['zzz'] * 12}) + with pytest.raises((ValueError, TypeError)): + df.plot(yerr=df_err) @pytest.mark.slow def test_errorbar_with_integer_column_names(self): @@ -2262,33 +2274,34 @@ def test_errorbar_with_partial_columns(self): @pytest.mark.slow def test_errorbar_timeseries(self): - d = {'x': np.arange(12), 'y': np.arange(12, 0, -1)} - d_err = {'x': np.ones(12) * 0.2, 'y': np.ones(12) * 0.4} + with warnings.catch_warnings(): + d = {'x': np.arange(12), 'y': np.arange(12, 0, -1)} + d_err = {'x': 
np.ones(12) * 0.2, 'y': np.ones(12) * 0.4} - # check time-series plots - ix = date_range('1/1/2000', '1/1/2001', freq='M') - tdf = DataFrame(d, index=ix) - tdf_err = DataFrame(d_err, index=ix) + # check time-series plots + ix = date_range('1/1/2000', '1/1/2001', freq='M') + tdf = DataFrame(d, index=ix) + tdf_err = DataFrame(d_err, index=ix) - kinds = ['line', 'bar', 'barh'] - for kind in kinds: - ax = _check_plot_works(tdf.plot, yerr=tdf_err, kind=kind) - self._check_has_errorbars(ax, xerr=0, yerr=2) - ax = _check_plot_works(tdf.plot, yerr=d_err, kind=kind) - self._check_has_errorbars(ax, xerr=0, yerr=2) - ax = _check_plot_works(tdf.plot, y='y', yerr=tdf_err['x'], - kind=kind) - self._check_has_errorbars(ax, xerr=0, yerr=1) - ax = _check_plot_works(tdf.plot, y='y', yerr='x', kind=kind) - self._check_has_errorbars(ax, xerr=0, yerr=1) - ax = _check_plot_works(tdf.plot, yerr=tdf_err, kind=kind) - self._check_has_errorbars(ax, xerr=0, yerr=2) - # _check_plot_works adds an ax so catch warning. see GH #13188 - with tm.assert_produces_warning(UserWarning): + kinds = ['line', 'bar', 'barh'] + for kind in kinds: + ax = _check_plot_works(tdf.plot, yerr=tdf_err, kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=2) + ax = _check_plot_works(tdf.plot, yerr=d_err, kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=2) + ax = _check_plot_works(tdf.plot, y='y', yerr=tdf_err['x'], + kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(tdf.plot, y='y', yerr='x', kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(tdf.plot, yerr=tdf_err, kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=2) + + # _check_plot_works adds an ax so catch warning. see GH #13188 axes = _check_plot_works(tdf.plot, kind=kind, yerr=tdf_err, subplots=True) - self._check_has_errorbars(axes, xerr=0, yerr=1) + self._check_has_errorbars(axes, xerr=0, yerr=1) def test_errorbar_asymmetrical(self): diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 684a943fb5a69f..c4795ea1e1eca6 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -4,7 +4,7 @@ import pytest -from pandas import Series, DataFrame +from pandas import DataFrame from pandas.compat import lmap import pandas.util.testing as tm @@ -13,8 +13,7 @@ from numpy.random import randn import pandas.plotting as plotting -from pandas.tests.plotting.common import (TestPlotBase, _check_plot_works, - _ok_for_gaussian_kde) +from pandas.tests.plotting.common import TestPlotBase, _check_plot_works tm._skip_if_no_mpl() @@ -52,46 +51,6 @@ def test_bootstrap_plot(self): class TestDataFramePlots(TestPlotBase): - @pytest.mark.slow - def test_scatter_plot_legacy(self): - tm._skip_if_no_scipy() - - df = DataFrame(randn(100, 2)) - - def scat(**kwds): - return plotting.scatter_matrix(df, **kwds) - - with tm.assert_produces_warning(UserWarning): - _check_plot_works(scat) - with tm.assert_produces_warning(UserWarning): - _check_plot_works(scat, marker='+') - with tm.assert_produces_warning(UserWarning): - _check_plot_works(scat, vmin=0) - if _ok_for_gaussian_kde('kde'): - with tm.assert_produces_warning(UserWarning): - _check_plot_works(scat, diagonal='kde') - if _ok_for_gaussian_kde('density'): - with tm.assert_produces_warning(UserWarning): - _check_plot_works(scat, diagonal='density') - with tm.assert_produces_warning(UserWarning): - _check_plot_works(scat, diagonal='hist') - with tm.assert_produces_warning(UserWarning): - _check_plot_works(scat, 
range_padding=.1) - with tm.assert_produces_warning(UserWarning): - _check_plot_works(scat, color='rgb') - with tm.assert_produces_warning(UserWarning): - _check_plot_works(scat, c='rgb') - with tm.assert_produces_warning(UserWarning): - _check_plot_works(scat, facecolor='rgb') - - def scat2(x, y, by=None, ax=None, figsize=None): - return plotting._core.scatter_plot(df, x, y, by, ax, figsize=None) - - _check_plot_works(scat2, x=0, y=1) - grouper = Series(np.repeat([1, 2, 3, 4, 5], 20), df.index) - with tm.assert_produces_warning(UserWarning): - _check_plot_works(scat2, x=0, y=1, by=grouper) - def test_scatter_matrix_axis(self): tm._skip_if_no_scipy() scatter_matrix = plotting.scatter_matrix diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 9c9011ba1ca7b2..8164ad74a190a7 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -571,6 +571,9 @@ def test_plot_fails_with_dupe_color_and_style(self): @pytest.mark.slow def test_hist_kde(self): + if not self.mpl_ge_1_5_0: + pytest.skip("mpl is not supported") + _, ax = self.plt.subplots() ax = self.ts.plot.hist(logy=True, ax=ax) self._check_ax_scales(ax, yaxis='log') @@ -596,6 +599,9 @@ def test_hist_kde(self): def test_kde_kwargs(self): tm._skip_if_no_scipy() _skip_if_no_scipy_gaussian_kde() + if not self.mpl_ge_1_5_0: + pytest.skip("mpl is not supported") + from numpy import linspace _check_plot_works(self.ts.plot.kde, bw_method=.5, ind=linspace(-100, 100, 20)) @@ -611,6 +617,9 @@ def test_kde_kwargs(self): def test_kde_missing_vals(self): tm._skip_if_no_scipy() _skip_if_no_scipy_gaussian_kde() + if not self.mpl_ge_1_5_0: + pytest.skip("mpl is not supported") + s = Series(np.random.uniform(size=50)) s[0] = np.nan axes = _check_plot_works(s.plot.kde) @@ -638,6 +647,9 @@ def test_hist_kwargs(self): @pytest.mark.slow def test_hist_kde_color(self): + if not self.mpl_ge_1_5_0: + pytest.skip("mpl is not supported") + _, ax = self.plt.subplots() ax = self.ts.plot.hist(logy=True, bins=10, color='b', ax=ax) self._check_ax_scales(ax, yaxis='log') diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 4888f8fe996b63..114a055de81953 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -14,8 +14,7 @@ import pandas as pd from pandas import (Index, Series, DataFrame, isna, bdate_range, - NaT, date_range, timedelta_range, - _np_version_under1p8) + NaT, date_range, timedelta_range) from pandas.core.indexes.datetimes import Timestamp from pandas.core.indexes.timedeltas import Timedelta import pandas.core.nanops as nanops @@ -687,14 +686,13 @@ def run_ops(ops, get_ser, test_ser): assert_series_equal(result, exp) # odd numpy behavior with scalar timedeltas - if not _np_version_under1p8: - result = td1[0] + dt1 - exp = (dt1.dt.tz_localize(None) + td1[0]).dt.tz_localize(tz) - assert_series_equal(result, exp) + result = td1[0] + dt1 + exp = (dt1.dt.tz_localize(None) + td1[0]).dt.tz_localize(tz) + assert_series_equal(result, exp) - result = td2[0] + dt2 - exp = (dt2.dt.tz_localize(None) + td2[0]).dt.tz_localize(tz) - assert_series_equal(result, exp) + result = td2[0] + dt2 + exp = (dt2.dt.tz_localize(None) + td2[0]).dt.tz_localize(tz) + assert_series_equal(result, exp) result = dt1 - td1[0] exp = (dt1.dt.tz_localize(None) - td1[0]).dt.tz_localize(tz) diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py index 21379641a78d86..cf5e3fe4f29b06 100644 --- 
a/pandas/tests/series/test_quantile.py +++ b/pandas/tests/series/test_quantile.py @@ -1,11 +1,10 @@ # coding=utf-8 # pylint: disable-msg=E1101,W0612 -import pytest import numpy as np import pandas as pd -from pandas import (Index, Series, _np_version_under1p9) +from pandas import Index, Series from pandas.core.indexes.datetimes import Timestamp from pandas.core.dtypes.common import is_integer import pandas.util.testing as tm @@ -68,8 +67,6 @@ def test_quantile_multi(self): [], dtype=float)) tm.assert_series_equal(result, expected) - @pytest.mark.skipif(_np_version_under1p9, - reason="Numpy version is under 1.9") def test_quantile_interpolation(self): # see gh-10174 @@ -82,8 +79,6 @@ def test_quantile_interpolation(self): # test with and without interpolation keyword assert q == q1 - @pytest.mark.skipif(_np_version_under1p9, - reason="Numpy version is under 1.9") def test_quantile_interpolation_dtype(self): # GH #10174 @@ -96,26 +91,6 @@ def test_quantile_interpolation_dtype(self): assert q == np.percentile(np.array([1, 3, 4]), 50) assert is_integer(q) - @pytest.mark.skipif(not _np_version_under1p9, - reason="Numpy version is greater 1.9") - def test_quantile_interpolation_np_lt_1p9(self): - # GH #10174 - - # interpolation = linear (default case) - q = self.ts.quantile(0.1, interpolation='linear') - assert q == np.percentile(self.ts.valid(), 10) - q1 = self.ts.quantile(0.1) - assert q1 == np.percentile(self.ts.valid(), 10) - - # interpolation other than linear - msg = "Interpolation methods other than " - with tm.assert_raises_regex(ValueError, msg): - self.ts.quantile(0.9, interpolation='nearest') - - # object dtype - with tm.assert_raises_regex(ValueError, msg): - Series(self.ts, dtype=object).quantile(0.7, interpolation='higher') - def test_quantile_nan(self): # GH 13098 diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py index ff489eb7f15b1e..128a4cdd845e6e 100644 --- a/pandas/tests/series/test_rank.py +++ b/pandas/tests/series/test_rank.py @@ -28,8 +28,8 @@ class TestSeriesRank(TestData): } def test_rank(self): - tm._skip_if_no_scipy() - from scipy.stats import rankdata + pytest.importorskip('scipy.stats.special') + rankdata = pytest.importorskip('scipy.stats.rankdata') self.ts[::2] = np.nan self.ts[:10][::3] = 4. 
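# ---------------------------------------------------------------------
# Editor's illustration (not part of the patch): pytest.importorskip()
# takes a module path, skips the test when the import fails, and
# returns the imported module, which is why it can replace the older
# tm._skip_if_no_scipy() + import pairs used throughout this patch.
# A minimal sketch, assuming pytest is available (SciPy optional):
#
#     import pytest
#
#     def test_rank_needs_scipy():
#         stats = pytest.importorskip('scipy.stats')  # skip, not fail
#         assert stats.rankdata([1, 3, 2]).tolist() == [1.0, 3.0, 2.0]
#
# Note the dotted name must itself be importable as a module;
# 'scipy.stats' qualifies, whereas a function path would always skip.
# ---------------------------------------------------------------------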
@@ -246,10 +246,9 @@ def _check(s, expected, method='average'): _check(series, results[method], method=method) def test_rank_methods_series(self): - tm.skip_if_no_package('scipy', min_version='0.13', - app='scipy.stats.rankdata') + pytest.importorskip('scipy.stats.special') + rankdata = pytest.importorskip('scipy.stats.rankdata') import scipy - from scipy.stats import rankdata xs = np.random.randn(9) xs = np.concatenate([xs[i:] for i in range(0, 9, 2)]) # add duplicates diff --git a/pandas/tests/sparse/test_array.py b/pandas/tests/sparse/test_array.py index 4ce03f72dbba6e..b0a9182a265fe8 100644 --- a/pandas/tests/sparse/test_array.py +++ b/pandas/tests/sparse/test_array.py @@ -8,7 +8,6 @@ from numpy import nan import numpy as np -from pandas import _np_version_under1p8 from pandas.core.sparse.api import SparseArray, SparseSeries from pandas._libs.sparse import IntIndex from pandas.util.testing import assert_almost_equal @@ -150,10 +149,8 @@ def test_take(self): assert np.isnan(self.arr.take(0)) assert np.isscalar(self.arr.take(2)) - # np.take in < 1.8 doesn't support scalar indexing - if not _np_version_under1p8: - assert self.arr.take(2) == np.take(self.arr_data, 2) - assert self.arr.take(6) == np.take(self.arr_data, 6) + assert self.arr.take(2) == np.take(self.arr_data, 2) + assert self.arr.take(6) == np.take(self.arr_data, 6) exp = SparseArray(np.take(self.arr_data, [2, 3])) tm.assert_sp_array_equal(self.arr.take([2, 3]), exp) diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 2a22fc9d329195..9305504f8d5e3e 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd -from pandas import Series, isna, _np_version_under1p9 +from pandas import Series, isna from pandas.core.dtypes.common import is_integer_dtype import pandas.core.nanops as nanops import pandas.util.testing as tm @@ -340,15 +340,13 @@ def test_nanmean_overflow(self): # In the previous implementation mean can overflow for int dtypes, it # is now consistent with numpy - # numpy < 1.9.0 is not computing this correctly - if not _np_version_under1p9: - for a in [2 ** 55, -2 ** 55, 20150515061816532]: - s = Series(a, index=range(500), dtype=np.int64) - result = s.mean() - np_result = s.values.mean() - assert result == a - assert result == np_result - assert result.dtype == np.float64 + for a in [2 ** 55, -2 ** 55, 20150515061816532]: + s = Series(a, index=range(500), dtype=np.int64) + result = s.mean() + np_result = s.values.mean() + assert result == a + assert result == np_result + assert result.dtype == np.float64 def test_returned_dtype(self): diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index d938d5bf9f3abd..d42e37048d87ff 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -1688,7 +1688,7 @@ def test_resample_dtype_preservation(self): def test_resample_dtype_coerceion(self): - pytest.importorskip('scipy') + pytest.importorskip('scipy.interpolate') # GH 16361 df = {"a": [1, 3, 1, 4]} diff --git a/pandas/tests/tools/test_numeric.py b/pandas/tests/tools/test_numeric.py index 664a97640387ef..1d13ba93ba7592 100644 --- a/pandas/tests/tools/test_numeric.py +++ b/pandas/tests/tools/test_numeric.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd -from pandas import to_numeric, _np_version_under1p9 +from pandas import to_numeric from pandas.util import testing as tm from numpy import iinfo @@ -355,9 +355,6 @@ def test_downcast(self): def test_downcast_limits(self): # Test the 
limits of each downcast. Bug: #14401. - # Check to make sure numpy is new enough to run this test. - if _np_version_under1p9: - pytest.skip("Numpy version is under 1.9") i = 'integer' u = 'unsigned' diff --git a/setup.py b/setup.py index a912b253289540..04a5684c20fcd5 100755 --- a/setup.py +++ b/setup.py @@ -45,7 +45,7 @@ def is_platform_mac(): _have_setuptools = False setuptools_kwargs = {} -min_numpy_ver = '1.7.0' +min_numpy_ver = '1.9.0' if sys.version_info[0] >= 3: setuptools_kwargs = { From dfaf8c6918ff20ef781d3177f464a29e70ee5d65 Mon Sep 17 00:00:00 2001 From: jschendel Date: Tue, 22 Aug 2017 07:57:53 -0600 Subject: [PATCH 015/188] CLN: replace %s syntax with .format in core.tools, algorithms.py, base.py (#17305) --- pandas/core/algorithms.py | 10 +++++----- pandas/core/base.py | 19 ++++++++++--------- pandas/core/tools/datetimes.py | 32 ++++++++++++++++++-------------- pandas/core/tools/timedeltas.py | 7 ++++--- 4 files changed, 37 insertions(+), 31 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ffd03096e2a27f..cccb094eaae7b6 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -393,12 +393,12 @@ def isin(comps, values): if not is_list_like(comps): raise TypeError("only list-like objects are allowed to be passed" - " to isin(), you passed a " - "[{0}]".format(type(comps).__name__)) + " to isin(), you passed a [{comps_type}]" + .format(comps_type=type(comps).__name__)) if not is_list_like(values): raise TypeError("only list-like objects are allowed to be passed" - " to isin(), you passed a " - "[{0}]".format(type(values).__name__)) + " to isin(), you passed a [{values_type}]" + .format(values_type=type(values).__name__)) if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)): values = lib.list_to_object_array(list(values)) @@ -671,7 +671,7 @@ def mode(values): try: result = np.sort(result) except TypeError as e: - warn("Unable to sort modes: %s" % e) + warn("Unable to sort modes: {error}".format(error=e)) result = _reconstruct_data(result, original.dtype, original) return Series(result) diff --git a/pandas/core/base.py b/pandas/core/base.py index 4ae47360357933..a7c991dc8d2572 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -342,24 +342,25 @@ def _obj_with_exclusions(self): def __getitem__(self, key): if self._selection is not None: - raise Exception('Column(s) %s already selected' % self._selection) + raise Exception('Column(s) {selection} already selected' + .format(selection=self._selection)) if isinstance(key, (list, tuple, ABCSeries, ABCIndexClass, np.ndarray)): if len(self.obj.columns.intersection(key)) != len(key): bad_keys = list(set(key).difference(self.obj.columns)) - raise KeyError("Columns not found: %s" - % str(bad_keys)[1:-1]) + raise KeyError("Columns not found: {missing}" + .format(missing=str(bad_keys)[1:-1])) return self._gotitem(list(key), ndim=2) elif not getattr(self, 'as_index', False): if key not in self.obj.columns: - raise KeyError("Column not found: %s" % key) + raise KeyError("Column not found: {key}".format(key=key)) return self._gotitem(key, ndim=2) else: if key not in self.obj: - raise KeyError("Column not found: %s" % key) + raise KeyError("Column not found: {key}".format(key=key)) return self._gotitem(key, ndim=1) def _gotitem(self, key, ndim, subset=None): @@ -409,7 +410,7 @@ def _try_aggregate_string_function(self, arg, *args, **kwargs): if f is not None: return f(self, *args, **kwargs) - raise ValueError("{} is an unknown string function".format(arg)) + raise 
ValueError("{arg} is an unknown string function".format(arg=arg)) def _aggregate(self, arg, *args, **kwargs): """ @@ -484,9 +485,9 @@ def nested_renaming_depr(level=4): is_nested_renamer = True if k not in obj.columns: - raise SpecificationError('cannot perform renaming ' - 'for {0} with a nested ' - 'dictionary'.format(k)) + msg = ('cannot perform renaming for {key} with a ' + 'nested dictionary').format(key=k) + raise SpecificationError(msg) nested_renaming_depr(4 + (_level or 0)) elif isinstance(obj, ABCSeries): diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 6ff4302937d073..53f58660cabdb5 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -46,7 +46,8 @@ def _infer(a, b): if b and b.tzinfo: if not (tslib.get_timezone(tz) == tslib.get_timezone(b.tzinfo)): raise AssertionError('Inputs must both have the same timezone,' - ' {0} != {1}'.format(tz, b.tzinfo)) + ' {timezone1} != {timezone2}' + .format(timezone1=tz, timezone2=b.tzinfo)) return tz tz = None @@ -491,10 +492,10 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): offset = tslib.Timestamp(origin) - tslib.Timestamp(0) except tslib.OutOfBoundsDatetime: raise tslib.OutOfBoundsDatetime( - "origin {} is Out of Bounds".format(origin)) + "origin {origin} is Out of Bounds".format(origin=origin)) except ValueError: - raise ValueError("origin {} cannot be converted " - "to a Timestamp".format(origin)) + raise ValueError("origin {origin} cannot be converted " + "to a Timestamp".format(origin=origin)) # convert the offset to the unit of the arg # this should be lossless in terms of precision @@ -590,16 +591,16 @@ def f(value): required = ['year', 'month', 'day'] req = sorted(list(set(required) - set(unit_rev.keys()))) if len(req): - raise ValueError("to assemble mappings requires at " - "least that [year, month, day] be specified: " - "[{0}] is missing".format(','.join(req))) + raise ValueError("to assemble mappings requires at least that " + "[year, month, day] be specified: [{required}] " + "is missing".format(required=','.join(req))) # keys we don't recognize excess = sorted(list(set(unit_rev.keys()) - set(_unit_map.values()))) if len(excess): raise ValueError("extra keys have been passed " "to the datetime assemblage: " - "[{0}]".format(','.join(excess))) + "[{excess}]".format(','.join(excess=excess))) def coerce(values): # we allow coercion to if errors allows @@ -617,7 +618,7 @@ def coerce(values): values = to_datetime(values, format='%Y%m%d', errors=errors) except (TypeError, ValueError) as e: raise ValueError("cannot assemble the " - "datetimes: {0}".format(e)) + "datetimes: {error}".format(error=e)) for u in ['h', 'm', 's', 'ms', 'us', 'ns']: value = unit_rev.get(u) @@ -627,8 +628,8 @@ def coerce(values): unit=u, errors=errors) except (TypeError, ValueError) as e: - raise ValueError("cannot assemble the datetimes " - "[{0}]: {1}".format(value, e)) + raise ValueError("cannot assemble the datetimes [{value}]: " + "{error}".format(value=value, error=e)) return values @@ -810,8 +811,10 @@ def _convert_listlike(arg, format): times.append(datetime.strptime(element, format).time()) except (ValueError, TypeError): if errors == 'raise': - raise ValueError("Cannot convert %s to a time with " - "given format %s" % (element, format)) + msg = ("Cannot convert {element} to a time with given " + "format {format}").format(element=element, + format=format) + raise ValueError(msg) elif errors == 'ignore': return arg else: @@ -876,6 +879,7 @@ def ole2datetime(oledt): # Excel 
has a bug where it thinks the date 2/29/1900 exists # we just reject any date before 3/1/1900. if val < 61: - raise ValueError("Value is outside of acceptable range: %s " % val) + msg = "Value is outside of acceptable range: {value}".format(value=val) + raise ValueError(msg) return OLE_TIME_ZERO + timedelta(days=val) diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index f2d99d26a87b84..d5132826bb93f8 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -129,7 +129,8 @@ def _validate_timedelta_unit(arg): except: if arg is None: return 'ns' - raise ValueError("invalid timedelta unit {0} provided".format(arg)) + raise ValueError("invalid timedelta unit {arg} provided" + .format(arg=arg)) def _coerce_scalar_to_timedelta_type(r, unit='ns', box=True, errors='raise'): @@ -161,8 +162,8 @@ def _convert_listlike(arg, unit='ns', box=True, errors='raise', name=None): if is_timedelta64_dtype(arg): value = arg.astype('timedelta64[ns]') elif is_integer_dtype(arg): - value = arg.astype('timedelta64[{0}]'.format( - unit)).astype('timedelta64[ns]', copy=False) + value = arg.astype('timedelta64[{unit}]'.format(unit=unit)).astype( + 'timedelta64[ns]', copy=False) else: try: value = tslib.array_to_timedelta64(_ensure_object(arg), From 2bec750b21b8715e3f55e71a6c69f2abef54d08b Mon Sep 17 00:00:00 2001 From: ante328 Date: Tue, 22 Aug 2017 16:31:14 +0200 Subject: [PATCH 016/188] BUG: Fix strange behaviour of Series.iloc on MultiIndex Series (#17148) (#17291) --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/indexing.py | 3 ++- pandas/tests/indexing/test_iloc.py | 29 +++++++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 148fd0a8324021..f760d0b6359a2d 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -353,6 +353,7 @@ Indexing - Fixes ``DataFrame.loc`` for setting with alignment and tz-aware ``DatetimeIndex`` (:issue:`16889`) - Avoids ``IndexError`` when passing an Index or Series to ``.iloc`` with older numpy (:issue:`17193`) - Allow unicode empty strings as placeholders in multilevel columns in Python 2 (:issue:`17099`) +- Bug in ``.iloc`` when used with inplace addition or assignment and an int indexer on a ``MultiIndex`` causing the wrong indexes to be read from and written to (:issue:`17148`) I/O ^^^ diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 929c2346ba5b0d..6b9ad5cd2d93b7 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -146,7 +146,8 @@ def _get_setitem_indexer(self, key): return self._convert_tuple(key, is_setter=True) axis = self.obj._get_axis(0) - if isinstance(axis, MultiIndex): + + if isinstance(axis, MultiIndex) and self.name != 'iloc': try: return axis.get_loc(key) except Exception: diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 31fee303a41e20..39569f0b0cb383 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -269,6 +269,35 @@ def test_iloc_setitem(self): expected = Series([0, 1, 0], index=[4, 5, 6]) tm.assert_series_equal(s, expected) + @pytest.mark.parametrize( + 'data, indexes, values, expected_k', [ + # test without indexer value in first level of MultiIndex + ([[2, 22, 5], [2, 33, 6]], [0, -1, 1], [2, 3, 1], [7, 10]), + # test like code sample 1 in the issue + ([[1, 22, 555], [1, 33, 666]], [0, -1, 1], [200, 300, 100], + [755, 1066]), + # test like code sample 
2 in the issue + ([[1, 3, 7], [2, 4, 8]], [0, -1, 1], [10, 10, 1000], [17, 1018]), + # test like code sample 3 in the issue + ([[1, 11, 4], [2, 22, 5], [3, 33, 6]], [0, -1, 1], [4, 7, 10], + [8, 15, 13]) + ]) + def test_iloc_setitem_int_multiindex_series( + self, data, indexes, values, expected_k): + # GH17148 + df = pd.DataFrame( + data=data, + columns=['i', 'j', 'k']) + df = df.set_index(['i', 'j']) + + series = df.k.copy() + for i, v in zip(indexes, values): + series.iloc[i] += v + + df['k'] = expected_k + expected = df.k + tm.assert_series_equal(series, expected) + def test_iloc_setitem_list(self): # setitem with an iloc list From 0cf2b146c526fe85e2df45b1c5e80da404c9d58f Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 23 Aug 2017 13:30:34 -0700 Subject: [PATCH 017/188] DOC: Add module doc-string to tseries/api.py --- pandas/tseries/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tseries/api.py b/pandas/tseries/api.py index 71386c02547ba7..2094791ecdc609 100644 --- a/pandas/tseries/api.py +++ b/pandas/tseries/api.py @@ -1,5 +1,5 @@ """ - +Timeseries API """ # flake8: noqa From 66ec5f3e616f6449ef2c88401042cf2a282234d7 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 23 Aug 2017 14:35:49 -0700 Subject: [PATCH 018/188] MAINT: Clean up docs in pandas/errors/__init__.py --- pandas/errors/__init__.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 6304f3a527f2c9..42b3bdd4991a9a 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -1,25 +1,28 @@ # flake8: noqa -""" expose public exceptions & warnings """ +""" +Expose public exceptions & warnings +""" from pandas._libs.tslib import OutOfBoundsDatetime class PerformanceWarning(Warning): """ - Warnings shown when there is a possible performance - impact. + Warning raised when there is a possible + performance impact. """ class UnsupportedFunctionCall(ValueError): """ - If attempting to call a numpy function on a pandas - object. For example using ``np.cumsum(groupby_object)``. + Exception raised when attempting to call a numpy function + on a pandas object, but that function is not supported by + the object e.g. ``np.cumsum(groupby_object)``. """ class UnsortedIndexError(KeyError): """ - Error raised when attempting to get a slice of a MultiIndex + Error raised when attempting to get a slice of a MultiIndex, and the index has not been lexsorted. Subclass of `KeyError`. .. versionadded:: 0.20.0 @@ -29,22 +32,22 @@ class UnsortedIndexError(KeyError): class ParserError(ValueError): """ - Exception that is thrown by an error is encountered in `pd.read_csv` + Exception that is raised by an error encountered in `pd.read_csv`. """ class DtypeWarning(Warning): """ - Warning that is raised for a dtype incompatiblity. This is + Warning that is raised for a dtype incompatiblity. This can happen whenever `pd.read_csv` encounters non- - uniform dtypes in a column(s) of a given CSV file + uniform dtypes in a column(s) of a given CSV file. """ class EmptyDataError(ValueError): """ Exception that is thrown in `pd.read_csv` (by both the C and - Python engines) when empty data or header is encountered + Python engines) when empty data or header is encountered. 
""" @@ -53,7 +56,7 @@ class ParserWarning(Warning): Warning that is raised in `pd.read_csv` whenever it is necessary to change parsers (generally from 'c' to 'python') contrary to the one specified by the user due to lack of support or functionality for - parsing particular attributes of a CSV file with the requsted engine + parsing particular attributes of a CSV file with the requsted engine. """ @@ -61,5 +64,4 @@ class MergeError(ValueError): """ Error raised when problems arise during merging due to problems with input data. Subclass of `ValueError`. - """ From d45e12b87ce867b2df3254c386c0f17f175efbf0 Mon Sep 17 00:00:00 2001 From: jschendel Date: Thu, 24 Aug 2017 03:50:18 -0600 Subject: [PATCH 019/188] CLN: replace %s syntax with .format in missing.py, nanops.py, ops.py (#17322) Replaced %s syntax with .format in missing.py, nanops.py, ops.py. Additionally, made some of the existing positional .format code more explicit. --- pandas/core/missing.py | 41 +++++++++++++-------- pandas/core/nanops.py | 8 ++-- pandas/core/ops.py | 84 +++++++++++++++++++++++------------------- 3 files changed, 76 insertions(+), 57 deletions(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 93281e20a2a964..8a6a870834c83e 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -88,8 +88,8 @@ def clean_fill_method(method, allow_nearest=False): valid_methods.append('nearest') expecting = 'pad (ffill), backfill (bfill) or nearest' if method not in valid_methods: - msg = ('Invalid fill method. Expecting %s. Got %s' % - (expecting, method)) + msg = ('Invalid fill method. Expecting {expecting}. Got {method}' + .format(expecting=expecting, method=method)) raise ValueError(msg) return method @@ -104,8 +104,8 @@ def clean_interp_method(method, **kwargs): raise ValueError("You must specify the order of the spline or " "polynomial.") if method not in valid: - raise ValueError("method must be one of {0}." - "Got '{1}' instead.".format(valid, method)) + raise ValueError("method must be one of {valid}. Got '{method}' " + "instead.".format(valid=valid, method=method)) return method @@ -146,8 +146,10 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, valid_limit_directions = ['forward', 'backward', 'both'] limit_direction = limit_direction.lower() if limit_direction not in valid_limit_directions: - raise ValueError('Invalid limit_direction: expecting one of %r, got ' - '%r.' % (valid_limit_directions, limit_direction)) + msg = ('Invalid limit_direction: expecting one of {valid!r}, ' + 'got {invalid!r}.') + raise ValueError(msg.format(valid=valid_limit_directions, + invalid=limit_direction)) from pandas import Series ys = Series(yvalues) @@ -248,7 +250,8 @@ def _interpolate_scipy_wrapper(x, y, new_x, method, fill_value=None, # TODO: Why is DatetimeIndex being imported here? 
from pandas import DatetimeIndex # noqa except ImportError: - raise ImportError('{0} interpolation requires Scipy'.format(method)) + raise ImportError('{method} interpolation requires SciPy' + .format(method=method)) new_x = np.asarray(new_x) @@ -466,7 +469,8 @@ def pad_1d(values, limit=None, mask=None, dtype=None): dtype = values.dtype _method = None if is_float_dtype(values): - _method = getattr(algos, 'pad_inplace_%s' % dtype.name, None) + name = 'pad_inplace_{name}'.format(name=dtype.name) + _method = getattr(algos, name, None) elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): _method = _pad_1d_datetime elif is_integer_dtype(values): @@ -476,7 +480,8 @@ def pad_1d(values, limit=None, mask=None, dtype=None): _method = algos.pad_inplace_object if _method is None: - raise ValueError('Invalid dtype for pad_1d [%s]' % dtype.name) + raise ValueError('Invalid dtype for pad_1d [{name}]' + .format(name=dtype.name)) if mask is None: mask = isna(values) @@ -490,7 +495,8 @@ def backfill_1d(values, limit=None, mask=None, dtype=None): dtype = values.dtype _method = None if is_float_dtype(values): - _method = getattr(algos, 'backfill_inplace_%s' % dtype.name, None) + name = 'backfill_inplace_{name}'.format(name=dtype.name) + _method = getattr(algos, name, None) elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): _method = _backfill_1d_datetime elif is_integer_dtype(values): @@ -500,7 +506,8 @@ def backfill_1d(values, limit=None, mask=None, dtype=None): _method = algos.backfill_inplace_object if _method is None: - raise ValueError('Invalid dtype for backfill_1d [%s]' % dtype.name) + raise ValueError('Invalid dtype for backfill_1d [{name}]' + .format(name=dtype.name)) if mask is None: mask = isna(values) @@ -515,7 +522,8 @@ def pad_2d(values, limit=None, mask=None, dtype=None): dtype = values.dtype _method = None if is_float_dtype(values): - _method = getattr(algos, 'pad_2d_inplace_%s' % dtype.name, None) + name = 'pad_2d_inplace_{name}'.format(name=dtype.name) + _method = getattr(algos, name, None) elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): _method = _pad_2d_datetime elif is_integer_dtype(values): @@ -525,7 +533,8 @@ def pad_2d(values, limit=None, mask=None, dtype=None): _method = algos.pad_2d_inplace_object if _method is None: - raise ValueError('Invalid dtype for pad_2d [%s]' % dtype.name) + raise ValueError('Invalid dtype for pad_2d [{name}]' + .format(name=dtype.name)) if mask is None: mask = isna(values) @@ -544,7 +553,8 @@ def backfill_2d(values, limit=None, mask=None, dtype=None): dtype = values.dtype _method = None if is_float_dtype(values): - _method = getattr(algos, 'backfill_2d_inplace_%s' % dtype.name, None) + name = 'backfill_2d_inplace_{name}'.format(name=dtype.name) + _method = getattr(algos, name, None) elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): _method = _backfill_2d_datetime elif is_integer_dtype(values): @@ -554,7 +564,8 @@ def backfill_2d(values, limit=None, mask=None, dtype=None): _method = algos.backfill_2d_inplace_object if _method is None: - raise ValueError('Invalid dtype for backfill_2d [%s]' % dtype.name) + raise ValueError('Invalid dtype for backfill_2d [{name}]' + .format(name=dtype.name)) if mask is None: mask = isna(values) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index b2bbf1c75b7ea0..858aed7fd3e237 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -70,9 +70,8 @@ def __call__(self, f): def _f(*args, **kwargs): obj_iter = itertools.chain(args, 
compat.itervalues(kwargs)) if any(self.check(obj) for obj in obj_iter): - raise TypeError('reduction operation {0!r} not allowed for ' - 'this dtype'.format( - f.__name__.replace('nan', ''))) + msg = 'reduction operation {name!r} not allowed for this dtype' + raise TypeError(msg.format(name=f.__name__.replace('nan', ''))) try: with np.errstate(invalid='ignore'): return f(*args, **kwargs) @@ -786,7 +785,8 @@ def _ensure_numeric(x): try: x = complex(x) except Exception: - raise TypeError('Could not convert %s to numeric' % str(x)) + raise TypeError('Could not convert {value!s} to numeric' + .format(value=x)) return x # NA-friendly array comparisons diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 82101414e4aa61..221f6ff8b92c68 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -63,9 +63,9 @@ def _create_methods(arith_method, comp_method, bool_method, def names(x): if x[-1] == "_": - return "__%s_" % x + return "__{name}_".format(name=x) else: - return "__%s__" % x + return "__{name}__".format(name=x) else: names = lambda x: x @@ -388,8 +388,8 @@ def _validate(self, lvalues, rvalues, name): if name not in ('__div__', '__truediv__', '__mul__', '__rmul__'): raise TypeError("can only operate on a timedelta and an " "integer or a float for division and " - "multiplication, but the operator [%s] was" - "passed" % name) + "multiplication, but the operator [{name}] " + "was passed".format(name=name)) # 2 timedeltas elif ((self.is_timedelta_lhs and @@ -400,9 +400,9 @@ def _validate(self, lvalues, rvalues, name): if name not in ('__div__', '__rdiv__', '__truediv__', '__rtruediv__', '__add__', '__radd__', '__sub__', '__rsub__'): - raise TypeError("can only operate on a timedeltas for " - "addition, subtraction, and division, but the" - " operator [%s] was passed" % name) + raise TypeError("can only operate on a timedeltas for addition" + ", subtraction, and division, but the operator" + " [{name}] was passed".format(name=name)) # datetime and timedelta/DateOffset elif (self.is_datetime_lhs and @@ -411,23 +411,24 @@ def _validate(self, lvalues, rvalues, name): if name not in ('__add__', '__radd__', '__sub__'): raise TypeError("can only operate on a datetime with a rhs of " "a timedelta/DateOffset for addition and " - "subtraction, but the operator [%s] was " - "passed" % name) + "subtraction, but the operator [{name}] was " + "passed".format(name=name)) elif (self.is_datetime_rhs and (self.is_timedelta_lhs or self.is_offset_lhs)): if name not in ('__add__', '__radd__', '__rsub__'): raise TypeError("can only operate on a timedelta/DateOffset " "with a rhs of a datetime for addition, " - "but the operator [%s] was passed" % name) + "but the operator [{name}] was passed" + .format(name=name)) # 2 datetimes elif self.is_datetime_lhs and self.is_datetime_rhs: if name not in ('__sub__', '__rsub__'): raise TypeError("can only operate on a datetimes for" - " subtraction, but the operator [%s] was" - " passed" % name) + " subtraction, but the operator [{name}] was" + " passed".format(name=name)) # if tz's must be equal (same or None) if getattr(lvalues, 'tz', None) != getattr(rvalues, 'tz', None): @@ -439,8 +440,8 @@ def _validate(self, lvalues, rvalues, name): if name not in ('__add__', '__radd__'): raise TypeError("can only operate on a timedelta/DateOffset " - "and a datetime for addition, but the " - "operator [%s] was passed" % name) + "and a datetime for addition, but the operator" + " [{name}] was passed".format(name=name)) else: raise TypeError('cannot operate on a series without a rhs 
' 'of a series/ndarray of type datetime64[ns] ' @@ -498,7 +499,7 @@ def _convert_to_array(self, values, name=None, other=None): values = values.to_timestamp().to_series() elif name not in ('__truediv__', '__div__', '__mul__', '__rmul__'): raise TypeError("incompatible type for a datetime/timedelta " - "operation [{0}]".format(name)) + "operation [{name}]".format(name=name)) elif inferred_type == 'floating': if (isna(values).all() and name in ('__add__', '__radd__', '__sub__', '__rsub__')): @@ -508,8 +509,9 @@ def _convert_to_array(self, values, name=None, other=None): elif self._is_offset(values): return values else: - raise TypeError("incompatible type [{0}] for a datetime/timedelta" - " operation".format(np.array(values).dtype)) + raise TypeError("incompatible type [{dtype}] for a " + "datetime/timedelta operation" + .format(dtype=np.array(values).dtype)) return values @@ -866,8 +868,8 @@ def wrapper(self, other, axis=None): with np.errstate(all='ignore'): res = na_op(values, other) if is_scalar(res): - raise TypeError('Could not compare %s type with Series' % - type(other)) + raise TypeError('Could not compare {typ} type with Series' + .format(typ=type(other))) # always return a full value series here res = _values_from_object(res) @@ -906,9 +908,10 @@ def na_op(x, y): y = bool(y) result = lib.scalar_binop(x, y, op) except: - raise TypeError("cannot compare a dtyped [{0}] array with " - "a scalar of type [{1}]".format( - x.dtype, type(y).__name__)) + msg = ("cannot compare a dtyped [{dtype}] array " + "with a scalar of type [{type}]" + ).format(dtype=x.dtype, type=type(y).__name__) + raise TypeError(msg) return result @@ -1140,14 +1143,17 @@ def _align_method_FRAME(left, right, axis): """ convert rhs to meet lhs dims if input is list, tuple or np.ndarray """ def to_series(right): - msg = 'Unable to coerce to Series, length must be {0}: given {1}' + msg = ('Unable to coerce to Series, length must be {req_len}: ' + 'given {given_len}') if axis is not None and left._get_axis_name(axis) == 'index': if len(left.index) != len(right): - raise ValueError(msg.format(len(left.index), len(right))) + raise ValueError(msg.format(req_len=len(left.index), + given_len=len(right))) right = left._constructor_sliced(right, index=left.index) else: if len(left.columns) != len(right): - raise ValueError(msg.format(len(left.columns), len(right))) + raise ValueError(msg.format(req_len=len(left.columns), + given_len=len(right))) right = left._constructor_sliced(right, index=left.columns) return right @@ -1161,15 +1167,16 @@ def to_series(right): elif right.ndim == 2: if left.shape != right.shape: - msg = ("Unable to coerce to DataFrame, " - "shape must be {0}: given {1}") - raise ValueError(msg.format(left.shape, right.shape)) + msg = ("Unable to coerce to DataFrame, shape " + "must be {req_shape}: given {given_shape}" + ).format(req_shape=left.shape, given_shape=right.shape) + raise ValueError(msg) right = left._constructor(right, index=left.index, columns=left.columns) else: - msg = 'Unable to coerce to Series/DataFrame, dim must be <= 2: {0}' - raise ValueError(msg.format(right.shape, )) + raise ValueError('Unable to coerce to Series/DataFrame, dim ' + 'must be <= 2: {dim}'.format(dim=right.shape)) return right @@ -1278,7 +1285,8 @@ def na_op(x, y): return result - @Appender('Wrapper for flexible comparison methods %s' % name) + @Appender('Wrapper for flexible comparison methods {name}' + .format(name=name)) def f(self, other, axis=default_axis, level=None): other = _align_method_FRAME(self, other, axis) @@ 
-1299,7 +1307,7 @@ def f(self, other, axis=default_axis, level=None): def _comp_method_FRAME(func, name, str_rep, masker=False): - @Appender('Wrapper for comparison method %s' % name) + @Appender('Wrapper for comparison method {name}'.format(name=name)) def f(self, other): if isinstance(other, pd.DataFrame): # Another DataFrame return self._compare_frame(other, func, str_rep) @@ -1349,9 +1357,9 @@ def na_op(x, y): # work only for scalars def f(self, other): if not is_scalar(other): - raise ValueError('Simple arithmetic with %s can only be ' - 'done with scalar values' % - self._constructor.__name__) + raise ValueError('Simple arithmetic with {name} can only be ' + 'done with scalar values' + .format(name=self._constructor.__name__)) return self._combine(other, op) @@ -1384,7 +1392,7 @@ def na_op(x, y): return result - @Appender('Wrapper for comparison method %s' % name) + @Appender('Wrapper for comparison method {name}'.format(name=name)) def f(self, other, axis=None): # Validate the axis parameter if axis is not None: @@ -1394,8 +1402,8 @@ def f(self, other, axis=None): return self._compare_constructor(other, na_op, try_cast=False) elif isinstance(other, (self._constructor_sliced, pd.DataFrame, ABCSeries)): - raise Exception("input needs alignment for this object [%s]" % - self._constructor) + raise Exception("input needs alignment for this object [{object}]" + .format(object=self._constructor)) else: return self._combine_const(other, na_op, try_cast=False) From 6993c1ba981554cdd8f45675db5807077a28e2c0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 24 Aug 2017 03:03:18 -0700 Subject: [PATCH 020/188] Make pd.Period immutable (#17239) --- doc/source/whatsnew/v0.21.0.txt | 2 ++ pandas/_libs/period.pyx | 17 +++++++++++------ pandas/tests/scalar/test_period.py | 11 +++++++++++ 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index f760d0b6359a2d..604d275511fa02 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -291,6 +291,8 @@ Other API Changes - Moved definition of ``MergeError`` to the ``pandas.errors`` module. - The signature of :func:`Series.set_axis` and :func:`DataFrame.set_axis` has been changed from ``set_axis(axis, labels)`` to ``set_axis(labels, axis=0)``, for consistency with the rest of the API. The old signature is deprecated and will show a ``FutureWarning`` (:issue:`14636`) - :func:`Series.argmin` and :func:`Series.argmax` will now raise a ``TypeError`` when used with ``object`` dtypes, instead of a ``ValueError`` (:issue:`13595`) +- :class:`Period` is now immutable, and will now raise an ``AttributeError`` when a user tries to assign a new value to the ``ordinal`` or ``freq`` attributes (:issue:`17116`). + .. 
_whatsnew_0210.deprecations: diff --git a/pandas/_libs/period.pyx b/pandas/_libs/period.pyx index 6ba7ec0270f30a..a1d04fea891517 100644 --- a/pandas/_libs/period.pyx +++ b/pandas/_libs/period.pyx @@ -29,7 +29,9 @@ from datetime cimport ( PANDAS_FR_ns, INT32_MIN) + cimport util, lib + from lib cimport is_null_datetimelike, is_period from pandas._libs import tslib, lib from pandas._libs.tslib import (Timedelta, Timestamp, iNaT, @@ -668,13 +670,17 @@ class IncompatibleFrequency(ValueError): cdef class _Period(object): - cdef public: + cdef readonly: int64_t ordinal object freq _comparables = ['name', 'freqstr'] _typ = 'period' + def __cinit__(self, ordinal, freq): + self.ordinal = ordinal + self.freq = freq + @classmethod def _maybe_convert_freq(cls, object freq): @@ -698,9 +704,8 @@ cdef class _Period(object): if ordinal == iNaT: return NaT else: - self = _Period.__new__(cls) - self.ordinal = ordinal - self.freq = cls._maybe_convert_freq(freq) + freq = cls._maybe_convert_freq(freq) + self = _Period.__new__(cls, ordinal, freq) return self def __richcmp__(self, other, op): @@ -752,7 +757,7 @@ cdef class _Period(object): def __add__(self, other): if isinstance(self, Period): if isinstance(other, (timedelta, np.timedelta64, - offsets.Tick, offsets.DateOffset, + offsets.DateOffset, Timedelta)): return self._add_delta(other) elif other is NaT: @@ -770,7 +775,7 @@ cdef class _Period(object): def __sub__(self, other): if isinstance(self, Period): if isinstance(other, (timedelta, np.timedelta64, - offsets.Tick, offsets.DateOffset, + offsets.DateOffset, Timedelta)): neg_other = -other return self + neg_other diff --git a/pandas/tests/scalar/test_period.py b/pandas/tests/scalar/test_period.py index 931d6b2b8f1f09..a167c9c738b0bf 100644 --- a/pandas/tests/scalar/test_period.py +++ b/pandas/tests/scalar/test_period.py @@ -1406,3 +1406,14 @@ def test_period_ops_offset(self): with tm.assert_raises_regex(period.IncompatibleFrequency, msg): p - offsets.Hour(2) + + +def test_period_immutable(): + # see gh-17116 + per = pd.Period('2014Q1') + with pytest.raises(AttributeError): + per.ordinal = 14 + + freq = per.freq + with pytest.raises(AttributeError): + per.freq = 2 * freq From 62527c0f328caa4ae716328246df75a6f2b33028 Mon Sep 17 00:00:00 2001 From: P-Tillmann Date: Thu, 24 Aug 2017 12:38:27 +0200 Subject: [PATCH 021/188] Bug: groupby multiindex levels equals rows (#16859) closes #16843 --- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/core/groupby.py | 9 +++++---- pandas/tests/groupby/test_groupby.py | 13 +++++++++++++ 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 604d275511fa02..6317b4ae845656 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -383,7 +383,7 @@ Groupby/Resample/Rolling - Bug in ``groupby.transform()`` that would coerce boolean dtypes back to float (:issue:`16875`) - Bug in ``Series.resample(...).apply()`` where an empty ``Series`` modified the source index and did not return the name of a ``Series`` (:issue:`14313`) - Bug in ``.rolling(...).apply(...)`` with a ``DataFrame`` with a ``DatetimeIndex``, a ``window`` of a timedelta-convertible and ``min_periods >= 1` (:issue:`15305`) - +- Bug in ``DataFrame.groupby`` where index and column keys were not recognized correctly when the number of keys equaled the number of elements on the groupby axis (:issue:`16859`) Sparse ^^^^^^ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index aa7c4517c0a016..c23b00dc740a43 
100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2629,13 +2629,14 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, try: if isinstance(obj, DataFrame): - all_in_columns = all(g in obj.columns for g in keys) + all_in_columns_index = all(g in obj.columns or g in obj.index.names + for g in keys) else: - all_in_columns = False + all_in_columns_index = False except Exception: - all_in_columns = False + all_in_columns_index = False - if not any_callable and not all_in_columns and \ + if not any_callable and not all_in_columns_index and \ not any_arraylike and not any_groupers and \ match_axis_length and level is None: keys = [com._asarray_tuplesafe(keys)] diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index f9e1a0d2e744a9..8957beacab376d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3891,6 +3891,19 @@ def predictions(tool): result = df2.groupby('Key').apply(predictions).p1 tm.assert_series_equal(expected, result) + def test_gb_key_len_equal_axis_len(self): + # GH16843 + # test ensures that index and column keys are recognized correctly + # when number of keys equals axis length of groupby + df = pd.DataFrame([['foo', 'bar', 'B', 1], + ['foo', 'bar', 'B', 2], + ['foo', 'baz', 'C', 3]], + columns=['first', 'second', 'third', 'one']) + df = df.set_index(['first', 'second']) + df = df.groupby(['first', 'second', 'third']).size() + assert df.loc[('foo', 'bar', 'B')] == 2 + assert df.loc[('foo', 'baz', 'C')] == 1 + def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): tups = lmap(tuple, df[keys].values) From 96f92eb1c696723b6465fdc273dc8406201c606a Mon Sep 17 00:00:00 2001 From: step4me Date: Thu, 24 Aug 2017 08:53:50 -0400 Subject: [PATCH 022/188] BUG: Cannot use tz-aware origin in to_datetime (#16842) closes #16842 Author: step4me Closes #17244 from step4me/step4me-feature and squashes the following commits: 09d051d48 [step4me] BUG: Cannot use tz-aware origin in to_datetime (#16842) --- doc/source/whatsnew/v0.21.0.txt | 5 +++-- pandas/core/tools/datetimes.py | 7 ++++++- pandas/tests/indexes/datetimes/test_tools.py | 6 ++++++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 6317b4ae845656..fcadd26156b1d4 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -292,6 +292,7 @@ Other API Changes - The signature of :func:`Series.set_axis` and :func:`DataFrame.set_axis` has been changed from ``set_axis(axis, labels)`` to ``set_axis(labels, axis=0)``, for consistency with the rest of the API. The old signature is deprecated and will show a ``FutureWarning`` (:issue:`14636`) - :func:`Series.argmin` and :func:`Series.argmax` will now raise a ``TypeError`` when used with ``object`` dtypes, instead of a ``ValueError`` (:issue:`13595`) - :class:`Period` is now immutable, and will now raise an ``AttributeError`` when a user tries to assign a new value to the ``ordinal`` or ``freq`` attributes (:issue:`17116`). +- :func:`to_datetime` when passed a tz-aware ``origin=`` kwarg will now raise a more informative ``ValueError`` rather than a ``TypeError`` (:issue:`16842`) .. 
_whatsnew_0210.deprecations: @@ -356,6 +357,7 @@ Indexing - Avoids ``IndexError`` when passing an Index or Series to ``.iloc`` with older numpy (:issue:`17193`) - Allow unicode empty strings as placeholders in multilevel columns in Python 2 (:issue:`17099`) - Bug in ``.iloc`` when used with inplace addition or assignment and an int indexer on a ``MultiIndex`` causing the wrong indexes to be read from and written to (:issue:`17148`) +- Bug in ``.isin()`` in which checking membership in empty ``Series`` objects raised an error (:issue:`16991`) I/O ^^^ @@ -402,6 +404,7 @@ Reshaping - Fixes dtype of result with integer dtype input, from :func:`pivot_table` when called with ``margins=True`` (:issue:`17013`) - Bug in :func:`crosstab` where passing two ``Series`` with the same name raised a ``KeyError`` (:issue:`13279`) - :func:`Series.argmin`, :func:`Series.argmax`, and their counterparts on ``DataFrame`` and groupby objects work correctly with floating point data that contains infinite values (:issue:`13595`). +- Bug in :func:`unique` where checking a tuple of strings raised a ``TypeError`` (:issue:`17108`) Numeric ^^^^^^^ @@ -420,5 +423,3 @@ Categorical Other ^^^^^ - Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`) -- Bug in ``.isin()`` in which checking membership in empty ``Series`` objects raised an error (:issue:`16991`) -- Bug in :func:`unique` where checking a tuple of strings raised a ``TypeError`` (:issue:`17108`) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 53f58660cabdb5..c0f234a36803d7 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -489,7 +489,7 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): # we are going to offset back to unix / epoch time try: - offset = tslib.Timestamp(origin) - tslib.Timestamp(0) + offset = tslib.Timestamp(origin) except tslib.OutOfBoundsDatetime: raise tslib.OutOfBoundsDatetime( "origin {origin} is Out of Bounds".format(origin=origin)) @@ -497,6 +497,11 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): raise ValueError("origin {origin} cannot be converted " "to a Timestamp".format(origin=origin)) + if offset.tz is not None: + raise ValueError( + "origin offset {} must be tz-naive".format(offset)) + offset -= tslib.Timestamp(0) + # convert the offset to the unit of the arg # this should be lossless in terms of precision offset = offset // tslib.Timedelta(1, unit=unit) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 9764b65d330af3..50669ee357bbdc 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -1589,6 +1589,12 @@ def test_invalid_origins(self, origin, exc, units, units_from_epochs): pd.to_datetime(units_from_epochs, unit=units, origin=origin) + def test_invalid_origins_tzinfo(self): + # GH16842 + with pytest.raises(ValueError): + pd.to_datetime(1, unit='D', + origin=datetime(2000, 1, 1, tzinfo=pytz.utc)) + def test_processing_order(self): # make sure we handle out-of-bounds *before* # constructing the dates From 473a7f3c186f6b0bfd9d3ce413fb627cf7a8f111 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 25 Aug 2017 13:29:57 -0700 Subject: [PATCH 023/188] Replace usage of total_seconds compat func with timedelta method (#17289) --- pandas/_libs/period.pyx | 7 ++--- pandas/_libs/src/datetime_helper.h | 36 ----------------------- pandas/_libs/src/ujson/python/objToJSON.c | 22 
+++++++++++++- pandas/_libs/tslib.pyx | 28 +++++++----------- pandas/io/pytables.py | 2 +- pandas/tseries/offsets.py | 6 ++-- setup.py | 2 -- 7 files changed, 38 insertions(+), 65 deletions(-) delete mode 100644 pandas/_libs/src/datetime_helper.h diff --git a/pandas/_libs/period.pyx b/pandas/_libs/period.pyx index a1d04fea891517..816b7ebfff86de 100644 --- a/pandas/_libs/period.pyx +++ b/pandas/_libs/period.pyx @@ -10,9 +10,6 @@ from numpy cimport (int8_t, int32_t, int64_t, import_array, ndarray, NPY_INT64, NPY_DATETIME, NPY_TIMEDELTA) import numpy as np -cdef extern from "datetime_helper.h": - double total_seconds(object) - from libc.stdlib cimport free from pandas import compat @@ -552,7 +549,7 @@ cdef _reso_local(ndarray[int64_t] stamps, object tz): &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz) - delta = int(total_seconds(_get_utcoffset(tz, dt))) * 1000000000 + delta = int(_get_utcoffset(tz, dt).total_seconds()) * 1000000000 pandas_datetime_to_datetimestruct(stamps[i] + delta, PANDAS_FR_ns, &dts) curr_reso = _reso_stamp(&dts) @@ -619,7 +616,7 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz) - delta = int(total_seconds(_get_utcoffset(tz, dt))) * 1000000000 + delta = int(_get_utcoffset(tz, dt).total_seconds()) * 1000000000 pandas_datetime_to_datetimestruct(stamps[i] + delta, PANDAS_FR_ns, &dts) result[i] = get_period_ordinal(dts.year, dts.month, dts.day, diff --git a/pandas/_libs/src/datetime_helper.h b/pandas/_libs/src/datetime_helper.h deleted file mode 100644 index 8023285f85b9b3..00000000000000 --- a/pandas/_libs/src/datetime_helper.h +++ /dev/null @@ -1,36 +0,0 @@ -/* -Copyright (c) 2016, PyData Development Team -All rights reserved. - -Distributed under the terms of the BSD Simplified License. - -The full license is in the LICENSE file, distributed with this software. -*/ - -#ifndef PANDAS__LIBS_SRC_DATETIME_HELPER_H_ -#define PANDAS__LIBS_SRC_DATETIME_HELPER_H_ - -#include -#include "datetime.h" -#include "numpy/arrayobject.h" -#include "numpy/arrayscalars.h" - -npy_int64 get_long_attr(PyObject *o, const char *attr) { - npy_int64 long_val; - PyObject *value = PyObject_GetAttrString(o, attr); - long_val = (PyLong_Check(value) ? 
- PyLong_AsLongLong(value) : PyInt_AS_LONG(value)); - Py_DECREF(value); - return long_val; -} - -npy_float64 total_seconds(PyObject *td) { - // Python 2.6 compat - npy_int64 microseconds = get_long_attr(td, "microseconds"); - npy_int64 seconds = get_long_attr(td, "seconds"); - npy_int64 days = get_long_attr(td, "days"); - npy_int64 days_in_seconds = days * 24LL * 3600LL; - return (microseconds + (seconds + days_in_seconds) * 1000000.0) / 1000000.0; -} - -#endif // PANDAS__LIBS_SRC_DATETIME_HELPER_H_ diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index f2c0b18d351312..4beaa3fd449df2 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -47,9 +47,9 @@ Numeric decoder derived from from TCL library #include // NOLINT(build/include_order) #include // NOLINT(build/include_order) #include // NOLINT(build/include_order) -#include // NOLINT(build/include_order) #include // NOLINT(build/include_order) #include // NOLINT(build/include_order) +#include "datetime.h" static PyObject *type_decimal; @@ -329,6 +329,26 @@ static Py_ssize_t get_attr_length(PyObject *obj, char *attr) { return ret; } +npy_int64 get_long_attr(PyObject *o, const char *attr) { + npy_int64 long_val; + PyObject *value = PyObject_GetAttrString(o, attr); + long_val = (PyLong_Check(value) ? + PyLong_AsLongLong(value) : PyInt_AS_LONG(value)); + Py_DECREF(value); + return long_val; +} + +npy_float64 total_seconds(PyObject *td) { + // Python 2.6 compat + // TODO(anyone): remove this legacy workaround with a more + // direct td.total_seconds() + npy_int64 microseconds = get_long_attr(td, "microseconds"); + npy_int64 seconds = get_long_attr(td, "seconds"); + npy_int64 days = get_long_attr(td, "days"); + npy_int64 days_in_seconds = days * 24LL * 3600LL; + return (microseconds + (seconds + days_in_seconds) * 1000000.0) / 1000000.0; +} + static PyObject *get_item(PyObject *obj, Py_ssize_t i) { PyObject *tmp = PyInt_FromSsize_t(i); PyObject *ret; diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index c4a38ec660a4c3..b5aca2e3ec3094 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -26,9 +26,6 @@ from cpython cimport ( cdef extern from "Python.h": cdef PyTypeObject *Py_TYPE(object) -cdef extern from "datetime_helper.h": - double total_seconds(object) - # this is our datetime.pxd from libc.stdlib cimport free @@ -1639,7 +1636,7 @@ cdef inline void _localize_tso(_TSObject obj, object tz): pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts) dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day, obj.dts.hour, obj.dts.min, obj.dts.sec, obj.dts.us, tz) - delta = int(total_seconds(_get_utcoffset(tz, dt))) * 1000000000 + delta = int(_get_utcoffset(tz, dt).total_seconds()) * 1000000000 if obj.value != NPY_NAT: pandas_datetime_to_datetimestruct(obj.value + delta, PANDAS_FR_ns, &obj.dts) @@ -4136,7 +4133,7 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): pandas_datetime_to_datetimestruct(v, PANDAS_FR_ns, &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz1) - delta = (int(total_seconds(_get_utcoffset(tz1, dt))) + delta = (int(_get_utcoffset(tz1, dt).total_seconds()) * 1000000000) utc_dates[i] = v - delta else: @@ -4176,8 +4173,8 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): pandas_datetime_to_datetimestruct(v, PANDAS_FR_ns, &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz2) - delta = 
int(total_seconds( - _get_utcoffset(tz2, dt))) * 1000000000 + delta = (int(_get_utcoffset(tz2, dt).total_seconds()) + * 1000000000) result[i] = v + delta return result @@ -4243,7 +4240,7 @@ def tz_convert_single(int64_t val, object tz1, object tz2): pandas_datetime_to_datetimestruct(val, PANDAS_FR_ns, &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz1) - delta = int(total_seconds(_get_utcoffset(tz1, dt))) * 1000000000 + delta = int(_get_utcoffset(tz1, dt).total_seconds()) * 1000000000 utc_date = val - delta elif _get_zone(tz1) != 'UTC': trans, deltas, typ = _get_dst_info(tz1) @@ -4261,7 +4258,7 @@ def tz_convert_single(int64_t val, object tz1, object tz2): pandas_datetime_to_datetimestruct(val, PANDAS_FR_ns, &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz2) - delta = int(total_seconds(_get_utcoffset(tz2, dt))) * 1000000000 + delta = int(_get_utcoffset(tz2, dt).total_seconds()) * 1000000000 return utc_date + delta # Convert UTC to other timezone @@ -4333,7 +4330,7 @@ cdef object _get_dst_info(object tz): """ cache_key = _tz_cache_key(tz) if cache_key is None: - num = int(total_seconds(_get_utcoffset(tz, None))) * 1000000000 + num = int(_get_utcoffset(tz, None).total_seconds()) * 1000000000 return (np.array([NPY_NAT + 1], dtype=np.int64), np.array([num], dtype=np.int64), None) @@ -4380,7 +4377,7 @@ cdef object _get_dst_info(object tz): else: # static tzinfo trans = np.array([NPY_NAT + 1], dtype=np.int64) - num = int(total_seconds(_get_utcoffset(tz, None))) * 1000000000 + num = int(_get_utcoffset(tz, None).total_seconds()) * 1000000000 deltas = np.array([num], dtype=np.int64) typ = 'static' @@ -4403,9 +4400,6 @@ cdef object _get_utc_trans_times_from_dateutil_tz(object tz): return new_trans -def tot_seconds(td): - return total_seconds(td) - cpdef ndarray _unbox_utcoffsets(object transinfo): cdef: Py_ssize_t i, sz @@ -4415,7 +4409,7 @@ cpdef ndarray _unbox_utcoffsets(object transinfo): arr = np.empty(sz, dtype='i8') for i in range(sz): - arr[i] = int(total_seconds(transinfo[i][0])) * 1000000000 + arr[i] = int(transinfo[i][0].total_seconds()) * 1000000000 return arr @@ -4458,7 +4452,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, pandas_datetime_to_datetimestruct(v, PANDAS_FR_ns, &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz) - delta = int(total_seconds(_get_utcoffset(tz, dt))) * 1000000000 + delta = int(_get_utcoffset(tz, dt).total_seconds()) * 1000000000 result[i] = v - delta return result @@ -5181,7 +5175,7 @@ cdef _normalize_local(ndarray[int64_t] stamps, object tz): pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz) - delta = int(total_seconds(_get_utcoffset(tz, dt))) * 1000000000 + delta = int(_get_utcoffset(tz, dt).total_seconds()) * 1000000000 pandas_datetime_to_datetimestruct(stamps[i] + delta, PANDAS_FR_ns, &dts) result[i] = _normalized_stamp(&dts) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 82c80a13372d7a..712e9e9903f0a5 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4381,7 +4381,7 @@ def _get_tz(tz): """ for a tz-aware type, return an encoded zone """ zone = tslib.get_timezone(tz) if zone is None: - zone = tslib.tot_seconds(tz.utcoffset()) + zone = tz.utcoffset().total_seconds() return zone diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 
29cdda55488965..7ccecaa84e6d6d 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -778,12 +778,12 @@ def _get_business_hours_by_sec(self): # create dummy datetime to calcurate businesshours in a day dtstart = datetime(2014, 4, 1, self.start.hour, self.start.minute) until = datetime(2014, 4, 1, self.end.hour, self.end.minute) - return tslib.tot_seconds(until - dtstart) + return (until - dtstart).total_seconds() else: self.daytime = False dtstart = datetime(2014, 4, 1, self.start.hour, self.start.minute) until = datetime(2014, 4, 2, self.end.hour, self.end.minute) - return tslib.tot_seconds(until - dtstart) + return (until - dtstart).total_seconds() @apply_wraps def rollback(self, dt): @@ -907,7 +907,7 @@ def _onOffset(self, dt, businesshours): op = self._prev_opening_time(dt) else: op = self._next_opening_time(dt) - span = tslib.tot_seconds(dt - op) + span = (dt - op).total_seconds() if span <= businesshours: return True else: diff --git a/setup.py b/setup.py index 04a5684c20fcd5..444db5bc4d275e 100755 --- a/setup.py +++ b/setup.py @@ -467,7 +467,6 @@ def pxd(name): tseries_depends = ['pandas/_libs/src/datetime/np_datetime.h', 'pandas/_libs/src/datetime/np_datetime_strings.h', - 'pandas/_libs/src/datetime_helper.h', 'pandas/_libs/src/period_helper.h', 'pandas/_libs/src/datetime.pxd'] @@ -597,7 +596,6 @@ def pxd(name): ujson_ext = Extension('pandas._libs.json', depends=['pandas/_libs/src/ujson/lib/ultrajson.h', - 'pandas/_libs/src/datetime_helper.h', 'pandas/_libs/src/numpy_helper.h'], sources=['pandas/_libs/src/ujson/python/ujson.c', 'pandas/_libs/src/ujson/python/objToJSON.c', From 376483e12e4a08140d594eab86bf22423684fbcb Mon Sep 17 00:00:00 2001 From: cbertinato Date: Mon, 28 Aug 2017 09:58:05 -0400 Subject: [PATCH 024/188] CLN: replace %s syntax with .format in core/indexing.py (#17357) Progress toward issue #16130. Converted old string formatting to new string formatting in core/indexing.py. 
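
For reference, this is the shape of the conversion applied throughout this
series, shown on one of the messages touched below (illustrative excerpt,
not itself part of the diff):

    # before: positional %-interpolation, easy to mismatch with the tuple
    raise KeyError("the label [%s] is not in the [%s]" %
                   (key, self.obj._get_axis_name(axis)))

    # after: named str.format fields are self-documenting and reorderable
    raise KeyError(u"the label [{key}] is not in the [{axis}]"
                   .format(key=key, axis=self.obj._get_axis_name(axis)))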
--- pandas/core/indexing.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 6b9ad5cd2d93b7..b7a51afcedabfe 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -188,8 +188,9 @@ def _has_valid_tuple(self, key): if i >= self.obj.ndim: raise IndexingError('Too many indexers') if not self._has_valid_type(k, i): - raise ValueError("Location based indexing can only have [%s] " - "types" % self._valid_types) + raise ValueError("Location based indexing can only have " + "[{types}] types" + .format(types=self._valid_types)) def _should_validate_iterable(self, axis=0): """ return a boolean whether this axes needs validation for a passed @@ -263,11 +264,11 @@ def _has_valid_positional_setitem_indexer(self, indexer): pass elif is_integer(i): if i >= len(ax): - raise IndexError("{0} cannot enlarge its target object" - .format(self.name)) + raise IndexError("{name} cannot enlarge its target " + "object".format(name=self.name)) elif isinstance(i, dict): - raise IndexError("{0} cannot enlarge its target object" - .format(self.name)) + raise IndexError("{name} cannot enlarge its target object" + .format(name=self.name)) return True @@ -1235,7 +1236,8 @@ def _convert_to_indexer(self, obj, axis=0, is_setter=False): mask = check == -1 if mask.any(): - raise KeyError('%s not in index' % objarr[mask]) + raise KeyError('{mask} not in index' + .format(mask=objarr[mask])) return _values_from_object(indexer) @@ -1421,8 +1423,9 @@ def _has_valid_type(self, key, axis): if (not is_iterator(key) and len(key) and np.all(ax.get_indexer_for(key) < 0)): - raise KeyError("None of [%s] are in the [%s]" % - (key, self.obj._get_axis_name(axis))) + raise KeyError(u"None of [{key}] are in the [{axis}]" + .format(key=key, + axis=self.obj._get_axis_name(axis))) return True @@ -1432,8 +1435,9 @@ def error(): if isna(key): raise TypeError("cannot use label indexing with a null " "key") - raise KeyError("the label [%s] is not in the [%s]" % - (key, self.obj._get_axis_name(axis))) + raise KeyError(u"the label [{key}] is not in the [{axis}]" + .format(key=key, + axis=self.obj._get_axis_name(axis))) try: key = self._convert_scalar_indexer(key, axis) From 36dadd70376c6033037af281a4669a360fc71cfa Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 28 Aug 2017 07:05:29 -0700 Subject: [PATCH 025/188] DOC: Point to dev-docs in issue template (#17353) [ci skip] --- .github/ISSUE_TEMPLATE.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 237e61487d13a4..e33835c4625112 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -12,6 +12,12 @@ **Note**: Many problems can be resolved by simply upgrading `pandas` to the latest version. Before submitting, please check if that solution works for you. If possible, you may want to check if `master` addresses this issue, but that is not necessary. +For documentation-related issues, you can check the latest versions of the docs on `master` here: + +https://pandas-docs.github.io/pandas-docs-travis/ + +If the issue has not been resolved there, go ahead and file it in the issue tracker. 
+ #### Expected Output #### Output of ``pd.show_versions()`` From df2ebfc9fd424ec760bfd2879993e44aaf983d42 Mon Sep 17 00:00:00 2001 From: chris-b1 Date: Tue, 29 Aug 2017 05:06:29 -0500 Subject: [PATCH 026/188] CLN: remove total_seconds compat from json (#17341) --- pandas/_libs/src/ujson/python/objToJSON.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 4beaa3fd449df2..1ee862b54cf0bc 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -329,7 +329,7 @@ static Py_ssize_t get_attr_length(PyObject *obj, char *attr) { return ret; } -npy_int64 get_long_attr(PyObject *o, const char *attr) { +static npy_int64 get_long_attr(PyObject *o, const char *attr) { npy_int64 long_val; PyObject *value = PyObject_GetAttrString(o, attr); long_val = (PyLong_Check(value) ? @@ -338,15 +338,12 @@ npy_int64 get_long_attr(PyObject *o, const char *attr) { return long_val; } -npy_float64 total_seconds(PyObject *td) { - // Python 2.6 compat - // TODO(anyone): remove this legacy workaround with a more - // direct td.total_seconds() - npy_int64 microseconds = get_long_attr(td, "microseconds"); - npy_int64 seconds = get_long_attr(td, "seconds"); - npy_int64 days = get_long_attr(td, "days"); - npy_int64 days_in_seconds = days * 24LL * 3600LL; - return (microseconds + (seconds + days_in_seconds) * 1000000.0) / 1000000.0; +static npy_float64 total_seconds(PyObject *td) { + npy_float64 double_val; + PyObject *value = PyObject_CallMethod(td, "total_seconds", NULL); + double_val = PyFloat_AS_DOUBLE(value); + Py_DECREF(value); + return double_val; } static PyObject *get_item(PyObject *obj, Py_ssize_t i) { From 6bab9d18bef3b7fccab2830d6dad78d0fb476ed8 Mon Sep 17 00:00:00 2001 From: jschendel Date: Tue, 29 Aug 2017 04:10:15 -0600 Subject: [PATCH 027/188] CLN: Move test_intersect_str_dates (#17366) Moves test_intersect_str_dates from tests/indexes/test_range.py to tests/indexes/test_base.py. 
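
The relocated test exercises generic object-dtype ``Index.intersection``
rather than anything ``RangeIndex``-specific, which is why it belongs in
test_base.py. Roughly, it asserts (mirroring the test body below):

    i1 = Index([datetime(2012, 2, 9), datetime(2012, 2, 22)], dtype=object)
    i2 = Index(['aa'], dtype=object)

    assert len(i2.intersection(i1)) == 0  # empty result, no comparison error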
---
 pandas/tests/indexes/test_base.py  | 9 +++++++++
 pandas/tests/indexes/test_range.py | 9 ---------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index ef36e4a91aa1c7..07e98c326bcaa6 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -663,6 +663,15 @@ def test_intersection(self):
         intersect = first.intersection(second)
         assert intersect.name is None

+    def test_intersect_str_dates(self):
+        dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]
+
+        i1 = Index(dt_dates, dtype=object)
+        i2 = Index(['aa'], dtype=object)
+        res = i2.intersection(i1)
+
+        assert len(res) == 0
+
     def test_union(self):
         first = self.strIndex[5:20]
         second = self.strIndex[:10]
diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py
index 566354da4870d0..5ecf467b57fc5c 100644
--- a/pandas/tests/indexes/test_range.py
+++ b/pandas/tests/indexes/test_range.py
@@ -639,15 +639,6 @@ def test_intersection(self):
         expected = RangeIndex(0, 0, 1)
         tm.assert_index_equal(result, expected)

-    def test_intersect_str_dates(self):
-        dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]
-
-        i1 = Index(dt_dates, dtype=object)
-        i2 = Index(['aa'], dtype=object)
-        res = i2.intersection(i1)
-
-        assert len(res) == 0
-
     def test_union_noncomparable(self):
         from datetime import datetime, timedelta
         # corner case, non-Int64Index

From 9a1dfca9182c86c90fffa26579844244cfd7cd7a Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Tue, 29 Aug 2017 05:52:51 -0700
Subject: [PATCH 028/188] BUG: Respect dups in reindexing CategoricalIndex
 (#17355)

Previously, when the indexer was identical to the index's elements, a
fast path returned a plain positional range. We should still return
duplicates when the indexer contains duplicates.

Closes gh-17323.
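
Roughly, the behavior being fixed (illustrative session, not part of the
tests below):

    ci = pd.CategoricalIndex(list('aabbca'))

    # before: because ci.equals(ci) is True, get_indexer took a fast path
    # and returned np.arange(len(ci)), silently ignoring the duplicates
    # after: the fast path also requires ci.is_unique, so duplicated
    # entries fall through to get_indexer_non_unique, matching the
    # behavior of a plain Index
    ci.get_indexer(list('aabbca'))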
--- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/indexes/category.py | 2 +- pandas/tests/indexes/test_category.py | 22 +++++++++++++++++----- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index fcadd26156b1d4..942e37a29f8d57 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -358,6 +358,7 @@ Indexing - Allow unicode empty strings as placeholders in multilevel columns in Python 2 (:issue:`17099`) - Bug in ``.iloc`` when used with inplace addition or assignment and an int indexer on a ``MultiIndex`` causing the wrong indexes to be read from and written to (:issue:`17148`) - Bug in ``.isin()`` in which checking membership in empty ``Series`` objects raised an error (:issue:`16991`) +- Bug in ``CategoricalIndex`` reindexing in which specified indices containing duplicates were not being respected (:issue:`17323`) I/O ^^^ diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index f22407308e0944..0681202289311e 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -487,7 +487,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): method = missing.clean_reindex_fill_method(method) target = ibase._ensure_index(target) - if self.equals(target): + if self.is_unique and self.equals(target): return np.arange(len(self), dtype='intp') if method == 'pad' or method == 'backfill': diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 64bd6df361aeb7..05d31af57b36c5 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -365,18 +365,18 @@ def test_astype(self): tm.assert_index_equal(result, expected) def test_reindex_base(self): - - # determined by cat ordering - idx = self.create_index() + # Determined by cat ordering. + idx = CategoricalIndex(list("cab"), categories=list("cab")) expected = np.arange(len(idx), dtype=np.intp) actual = idx.get_indexer(idx) tm.assert_numpy_array_equal(expected, actual) - with tm.assert_raises_regex(ValueError, 'Invalid fill method'): - idx.get_indexer(idx, method='invalid') + with tm.assert_raises_regex(ValueError, "Invalid fill method"): + idx.get_indexer(idx, method="invalid") def test_reindexing(self): + np.random.seed(123456789) ci = self.create_index() oidx = Index(np.array(ci)) @@ -388,6 +388,18 @@ def test_reindexing(self): actual = ci.get_indexer(finder) tm.assert_numpy_array_equal(expected, actual) + # see gh-17323 + # + # Even when indexer is equal to the + # members in the index, we should + # respect duplicates instead of taking + # the fast-track path. 
+ for finder in [list("aabbca"), list("aababca")]: + expected = oidx.get_indexer_non_unique(finder)[0] + + actual = ci.get_indexer(finder) + tm.assert_numpy_array_equal(expected, actual) + def test_reindex_dtype(self): c = CategoricalIndex(['a', 'b', 'c', 'a']) res, indexer = c.reindex(['a', 'c']) From e8a1765edf91ec4d087b46b90d5e54530550029b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 29 Aug 2017 06:23:38 -0700 Subject: [PATCH 029/188] Unify Index._dir_* with Series implementation (#17117) --- pandas/core/accessor.py | 35 +++++++++++++++++++++++++++++++++++ pandas/core/base.py | 22 +++------------------- pandas/core/generic.py | 5 +++-- pandas/core/indexes/base.py | 9 +++++++-- pandas/core/series.py | 17 +++-------------- pandas/core/strings.py | 20 ++------------------ 6 files changed, 53 insertions(+), 55 deletions(-) create mode 100644 pandas/core/accessor.py diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py new file mode 100644 index 00000000000000..9f8556d1e69616 --- /dev/null +++ b/pandas/core/accessor.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- +""" + +accessor.py contains base classes for implementing accessor properties +that can be mixed into or pinned onto other pandas classes. + +""" + + +class DirNamesMixin(object): + _accessors = frozenset([]) + + def _dir_deletions(self): + """ delete unwanted __dir__ for this object """ + return self._accessors + + def _dir_additions(self): + """ add addtional __dir__ for this object """ + rv = set() + for accessor in self._accessors: + try: + getattr(self, accessor) + rv.add(accessor) + except AttributeError: + pass + return rv + + def __dir__(self): + """ + Provide method name lookup and completion + Only provide 'public' methods + """ + rv = set(dir(type(self))) + rv = (rv - self._dir_deletions()) | self._dir_additions() + return sorted(rv) diff --git a/pandas/core/base.py b/pandas/core/base.py index a7c991dc8d2572..d60a8515dc920f 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -19,6 +19,7 @@ from pandas.util._decorators import (Appender, cache_readonly, deprecate_kwarg, Substitution) from pandas.core.common import AbstractMethodError +from pandas.core.accessor import DirNamesMixin _shared_docs = dict() _indexops_doc_kwargs = dict(klass='IndexOpsMixin', inplace='', @@ -73,7 +74,7 @@ def __repr__(self): return str(self) -class PandasObject(StringMixin): +class PandasObject(StringMixin, DirNamesMixin): """baseclass for various pandas objects""" @@ -92,23 +93,6 @@ def __unicode__(self): # Should be overwritten by base classes return object.__repr__(self) - def _dir_additions(self): - """ add addtional __dir__ for this object """ - return set() - - def _dir_deletions(self): - """ delete unwanted __dir__ for this object """ - return set() - - def __dir__(self): - """ - Provide method name lookup and completion - Only provide 'public' methods - """ - rv = set(dir(type(self))) - rv = (rv - self._dir_deletions()) | self._dir_additions() - return sorted(rv) - def _reset_cache(self, key=None): """ Reset cached properties. If ``key`` is passed, only clears that key. @@ -141,7 +125,7 @@ class NoNewAttributesMixin(object): Prevents additional attributes via xxx.attribute = "something" after a call to `self.__freeze()`. Mainly used to prevent the user from using - wrong attrirbutes on a accessor (`Series.cat/.str/.dt`). + wrong attributes on a accessor (`Series.cat/.str/.dt`). If you really want to add a new attribute at a later time, you need to use `object.__setattr__(self, key, value)`. 
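
The net effect of ``DirNamesMixin`` (illustrative sketch, not part of the
diff): an accessor name shows up in ``dir()`` and tab completion only when
accessing it actually succeeds.

    class Demo(DirNamesMixin):
        _accessors = frozenset(['str'])

        def __init__(self, has_str):
            if has_str:
                self.str = 'stub accessor'

    'str' in dir(Demo(True))    # True: getattr(self, 'str') succeeds
    'str' in dir(Demo(False))   # False: AttributeError drops it from dir()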
diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f8366c804e3e79..cdb08d8887e05b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -192,8 +192,9 @@ def __unicode__(self): def _dir_additions(self): """ add the string-like attributes from the info_axis """ - return set([c for c in self._info_axis - if isinstance(c, string_types) and isidentifier(c)]) + additions = set([c for c in self._info_axis + if isinstance(c, string_types) and isidentifier(c)]) + return super(NDFrame, self)._dir_additions().union(additions) @property def _constructor_sliced(self): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a21e6df3ffc93d..31cf1e48b85294 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -56,7 +56,7 @@ import pandas.core.sorting as sorting from pandas.io.formats.printing import pprint_thing from pandas.core.ops import _comp_method_OBJECT_ARRAY -from pandas.core.strings import StringAccessorMixin +from pandas.core import strings from pandas.core.config import get_option @@ -102,7 +102,7 @@ def _new_Index(cls, d): return cls.__new__(cls, **d) -class Index(IndexOpsMixin, StringAccessorMixin, PandasObject): +class Index(IndexOpsMixin, PandasObject): """ Immutable ndarray implementing an ordered, sliceable set. The basic object storing axis labels for all pandas objects @@ -155,6 +155,11 @@ class Index(IndexOpsMixin, StringAccessorMixin, PandasObject): _engine_type = libindex.ObjectEngine + _accessors = frozenset(['str']) + + # String Methods + str = base.AccessorProperty(strings.StringMethods) + def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, tupleize_cols=True, **kwargs): diff --git a/pandas/core/series.py b/pandas/core/series.py index 75dc3d6403650c..6905fc1aced742 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -114,8 +114,7 @@ def wrapper(self): # Series class -class Series(base.IndexOpsMixin, strings.StringAccessorMixin, - generic.NDFrame,): +class Series(base.IndexOpsMixin, generic.NDFrame): """ One-dimensional ndarray with axis labels (including time series). 
@@ -2923,18 +2922,8 @@ def to_period(self, freq=None, copy=True): # Categorical methods cat = base.AccessorProperty(CategoricalAccessor) - def _dir_deletions(self): - return self._accessors - - def _dir_additions(self): - rv = set() - for accessor in self._accessors: - try: - getattr(self, accessor) - rv.add(accessor) - except AttributeError: - pass - return rv + # String Methods + str = base.AccessorProperty(strings.StringMethods) # ---------------------------------------------------------------------- # Add plotting methods to Series diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 0b1db0277eee3f..2f95e510bba5ef 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -16,7 +16,7 @@ from pandas.core.algorithms import take_1d import pandas.compat as compat -from pandas.core.base import AccessorProperty, NoNewAttributesMixin +from pandas.core.base import NoNewAttributesMixin from pandas.util._decorators import Appender import re import pandas._libs.lib as lib @@ -1920,20 +1920,4 @@ def _make_accessor(cls, data): message = ("Can only use .str accessor with Index, not " "MultiIndex") raise AttributeError(message) - return StringMethods(data) - - -class StringAccessorMixin(object): - """ Mixin to add a `.str` acessor to the class.""" - - str = AccessorProperty(StringMethods) - - def _dir_additions(self): - return set() - - def _dir_deletions(self): - try: - getattr(self, 'str') - except AttributeError: - return set(['str']) - return set() + return cls(data) From 0618f9950ad72f6f30283bbcf44fcdcf5918756d Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Tue, 29 Aug 2017 19:03:17 +0200 Subject: [PATCH 030/188] BUG: make order of index from pd.concat deterministic (#17364) closes #17344 --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/common.py | 14 ++++++++++++++ pandas/core/indexes/api.py | 9 ++------- pandas/tests/reshape/test_concat.py | 13 ++++++++++++- 4 files changed, 29 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 942e37a29f8d57..a3673609147a6b 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -406,6 +406,7 @@ Reshaping - Bug in :func:`crosstab` where passing two ``Series`` with the same name raised a ``KeyError`` (:issue:`13279`) - :func:`Series.argmin`, :func:`Series.argmax`, and their counterparts on ``DataFrame`` and groupby objects work correctly with floating point data that contains infinite values (:issue:`13595`). - Bug in :func:`unique` where checking a tuple of strings raised a ``TypeError`` (:issue:`17108`) +- Bug in :func:`concat` where order of result index was unpredictable if it contained non-comparable elements (:issue:`17344`) Numeric ^^^^^^^ diff --git a/pandas/core/common.py b/pandas/core/common.py index 44cb36b8a32076..515a4010961205 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -629,3 +629,17 @@ def _random_state(state=None): else: raise ValueError("random_state must be an integer, a numpy " "RandomState, or None") + + +def _get_distinct_objs(objs): + """ + Return a list with distinct elements of "objs" (different ids). + Preserves order. 
+ """ + ids = set() + res = [] + for obj in objs: + if not id(obj) in ids: + ids.add(id(obj)) + res.append(obj) + return res diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index db73a6878258ad..323d50166e7b6f 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -23,8 +23,7 @@ 'PeriodIndex', 'DatetimeIndex', '_new_Index', 'NaT', '_ensure_index', '_get_na_value', '_get_combined_index', - '_get_objs_combined_axis', - '_get_distinct_indexes', '_union_indexes', + '_get_objs_combined_axis', '_union_indexes', '_get_consensus_names', '_all_indexes_same'] @@ -41,7 +40,7 @@ def _get_objs_combined_axis(objs, intersect=False, axis=0): def _get_combined_index(indexes, intersect=False): # TODO: handle index names! - indexes = _get_distinct_indexes(indexes) + indexes = com._get_distinct_objs(indexes) if len(indexes) == 0: return Index([]) if len(indexes) == 1: @@ -55,10 +54,6 @@ def _get_combined_index(indexes, intersect=False): return _ensure_index(union) -def _get_distinct_indexes(indexes): - return list(dict((id(x), x) for x in indexes).values()) - - def _union_indexes(indexes): if len(indexes) == 0: raise AssertionError('Must have at least 1 Index to union') diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 52cd18126859a1..6e646f9b294429 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -5,7 +5,7 @@ from numpy.random import randn from datetime import datetime -from pandas.compat import StringIO, iteritems +from pandas.compat import StringIO, iteritems, PY2 import pandas as pd from pandas import (DataFrame, concat, read_csv, isna, Series, date_range, @@ -1944,6 +1944,17 @@ def test_concat_categoricalindex(self): index=exp_idx) tm.assert_frame_equal(result, exp) + def test_concat_order(self): + # GH 17344 + dfs = [pd.DataFrame(index=range(3), columns=['a', 1, None])] + dfs += [pd.DataFrame(index=range(3), columns=[None, 1, 'a']) + for i in range(100)] + result = pd.concat(dfs).columns + expected = dfs[0].columns + if PY2: + expected = expected.sort_values() + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel]) @pytest.mark.parametrize('dt', np.sctypes['float']) From 0d676a3ccf1d7aa986416a7488b941496f936d98 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 29 Aug 2017 10:04:07 -0700 Subject: [PATCH 031/188] Fix typo that causes several NaT methods to have incorrect docstrings (#17327) --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/_libs/tslib.pyx | 7 ++++--- pandas/tests/scalar/test_nat.py | 5 +++++ 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index a3673609147a6b..33b7e128ef8bfc 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -425,3 +425,4 @@ Categorical Other ^^^^^ - Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`) +- Several ``NaT`` method docstrings (e.g. 
:func:`NaT.ctime`) were incorrect (:issue:`17327`) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index b5aca2e3ec3094..5dd30072fb7aa0 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # cython: profile=False import warnings @@ -3922,7 +3923,7 @@ for _method_name in _nat_methods: def f(*args, **kwargs): return NaT f.__name__ = func_name - f.__doc__ = _get_docstring(_method_name) + f.__doc__ = _get_docstring(func_name) return f setattr(NaTType, _method_name, _make_nat_func(_method_name)) @@ -3934,7 +3935,7 @@ for _method_name in _nan_methods: def f(*args, **kwargs): return np.nan f.__name__ = func_name - f.__doc__ = _get_docstring(_method_name) + f.__doc__ = _get_docstring(func_name) return f setattr(NaTType, _method_name, _make_nan_func(_method_name)) @@ -3952,7 +3953,7 @@ for _maybe_method_name in dir(NaTType): def f(*args, **kwargs): raise ValueError("NaTType does not support " + func_name) f.__name__ = func_name - f.__doc__ = _get_docstring(_method_name) + f.__doc__ = _get_docstring(func_name) return f setattr(NaTType, _maybe_method_name, diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 5f247cae1099b6..6f852f2b394e18 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -247,3 +247,8 @@ def test_nat_arithmetic_index(): tm.assert_index_equal(right + left, exp) tm.assert_index_equal(left - right, exp) tm.assert_index_equal(right - left, exp) + + +def test_nat_pinned_docstrings(): + # GH17327 + assert NaT.ctime.__doc__ == datetime.ctime.__doc__ From b9d48e48904b0e607c4d18738df50dec744b745f Mon Sep 17 00:00:00 2001 From: cbertinato Date: Wed, 30 Aug 2017 06:19:44 -0400 Subject: [PATCH 032/188] CLN: replace %s syntax with .format in io/formats/format.py (#17358) Progress toward issue #16130. Converted old string formatting to new string formatting in io/formats/format.py. 
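
As an illustration of the conversion pattern (example only, reusing names
that appear in the diff below; the snippet itself is not part of the patch):

    ncols, nalias = 3, 2  # hypothetical values, just for the example
    # before: positional C-style interpolation
    old = 'Writing %d cols but got %d aliases' % (ncols, nalias)
    # after: named fields with str.format
    new = ('Writing {ncols} cols but got {nalias} aliases'
           .format(ncols=ncols, nalias=nalias))
    assert old == new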
--- pandas/io/formats/format.py | 165 ++++++++++++++++++++---------------- 1 file changed, 93 insertions(+), 72 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 733fd3bd39b527..6a98497aa1bfef 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -47,6 +47,7 @@ import itertools import csv +from functools import partial common_docstring = """ Parameters @@ -109,7 +110,7 @@ def _get_footer(self): if self.length: if footer: footer += ', ' - footer += "Length: %d" % len(self.categorical) + footer += "Length: {length}".format(length=len(self.categorical)) level_info = self.categorical._repr_categories_info() @@ -135,7 +136,7 @@ def to_string(self): fmt_values = self._get_formatted_values() - result = ['%s' % i for i in fmt_values] + result = [u('{i}').format(i=i) for i in fmt_values] result = [i.strip() for i in result] result = u(', ').join(result) result = [u('[') + result + u(']')] @@ -191,7 +192,7 @@ def _get_footer(self): footer = u('') if getattr(self.series.index, 'freq', None) is not None: - footer += 'Freq: %s' % self.series.index.freqstr + footer += 'Freq: {freq}'.format(freq=self.series.index.freqstr) if self.name is not False and name is not None: if footer: @@ -199,20 +200,21 @@ def _get_footer(self): series_name = pprint_thing(name, escape_chars=('\t', '\r', '\n')) - footer += ("Name: %s" % series_name) if name is not None else "" + footer += ((u"Name: {sname}".format(sname=series_name)) + if name is not None else "") if (self.length is True or (self.length == 'truncate' and self.truncate_v)): if footer: footer += ', ' - footer += 'Length: %d' % len(self.series) + footer += 'Length: {length}'.format(length=len(self.series)) if self.dtype is not False and self.dtype is not None: name = getattr(self.tr_series.dtype, 'name', None) if name: if footer: footer += ', ' - footer += 'dtype: %s' % pprint_thing(name) + footer += u'dtype: {typ}'.format(typ=pprint_thing(name)) # level infos are added to the end and in a new line, like it is done # for Categoricals @@ -509,8 +511,10 @@ def _to_str_columns(self): else: if is_list_like(self.header): if len(self.header) != len(self.columns): - raise ValueError(('Writing %d cols but got %d aliases' - % (len(self.columns), len(self.header)))) + raise ValueError(('Writing {ncols} cols but got {nalias} ' + 'aliases' + .format(ncols=len(self.columns), + nalias=len(self.header)))) str_columns = [[label] for label in self.header] else: str_columns = self._get_formatted_column_labels(frame) @@ -578,10 +582,10 @@ def to_string(self): frame = self.frame if len(frame.columns) == 0 or len(frame.index) == 0: - info_line = (u('Empty %s\nColumns: %s\nIndex: %s') % - (type(self.frame).__name__, - pprint_thing(frame.columns), - pprint_thing(frame.index))) + info_line = (u('Empty {name}\nColumns: {col}\nIndex: {idx}') + .format(name=type(self.frame).__name__, + col=pprint_thing(frame.columns), + idx=pprint_thing(frame.index))) text = info_line else: @@ -630,8 +634,8 @@ def to_string(self): self.buf.writelines(text) if self.should_show_dimensions: - self.buf.write("\n\n[%d rows x %d columns]" % - (len(frame), len(frame.columns))) + self.buf.write("\n\n[{nrows} rows x {ncols} columns]" + .format(nrows=len(frame), ncols=len(frame.columns))) def _join_multiline(self, *strcols): lwidth = self.line_width @@ -805,7 +809,8 @@ def _get_formatted_index(self, frame): # empty space for columns if show_col_names: - col_header = ['%s' % x for x in self._get_column_name_list()] + col_header = ['{x}'.format(x=x) + for 
x in self._get_column_name_list()] else: col_header = [''] * columns.nlevels @@ -861,9 +866,10 @@ def write_result(self, buf): # string representation of the columns if len(self.frame.columns) == 0 or len(self.frame.index) == 0: - info_line = (u('Empty %s\nColumns: %s\nIndex: %s') % - (type(self.frame).__name__, self.frame.columns, - self.frame.index)) + info_line = (u('Empty {name}\nColumns: {col}\nIndex: {idx}') + .format(name=type(self.frame).__name__, + col=self.frame.columns, + idx=self.frame.index)) strcols = [[info_line]] else: strcols = self.fmt._to_str_columns() @@ -906,14 +912,16 @@ def get_col_type(dtype): column_format = index_format + column_format elif not isinstance(column_format, compat.string_types): # pragma: no cover - raise AssertionError('column_format must be str or unicode, not %s' - % type(column_format)) + raise AssertionError('column_format must be str or unicode, ' + 'not {typ}'.format(typ=type(column_format))) if not self.longtable: - buf.write('\\begin{tabular}{%s}\n' % column_format) + buf.write('\\begin{{tabular}}{{{fmt}}}\n' + .format(fmt=column_format)) buf.write('\\toprule\n') else: - buf.write('\\begin{longtable}{%s}\n' % column_format) + buf.write('\\begin{{longtable}}{{{fmt}}}\n' + .format(fmt=column_format)) buf.write('\\toprule\n') ilevels = self.frame.index.nlevels @@ -948,7 +956,7 @@ def get_col_type(dtype): crow = [x if x else '{}' for x in row] if self.bold_rows and self.fmt.index: # bold row labels - crow = ['\\textbf{%s}' % x + crow = ['\\textbf{{{x}}}'.format(x=x) if j < ilevels and x.strip() not in ['', '{}'] else x for j, x in enumerate(crow)] if i < clevels and self.fmt.header and self.multicolumn: @@ -986,9 +994,9 @@ def _format_multicolumn(self, row, ilevels): def append_col(): # write multicolumn if needed if ncol > 1: - row2.append('\\multicolumn{{{0:d}}}{{{1:s}}}{{{2:s}}}' - .format(ncol, self.multicolumn_format, - coltext.strip())) + row2.append('\\multicolumn{{{ncol:d}}}{{{fmt:s}}}{{{txt:s}}}' + .format(ncol=ncol, fmt=self.multicolumn_format, + txt=coltext.strip())) # don't modify where not needed else: row2.append(coltext) @@ -1027,8 +1035,8 @@ def _format_multirow(self, row, ilevels, i, rows): break if nrow > 1: # overwrite non-multirow entry - row[j] = '\\multirow{{{0:d}}}{{*}}{{{1:s}}}'.format( - nrow, row[j].strip()) + row[j] = '\\multirow{{{nrow:d}}}{{*}}{{{row:s}}}'.format( + nrow=nrow, row=row[j].strip()) # save when to end the current block with \cline self.clinebuf.append([i + nrow - 1, j + 1]) return row @@ -1039,7 +1047,8 @@ def _print_cline(self, buf, i, icol): """ for cl in self.clinebuf: if cl[0] == i: - buf.write('\cline{{{0:d}-{1:d}}}\n'.format(cl[1], icol)) + buf.write('\cline{{{cl:d}-{icol:d}}}\n' + .format(cl=cl[1], icol=icol)) # remove entries that have been written to buffer self.clinebuf = [x for x in self.clinebuf if x[0] != i] @@ -1076,7 +1085,8 @@ def write(self, s, indent=0): def write_th(self, s, indent=0, tags=None): if self.fmt.col_space is not None and self.fmt.col_space > 0: tags = (tags or "") - tags += 'style="min-width: %s;"' % self.fmt.col_space + tags += ('style="min-width: {colspace};"' + .format(colspace=self.fmt.col_space)) return self._write_cell(s, kind='th', indent=indent, tags=tags) @@ -1085,9 +1095,9 @@ def write_td(self, s, indent=0, tags=None): def _write_cell(self, s, kind='td', indent=0, tags=None): if tags is not None: - start_tag = '<%s %s>' % (kind, tags) + start_tag = '<{kind} {tags}>'.format(kind=kind, tags=tags) else: - start_tag = '<%s>' % kind + start_tag = 
'<{kind}>'.format(kind=kind)
 
         if self.escape:
             # escape & first to prevent double escaping of &
@@ -1096,7 +1106,8 @@ def _write_cell(self, s, kind='td', indent=0, tags=None):
         else:
             esc = {}
         rs = pprint_thing(s, escape_chars=esc).strip()
-        self.write('%s%s</%s>' % (start_tag, rs, kind), indent)
+        self.write(u'{start}{rs}</{kind}>'
+                   .format(start=start_tag, rs=rs, kind=kind), indent)
 
     def write_tr(self, line, indent=0, indent_delta=4, header=False,
                  align=None, tags=None, nindex_levels=0):
@@ -1106,7 +1117,8 @@ def write_tr(self, line, indent=0, indent_delta=4, header=False,
         if align is None:
             self.write('<tr>', indent)
         else:
-            self.write('<tr style="text-align: %s;">' % align, indent)
+            self.write('<tr style="text-align: {align};">'
+                       .format(align=align), indent)
         indent += indent_delta
 
         for i, s in enumerate(line):
@@ -1146,8 +1158,8 @@ def write_result(self, buf):
         if isinstance(self.classes, str):
             self.classes = self.classes.split()
         if not isinstance(self.classes, (list, tuple)):
-            raise AssertionError('classes must be list or tuple, '
-                                 'not %s' % type(self.classes))
+            raise AssertionError('classes must be list or tuple, not {typ}'
+                                 .format(typ=type(self.classes)))
         _classes.extend(self.classes)
 
         if self.notebook:
@@ -1159,12 +1171,11 @@ def write_result(self, buf):
             except (ImportError, AttributeError):
                 pass
 
-        self.write('<div{0}>'.format(div_style))
+        self.write('<div{style}>'.format(style=div_style))
 
         self.write_style()
 
-        self.write('<table border="%s" class="dataframe %s">' % (self.border,
-                                                                 ' '.join(_classes)),
-                   indent)
+        self.write('<table border="{border}" class="dataframe {cls}">'
+                   .format(border=self.border, cls=' '.join(_classes)), indent)
 
         indent += self.indent_delta
         indent = self._write_header(indent)
@@ -1173,8 +1184,10 @@ def write_result(self, buf):
         self.write('</table>', indent)
 
         if self.should_show_dimensions:
             by = chr(215) if compat.PY3 else unichr(215)  # ×
-            self.write(u('<p>%d rows %s %d columns</p>') %
-                       (len(frame), by, len(frame.columns)))
+            self.write(u('<p>{rows} rows {by} {cols} columns</p>')
+                       .format(rows=len(frame),
+                               by=by,
+                               cols=len(frame.columns)))
 
         if self.notebook:
             self.write('</div>')
@@ -1199,7 +1212,7 @@ def _column_header():
                 row.append(single_column_table(self.columns.names))
             else:
                 row.append('')
-            style = "text-align: %s;" % self.fmt.justify
+            style = "text-align: {just};".format(just=self.fmt.justify)
             row.extend([single_column_table(c, self.fmt.justify, style)
                         for c in self.columns])
         else:
@@ -1214,7 +1227,7 @@ def _column_header():
             indent += self.indent_delta
 
             if isinstance(self.columns, MultiIndex):
-                template = 'colspan="%d" halign="left"'
+                template = 'colspan="{span:d}" halign="left"'
 
                 if self.fmt.sparsify:
                     # GH3547
@@ -1282,7 +1295,7 @@ def _column_header():
                 for i, v in enumerate(values):
                     if i in records:
                         if records[i] > 1:
-                            tags[j] = template % records[i]
+                            tags[j] = template.format(span=records[i])
                     else:
                         continue
                     j += 1
@@ -1372,7 +1385,7 @@ def _write_regular_rows(self, fmt_values, indent):
                           nindex_levels=1)
 
     def _write_hierarchical_rows(self, fmt_values, indent):
-        template = 'rowspan="%d" valign="top"'
+        template = 'rowspan="{span}" valign="top"'
 
         truncate_h = self.fmt.truncate_h
         truncate_v = self.fmt.truncate_v
@@ -1447,7 +1460,7 @@ def _write_hierarchical_rows(self, fmt_values, indent):
                     for records, v in zip(level_lengths, idx_values[i]):
                         if i in records:
                             if records[i] > 1:
-                                tags[j] = template % records[i]
+                                tags[j] = template.format(span=records[i])
                             else:
                                 sparse_offset += 1
                                 continue
@@ -1615,8 +1628,9 @@ def _save_header(self):
             return
         if has_aliases:
             if len(header) != len(cols):
-                raise ValueError(('Writing %d cols but got %d aliases'
-                                  % (len(cols), len(header))))
+                raise ValueError(('Writing {ncols} cols but got {nalias} '
+                                  'aliases'.format(ncols=len(cols),
+                                                   nalias=len(header))))
             else:
                 write_cols = header
         else:
@@ -1790,8 +1804,9 @@ def _format_strings(self):
         if self.float_format is None:
             float_format = get_option("display.float_format")
             if float_format is None:
-                fmt_str = '%% .%dg' % get_option("display.precision")
-                float_format = lambda x: fmt_str % x
+                fmt_str = ('{{x: .{prec:d}g}}'
+                           .format(prec=get_option("display.precision")))
+                float_format = lambda x: fmt_str.format(x=x)
         else:
             float_format = self.float_format
 
@@ -1807,10 +1822,10 @@ def _format(x):
                 return 'NaT'
             return self.na_rep
         elif isinstance(x, PandasObject):
-            return '%s' % x
+            return u'{x}'.format(x=x)
         else:
             # object dtype
-            return '%s' % formatter(x)
+            return u'{x}'.format(x=formatter(x))
 
         vals = self.values
         if isinstance(vals, Index):
@@ -1824,11 +1839,11 @@ def _format(x):
         fmt_values = []
         for i, v in enumerate(vals):
             if not is_float_type[i] and leading_space:
-                fmt_values.append(' %s' % _format(v))
+                fmt_values.append(u' {v}'.format(v=_format(v)))
             elif is_float_type[i]:
                 fmt_values.append(float_format(v))
             else:
-                fmt_values.append(' %s' % _format(v))
+                fmt_values.append(u' {v}'.format(v=_format(v)))
 
         return fmt_values
 
@@ -1864,7 +1879,7 @@ def _value_formatter(self, float_format=None, threshold=None):
         # because str(0.0) = '0.0' while '%g' % 0.0 = '0'
         if float_format:
             def base_formatter(v):
-                return (float_format % v) if notna(v) else self.na_rep
+                return float_format(value=v) if notna(v) else self.na_rep
         else:
             def base_formatter(v):
                 return str(v) if notna(v) else self.na_rep
@@ -1925,10 +1940,14 @@ def format_values_with(float_format):
 
         # There is a special default string when we are fixed-width
         # The default is otherwise to use str instead of a formatting string
-        if self.float_format is None and self.fixed_width:
-            float_format = '%% .%df' % self.digits
+        if self.float_format is None:
+            if self.fixed_width:
+                float_format = partial('{value: .{digits:d}f}'.format,
+                                       digits=self.digits)
+            else:
+                float_format = self.float_format
         else:
-            float_format = self.float_format
+            float_format = lambda value: self.float_format % value
 
         formatted_values = format_values_with(float_format)
 
@@ -1955,7 +1974,8 @@ def format_values_with(float_format):
                              (abs_vals > 0)).any()
 
         if has_small_values or (too_long and has_large_values):
-            float_format = '%% .%de' % self.digits
+            float_format = partial('{value: .{digits:d}e}'.format,
+                                   digits=self.digits)
             formatted_values = format_values_with(float_format)
 
         return formatted_values
 
@@ -1971,7 +1991,7 @@ def _format_strings(self):
 class IntArrayFormatter(GenericArrayFormatter):
 
     def _format_strings(self):
-        formatter = self.formatter or (lambda x: '% d' % x)
+        formatter = self.formatter or (lambda x: '{x: d}'.format(x=x))
         fmt_values = [formatter(x) for x in self.values]
         return fmt_values
 
@@ -2023,7 +2043,7 @@ def _format_strings(self):
             # periods may contains different freq
             values = Index(self.values, dtype='object').to_native_types()
 
-        formatter = self.formatter or (lambda x: '%s' % x)
+        formatter = self.formatter or (lambda x: '{x}'.format(x=x))
         fmt_values = [formatter(x) for x in values]
         return fmt_values
 
@@ -2223,7 +2243,7 @@ def _formatter(x):
             x = Timedelta(x)
         result = x._repr_base(format=format)
         if box:
-            result = "'{0}'".format(result)
+            result = "'{res}'".format(res=result)
         return result
 
     return _formatter
 
@@ -2278,12 +2298,12 @@ def _cond(values):
 
 def single_column_table(column, align=None, style=None):
     table = '<table'
     if align is not None:
-        table += (' align="%s"' % align)
+        table += (' align="{align}"'.format(align=align))
     if style is not None:
-        table += (' style="%s"' % style)
+        table += (' style="{style}"'.format(style=style))
     table += '><tbody>'
     for i in column:
-        table += ('<tr><td>%s</td></tr>' % str(i))
+        table += ('<tr><td>{i!s}</td></tr>'.format(i=i))
     table += '</tbody></table>'
     return table
 
 
@@ -2291,7 +2311,7 @@ def single_column_table(column, align=None, style=None):
 def single_row_table(row):  # pragma: no cover
     table = '<table><tbody><tr>'
     for i in row:
-        table += ('<td>%s</td>' % str(i))
+        table += ('<td>{i!s}</td>'.format(i=i))
     table += '</tr></tbody></table>'
     return table
 
@@ -2385,18 +2405,19 @@ def __call__(self, num):
             prefix = self.ENG_PREFIXES[int_pow10]
         else:
             if int_pow10 < 0:
-                prefix = 'E-%02d' % (-int_pow10)
+                prefix = 'E-{pow10:02d}'.format(pow10=-int_pow10)
             else:
-                prefix = 'E+%02d' % int_pow10
+                prefix = 'E+{pow10:02d}'.format(pow10=int_pow10)
 
         mant = sign * dnum / (10**pow10)
         if self.accuracy is None:  # pragma: no cover
-            format_str = u("% g%s")
+            format_str = u("{mant: g}{prefix}")
        else:
-            format_str = (u("%% .%if%%s") % self.accuracy)
+            format_str = (u("{{mant: .{acc:d}f}}{{prefix}}")
+                          .format(acc=self.accuracy))
 
-        formatted = format_str % (mant, prefix)
+        formatted = format_str.format(mant=mant, prefix=prefix)
 
         return formatted  # .strip()

From 77bfe21c7229e724d01721bb84861283baf7e9d3 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Wed, 30 Aug 2017 05:50:04 -0500
Subject: [PATCH 033/188] PKG: Added pyproject.toml for PEP 518 (#16745)

Declaring build-time requirements: https://www.python.org/dev/peps/pep-0518/
---
 MANIFEST.in                     | 1 +
 doc/source/whatsnew/v0.21.0.txt | 1 +
 pyproject.toml                  | 9 +++++++++
 3 files changed, 11 insertions(+)
 create mode 100644 pyproject.toml

diff --git a/MANIFEST.in b/MANIFEST.in
index 8bd83a7d569484..1a6b831c1b9752 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -3,6 +3,7 @@ include LICENSE
 include RELEASE.md
 include README.rst
 include setup.py
+include pyproject.toml
 
 graft doc
 prune doc/build
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index 33b7e128ef8bfc..014f251ffb90ab 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -112,6 +112,7 @@ Other Enhancements
 ^^^^^^^^^^^^^^^^^^
 
 - The ``validate`` argument for :func:`merge` function now checks whether a merge is one-to-one, one-to-many, many-to-one, or many-to-many. If a merge is found to not be an example of specified merge type, an exception of type ``MergeError`` will be raised. For more, see :ref:`here <merging.validation>` (:issue:`16270`)
+- Added support for `PEP 518 <https://www.python.org/dev/peps/pep-0518/>`_ to the build system (:issue:`16745`)
 - :func:`Series.to_dict` and :func:`DataFrame.to_dict` now support an ``into`` keyword which allows you to specify the ``collections.Mapping`` subclass that you would like returned.  The default is ``dict``, which is backwards compatible. (:issue:`16122`)
 - :func:`RangeIndex.append` now returns a ``RangeIndex`` object when possible (:issue:`16212`)
 - :func:`Series.rename_axis` and :func:`DataFrame.rename_axis` with ``inplace=True`` now return ``None`` while renaming the axis inplace.
(:issue:`15704`) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000000000..f0d57d1d808a25 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,9 @@ +[build-system] +requires = [ + "wheel", + "setuptools", + "Cython", # required for VCS build, optional for released source + "numpy==1.9.3; python_version=='3.5'", + "numpy==1.12.1; python_version=='3.6'", + "numpy==1.13.1; python_version>='3.7'", +] From ad7d6fc0248edaf098537e5674dcc0c9dd059491 Mon Sep 17 00:00:00 2001 From: iulia Date: Wed, 30 Aug 2017 19:39:45 +0300 Subject: [PATCH 034/188] DOC: Update Overview page in documentation (#17368) * Update Overview page in documentation * DOC Revise Overview page * DOC Make further revisions in Overview webpage * Update overview.rst Remove references to Panel --- doc/source/overview.rst | 71 ++++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 33 deletions(-) diff --git a/doc/source/overview.rst b/doc/source/overview.rst index 92caeec3191698..00a71603e12612 100644 --- a/doc/source/overview.rst +++ b/doc/source/overview.rst @@ -6,7 +6,11 @@ Package overview **************** -:mod:`pandas` consists of the following things +:mod:`pandas` is an open source, BSD-licensed library providing high-performance, +easy-to-use data structures and data analysis tools for the `Python `__ +programming language. + +:mod:`pandas` consists of the following elements * A set of labeled array data structures, the primary of which are Series and DataFrame @@ -21,27 +25,23 @@ Package overview * Memory-efficient "sparse" versions of the standard data structures for storing data that is mostly missing or mostly constant (some fixed value) * Moving window statistics (rolling mean, rolling standard deviation, etc.) - * Static and moving window linear and `panel regression - `__ -Data structures at a glance ---------------------------- +Data Structures +--------------- .. csv-table:: :header: "Dimensions", "Name", "Description" :widths: 15, 20, 50 - 1, Series, "1D labeled homogeneously-typed array" - 2, DataFrame, "General 2D labeled, size-mutable tabular structure with - potentially heterogeneously-typed columns" - 3, Panel, "General 3D labeled, also size-mutable array" + 1, "Series", "1D labeled homogeneously-typed array" + 2, "DataFrame", "General 2D labeled, size-mutable tabular structure with potentially heterogeneously-typed column" -Why more than 1 data structure? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Why more than one data structure? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The best way to think about the pandas data structures is as flexible containers for lower dimensional data. For example, DataFrame is a container -for Series, and Panel is a container for DataFrame objects. We would like to be +for Series, and Series is a container for scalars. We would like to be able to insert and remove objects from these containers in a dictionary-like fashion. @@ -85,36 +85,41 @@ The first stop for pandas issues and ideas is the `Github Issue Tracker pandas community experts can answer through `Stack Overflow `__. -Longer discussions occur on the `developer mailing list -`__, and commercial support -inquiries for Lambda Foundry should be sent to: support@lambdafoundry.com +Community +--------- -Credits -------- +pandas is actively supported today by a community of like-minded individuals around +the world who contribute their valuable time and energy to help make open source +pandas possible. Thanks to `all of our contributors `__. 
+ +If you're interested in contributing, please +visit `Contributing to pandas webpage `__. -pandas development began at `AQR Capital Management `__ in -April 2008. It was open-sourced at the end of 2009. AQR continued to provide -resources for development through the end of 2011, and continues to contribute -bug reports today. +pandas is a `NUMFocus `__ sponsored project. +This will help ensure the success of development of pandas as a world-class open-source +project, and makes it possible to `donate `__ to the project. -Since January 2012, `Lambda Foundry `__, has -been providing development resources, as well as commercial support, -training, and consulting for pandas. +Project Governance +------------------ -pandas is only made possible by a group of people around the world like you -who have contributed new code, bug reports, fixes, comments and ideas. A -complete list can be found `on Github `__. +The governance process that pandas project has used informally since its inception in 2008 is formalized in `Project Governance documents `__ . +The documents clarify how decisions are made and how the various elements of our community interact, including the relationship between open source collaborative development and work that may be funded by for-profit or non-profit entities. + +Wes McKinney is the Benevolent Dictator for Life (BDFL). Development Team ----------------- +----------------- + +The list of the Core Team members and more detailed information can be found on the `people’s page `__ of the governance repo. + -pandas is a part of the PyData project. The PyData Development Team is a -collection of developers focused on the improvement of Python's data -libraries. The core team that coordinates development can be found on `Github -`__. If you're interested in contributing, please -visit the `project website `__. +Institutional Partners +---------------------- + +The information about current institutional partners can be found on `pandas website page `__ License ------- .. literalinclude:: ../../LICENSE + From 64c8a8d6fecacb796da8265ace870a4fcab98092 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 30 Aug 2017 15:30:53 -0500 Subject: [PATCH 035/188] API: Have MultiIndex consturctors always return a MI (#17236) * API: Have MultiIndex constructors return MI This removes the special case for MultiIndex constructors returning an Index if all the levels are length-1. Now this will return a MultiIndex with a single level. This is a backwards incompatabile change, with no clear method for deprecation, so we're making a clean break. Closes #17178 * fixup! 
API: Have MultiIndex constructors return MI * Update for comments --- doc/source/whatsnew/v0.21.0.txt | 24 +++++++++++ pandas/core/frame.py | 11 ++--- pandas/core/indexes/api.py | 12 ++++-- pandas/core/indexes/base.py | 69 ++++++++++++++++++++++++++++++ pandas/core/indexes/multi.py | 10 ----- pandas/core/reshape/reshape.py | 21 ++++++--- pandas/core/sparse/scipy_sparse.py | 6 ++- pandas/core/strings.py | 7 ++- pandas/io/parsers.py | 13 +++--- pandas/tests/indexes/test_base.py | 18 +++++++- pandas/tests/indexes/test_multi.py | 20 ++++----- pandas/util/testing.py | 4 ++ 12 files changed, 170 insertions(+), 45 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 014f251ffb90ab..273cbd8357f853 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -274,6 +274,30 @@ named ``.isna()`` and ``.notna()``, these are included for classes ``Categorical The configuration option ``pd.options.mode.use_inf_as_null`` is deprecated, and ``pd.options.mode.use_inf_as_na`` is added as a replacement. +.. _whatsnew_210.api.multiindex_single: + +MultiIndex Constructor with a Single Level +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``MultiIndex`` constructors no longer squeeze a MultiIndex with all +length-one levels down to a regular ``Index``. This affects all the +``MultiIndex`` constructors. (:issue:`17178`) + +Previous behavior: + +.. code-block:: ipython + + In [2]: pd.MultiIndex.from_tuples([('a',), ('b',)]) + Out[2]: Index(['a', 'b'], dtype='object') + +Length 1 levels are no longer special-cased. They behave exactly as if you had +length 2+ levels, so a :class:`MultiIndex` is always returned from all of the +``MultiIndex`` constructors: + +.. ipython:: python + + pd.MultiIndex.from_tuples([('a',), ('b',)]) + .. 
_whatsnew_0210.api: Other API Changes diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b5b3df64d24c0b..5991ec825c8417 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -67,7 +67,8 @@ _dict_compat, standardize_mapping) from pandas.core.generic import NDFrame, _shared_docs -from pandas.core.index import Index, MultiIndex, _ensure_index +from pandas.core.index import (Index, MultiIndex, _ensure_index, + _ensure_index_from_sequences) from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable, check_bool_indexer) from pandas.core.internals import (BlockManager, @@ -1155,9 +1156,9 @@ def from_records(cls, data, index=None, exclude=None, columns=None, else: try: to_remove = [arr_columns.get_loc(field) for field in index] - - result_index = MultiIndex.from_arrays( - [arrays[i] for i in to_remove], names=index) + index_data = [arrays[i] for i in to_remove] + result_index = _ensure_index_from_sequences(index_data, + names=index) exclude.update(index) except Exception: @@ -3000,7 +3001,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False, to_remove.append(col) arrays.append(level) - index = MultiIndex.from_arrays(arrays, names=names) + index = _ensure_index_from_sequences(arrays, names) if verify_integrity and not index.is_unique: duplicates = index.get_duplicates() diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 323d50166e7b6f..d20a0b0a2c73df 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -1,6 +1,9 @@ -from pandas.core.indexes.base import (Index, _new_Index, # noqa - _ensure_index, _get_na_value, - InvalidIndexError) +from pandas.core.indexes.base import (Index, + _new_Index, + _ensure_index, + _ensure_index_from_sequences, + _get_na_value, + InvalidIndexError) # noqa from pandas.core.indexes.category import CategoricalIndex # noqa from pandas.core.indexes.multi import MultiIndex # noqa from pandas.core.indexes.interval import IntervalIndex # noqa @@ -22,7 +25,8 @@ 'InvalidIndexError', 'TimedeltaIndex', 'PeriodIndex', 'DatetimeIndex', '_new_Index', 'NaT', - '_ensure_index', '_get_na_value', '_get_combined_index', + '_ensure_index', '_ensure_index_from_sequences', '_get_na_value', + '_get_combined_index', '_get_objs_combined_axis', '_union_indexes', '_get_consensus_names', '_all_indexes_same'] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 31cf1e48b85294..6a30eaefaaae76 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4012,7 +4012,76 @@ def invalid_op(self, other=None): Index._add_comparison_methods() +def _ensure_index_from_sequences(sequences, names=None): + """Construct an index from sequences of data. + + A single sequence returns an Index. Many sequences returns a + MultiIndex. 
+ + Parameters + ---------- + sequences : sequence of sequences + names : sequence of str + + Returns + ------- + index : Index or MultiIndex + + Examples + -------- + >>> _ensure_index_from_sequences([[1, 2, 3]], names=['name']) + Int64Index([1, 2, 3], dtype='int64', name='name') + + >>> _ensure_index_from_sequences([['a', 'a'], ['a', 'b']], + names=['L1', 'L2']) + MultiIndex(levels=[['a'], ['a', 'b']], + labels=[[0, 0], [0, 1]], + names=['L1', 'L2']) + + See Also + -------- + _ensure_index + """ + from .multi import MultiIndex + + if len(sequences) == 1: + if names is not None: + names = names[0] + return Index(sequences[0], name=names) + else: + return MultiIndex.from_arrays(sequences, names=names) + + def _ensure_index(index_like, copy=False): + """ + Ensure that we have an index from some index-like object + + Parameters + ---------- + index : sequence + An Index or other sequence + copy : bool + + Returns + ------- + index : Index or MultiIndex + + Examples + -------- + >>> _ensure_index(['a', 'b']) + Index(['a', 'b'], dtype='object') + + >>> _ensure_index([('a', 'a'), ('b', 'c')]) + Index([('a', 'a'), ('b', 'c')], dtype='object') + + >>> _ensure_index([['a', 'a'], ['b', 'c']]) + MultiIndex(levels=[['a'], ['b', 'c']], + labels=[[0, 0], [0, 1]]) + + See Also + -------- + _ensure_index_from_sequences + """ if isinstance(index_like, Index): if copy: index_like = index_like.copy() diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ea45b4700172f0..d7d5b6d128a2c1 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -91,12 +91,6 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None, raise ValueError('Length of levels and labels must be the same.') if len(levels) == 0: raise ValueError('Must pass non-zero number of levels/labels') - if len(levels) == 1: - if names: - name = names[0] - else: - name = None - return Index(levels[0], name=name, copy=True).take(labels[0]) result = object.__new__(MultiIndex) @@ -1084,10 +1078,6 @@ def from_arrays(cls, arrays, sortorder=None, names=None): MultiIndex.from_product : Make a MultiIndex from cartesian product of iterables """ - if len(arrays) == 1: - name = None if names is None else names[0] - return Index(arrays[0], name=name) - # Check if lengths of all arrays are equal or not, # raise ValueError, if not for i in range(1, len(arrays)): diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 455da9246783c1..b4abba8026b35b 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -31,7 +31,7 @@ from pandas.core.frame import _shared_docs from pandas.util._decorators import Appender -from pandas.core.index import MultiIndex, _get_na_value +from pandas.core.index import Index, MultiIndex, _get_na_value class _Unstacker(object): @@ -311,10 +311,14 @@ def _unstack_multiple(data, clocs): recons_labels = decons_obs_group_ids(comp_ids, obs_ids, shape, clabels, xnull=False) - dummy_index = MultiIndex(levels=rlevels + [obs_ids], - labels=rlabels + [comp_ids], - names=rnames + ['__placeholder__'], - verify_integrity=False) + if rlocs == []: + # Everything is in clocs, so the dummy df has a regular index + dummy_index = Index(obs_ids, name='__placeholder__') + else: + dummy_index = MultiIndex(levels=rlevels + [obs_ids], + labels=rlabels + [comp_ids], + names=rnames + ['__placeholder__'], + verify_integrity=False) if isinstance(data, Series): dummy = data.copy() @@ -446,7 +450,12 @@ def _slow_pivot(index, columns, values): def unstack(obj, 
level, fill_value=None): if isinstance(level, (tuple, list)): - return _unstack_multiple(obj, level) + if len(level) != 1: + # _unstack_multiple only handles MultiIndexes, + # and isn't needed for a single level + return _unstack_multiple(obj, level) + else: + level = level[0] if isinstance(obj, DataFrame): if isinstance(obj.index, MultiIndex): diff --git a/pandas/core/sparse/scipy_sparse.py b/pandas/core/sparse/scipy_sparse.py index ea108e3e899352..d2b9583d8efe5c 100644 --- a/pandas/core/sparse/scipy_sparse.py +++ b/pandas/core/sparse/scipy_sparse.py @@ -71,7 +71,11 @@ def robust_get_level_values(i): labels_to_i = Series(labels_to_i) if len(subset) > 1: labels_to_i.index = MultiIndex.from_tuples(labels_to_i.index) - labels_to_i.index.names = [index.names[i] for i in subset] + labels_to_i.index.names = [index.names[i] for i in subset] + else: + labels_to_i.index = Index(x[0] for x in labels_to_i.index) + labels_to_i.index.name = index.names[subset[0]] + labels_to_i.name = 'value' return (labels_to_i) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 2f95e510bba5ef..48bc2ee05dd680 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1452,7 +1452,12 @@ def cons_row(x): if expand: result = list(result) - return MultiIndex.from_tuples(result, names=name) + out = MultiIndex.from_tuples(result, names=name) + if out.nlevels == 1: + # We had all tuples of length-one, which are + # better represented as a regular Index. + out = out.get_level_values(0) + return out else: return Index(result, name=name) else: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a9821be3fa5e2d..8b1a921536a1dd 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -23,7 +23,8 @@ is_scalar, is_categorical_dtype) from pandas.core.dtypes.missing import isna from pandas.core.dtypes.cast import astype_nansafe -from pandas.core.index import Index, MultiIndex, RangeIndex +from pandas.core.index import (Index, MultiIndex, RangeIndex, + _ensure_index_from_sequences) from pandas.core.series import Series from pandas.core.frame import DataFrame from pandas.core.categorical import Categorical @@ -1444,7 +1445,8 @@ def _agg_index(self, index, try_parse_dates=True): arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues) arrays.append(arr) - index = MultiIndex.from_arrays(arrays, names=self.index_names) + names = self.index_names + index = _ensure_index_from_sequences(arrays, names) return index @@ -1808,7 +1810,7 @@ def read(self, nrows=None): try_parse_dates=True) arrays.append(values) - index = MultiIndex.from_arrays(arrays) + index = _ensure_index_from_sequences(arrays) if self.usecols is not None: names = self._filter_usecols(names) @@ -3138,9 +3140,8 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None): if index_col is None or index_col is False: index = Index([]) else: - index = [Series([], dtype=dtype[index_name]) - for index_name in index_names] - index = MultiIndex.from_arrays(index, names=index_names) + data = [Series([], dtype=dtype[name]) for name in index_names] + index = _ensure_index_from_sequences(data, names=index_names) index_col.sort() for i, n in enumerate(index_col): columns.pop(n - i) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 07e98c326bcaa6..aa32e75ba0d585 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -17,7 +17,7 @@ DataFrame, Float64Index, Int64Index, CategoricalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex, isna) -from pandas.core.index 
import _get_combined_index +from pandas.core.index import _get_combined_index, _ensure_index_from_sequences from pandas.util.testing import assert_almost_equal from pandas.compat.numpy import np_datetime64_compat @@ -2112,3 +2112,19 @@ def test_intersect_str_dates(self): res = i2.intersection(i1) assert len(res) == 0 + + +class TestIndexUtils(object): + + @pytest.mark.parametrize('data, names, expected', [ + ([[1, 2, 3]], None, Index([1, 2, 3])), + ([[1, 2, 3]], ['name'], Index([1, 2, 3], name='name')), + ([['a', 'a'], ['c', 'd']], None, + MultiIndex([['a'], ['c', 'd']], [[0, 0], [0, 1]])), + ([['a', 'a'], ['c', 'd']], ['L1', 'L2'], + MultiIndex([['a'], ['c', 'd']], [[0, 0], [0, 1]], + names=['L1', 'L2'])), + ]) + def test_ensure_index_from_sequences(self, data, names, expected): + result = _ensure_index_from_sequences(data, names) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index c66775f4690cc5..798d2444689615 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -537,15 +537,12 @@ def test_astype(self): self.index.astype(np.dtype(int)) def test_constructor_single_level(self): - single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], - labels=[[0, 1, 2, 3]], names=['first']) - assert isinstance(single_level, Index) - assert not isinstance(single_level, MultiIndex) - assert single_level.name == 'first' - - single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], - labels=[[0, 1, 2, 3]]) - assert single_level.name is None + result = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], + labels=[[0, 1, 2, 3]], names=['first']) + assert isinstance(result, MultiIndex) + expected = Index(['foo', 'bar', 'baz', 'qux'], name='first') + tm.assert_index_equal(result.levels[0], expected) + assert result.names == ['first'] def test_constructor_no_levels(self): tm.assert_raises_regex(ValueError, "non-zero number " @@ -768,8 +765,9 @@ def test_from_arrays_empty(self): # 1 level result = MultiIndex.from_arrays(arrays=[[]], names=['A']) + assert isinstance(result, MultiIndex) expected = Index([], name='A') - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result.levels[0], expected) # N levels for N in [2, 3]: @@ -830,7 +828,7 @@ def test_from_product_empty(self): # 1 level result = MultiIndex.from_product([[]], names=['A']) expected = pd.Index([], name='A') - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result.levels[0], expected) # 2 levels l1 = [[], ['foo', 'bar', 'baz'], []] diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 5a17cb6d7dc475..7dac83953ad8f7 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1909,7 +1909,11 @@ def keyfunc(x): # convert tuples to index if nentries == 1: + # we have a single level of tuples, i.e. a regular Index index = Index(tuples[0], name=names[0]) + elif nlevels == 1: + name = None if names is None else names[0] + index = Index((x[0] for x in tuples), name=name) else: index = MultiIndex.from_tuples(tuples, names=names) return index From b98e688c7d483777a21fb46ec46e86b72b90e5a3 Mon Sep 17 00:00:00 2001 From: cbertinato Date: Thu, 31 Aug 2017 06:24:23 -0400 Subject: [PATCH 036/188] CLN: replace %s syntax with .format in io/formats/css.py, excel.py, printing.py, style.py, and terminal.py (#17387) Progress toward issue #16130. 
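The conversions follow the same pattern as the earlier format.py patch. One
detail worth noting for css.py: templates that are filled in later keep a
positional field (illustrative snippet only, not part of the diff):

    template = 'border-{:s}-color'  # template style used by _side_expander
    assert template.format('top') == 'border-top-color'
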
Converted old string formatting to new string formatting in io/formats/css.py, excel.py, printing.py, style.py, and terminal.py --- pandas/io/formats/css.py | 32 +++++++++++++------------ pandas/io/formats/excel.py | 18 ++++++++------ pandas/io/formats/printing.py | 31 ++++++++++++------------ pandas/io/formats/style.py | 45 ++++++++++++++++++++--------------- pandas/io/formats/terminal.py | 2 +- 5 files changed, 71 insertions(+), 57 deletions(-) diff --git a/pandas/io/formats/css.py b/pandas/io/formats/css.py index d12d2373e11908..429c98b579ca09 100644 --- a/pandas/io/formats/css.py +++ b/pandas/io/formats/css.py @@ -94,12 +94,13 @@ def __call__(self, declarations_str, inherited=None): # 3. TODO: resolve other font-relative units for side in self.SIDES: - prop = 'border-%s-width' % side + prop = 'border-{side}-width'.format(side=side) if prop in props: props[prop] = self.size_to_pt( props[prop], em_pt=font_size, conversions=self.BORDER_WIDTH_RATIOS) - for prop in ['margin-%s' % side, 'padding-%s' % side]: + for prop in ['margin-{side}'.format(side=side), + 'padding-{side}'.format(side=side)]: if prop in props: # TODO: support % props[prop] = self.size_to_pt( @@ -152,7 +153,8 @@ def __call__(self, declarations_str, inherited=None): def size_to_pt(self, in_val, em_pt=None, conversions=UNIT_RATIOS): def _error(): - warnings.warn('Unhandled size: %r' % in_val, CSSWarning) + warnings.warn('Unhandled size: {val!r}'.format(val=in_val), + CSSWarning) return self.size_to_pt('1!!default', conversions=conversions) try: @@ -185,10 +187,10 @@ def _error(): val = round(val, 5) if int(val) == val: - size_fmt = '%d' + size_fmt = '{fmt:d}pt'.format(fmt=int(val)) else: - size_fmt = '%f' - return (size_fmt + 'pt') % val + size_fmt = '{fmt:f}pt'.format(fmt=val) + return size_fmt def atomize(self, declarations): for prop, value in declarations: @@ -215,19 +217,19 @@ def expand(self, prop, value): try: mapping = self.SIDE_SHORTHANDS[len(tokens)] except KeyError: - warnings.warn('Could not expand "%s: %s"' % (prop, value), - CSSWarning) + warnings.warn('Could not expand "{prop}: {val}"' + .format(prop=prop, val=value), CSSWarning) return for key, idx in zip(self.SIDES, mapping): - yield prop_fmt % key, tokens[idx] + yield prop_fmt.format(key), tokens[idx] return expand - expand_border_color = _side_expander('border-%s-color') - expand_border_style = _side_expander('border-%s-style') - expand_border_width = _side_expander('border-%s-width') - expand_margin = _side_expander('margin-%s') - expand_padding = _side_expander('padding-%s') + expand_border_color = _side_expander('border-{:s}-color') + expand_border_style = _side_expander('border-{:s}-style') + expand_border_width = _side_expander('border-{:s}-width') + expand_margin = _side_expander('margin-{:s}') + expand_padding = _side_expander('padding-{:s}') def parse(self, declarations_str): """Generates (prop, value) pairs from declarations @@ -245,4 +247,4 @@ def parse(self, declarations_str): yield prop, val else: warnings.warn('Ill-formatted attribute: expected a colon ' - 'in %r' % decl, CSSWarning) + 'in {decl!r}'.format(decl=decl), CSSWarning) diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 80c3880d39dfdc..ab689d196f4b60 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -132,10 +132,12 @@ def build_alignment(self, props): def build_border(self, props): return {side: { - 'style': self._border_style(props.get('border-%s-style' % side), - props.get('border-%s-width' % side)), + 'style': 
self._border_style(props.get('border-{side}-style' + .format(side=side)), + props.get('border-{side}-width' + .format(side=side))), 'color': self.color_to_excel( - props.get('border-%s-color' % side)), + props.get('border-{side}-color'.format(side=side))), } for side in ['top', 'right', 'bottom', 'left']} def _border_style(self, style, width): @@ -302,7 +304,8 @@ def color_to_excel(self, val): try: return self.NAMED_COLORS[val] except KeyError: - warnings.warn('Unhandled colour format: %r' % val, CSSWarning) + warnings.warn('Unhandled colour format: {val!r}'.format(val=val), + CSSWarning) class ExcelFormatter(object): @@ -369,7 +372,7 @@ def _format_value(self, val): if lib.isposinf_scalar(val): val = self.inf_rep elif lib.isneginf_scalar(val): - val = '-%s' % self.inf_rep + val = '-{inf}'.format(inf=self.inf_rep) elif self.float_format is not None: val = float(self.float_format % val) return val @@ -434,8 +437,9 @@ def _format_header_regular(self): colnames = self.columns if has_aliases: if len(self.header) != len(self.columns): - raise ValueError('Writing %d cols but got %d aliases' % - (len(self.columns), len(self.header))) + raise ValueError('Writing {cols} cols but got {alias} ' + 'aliases'.format(cols=len(self.columns), + alias=len(self.header))) else: colnames = self.header diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index cbad603630bd34..e0f53f671017aa 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -102,9 +102,9 @@ def _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds): bounds length of printed sequence, depending on options """ if isinstance(seq, set): - fmt = u("{%s}") + fmt = u("{{{body}}}") else: - fmt = u("[%s]") if hasattr(seq, '__setitem__') else u("(%s)") + fmt = u("[{body}]") if hasattr(seq, '__setitem__') else u("({body})") if max_seq_items is False: nitems = len(seq) @@ -123,7 +123,7 @@ def _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds): elif isinstance(seq, tuple) and len(seq) == 1: body += ',' - return fmt % body + return fmt.format(body=body) def _pprint_dict(seq, _nest_lvl=0, max_seq_items=None, **kwds): @@ -131,10 +131,10 @@ def _pprint_dict(seq, _nest_lvl=0, max_seq_items=None, **kwds): internal. pprinter for iterables. you should probably use pprint_thing() rather then calling this directly. 
""" - fmt = u("{%s}") + fmt = u("{{{things}}}") pairs = [] - pfmt = u("%s: %s") + pfmt = u("{key}: {val}") if max_seq_items is False: nitems = len(seq) @@ -142,16 +142,17 @@ def _pprint_dict(seq, _nest_lvl=0, max_seq_items=None, **kwds): nitems = max_seq_items or get_option("max_seq_items") or len(seq) for k, v in list(seq.items())[:nitems]: - pairs.append(pfmt % - (pprint_thing(k, _nest_lvl + 1, - max_seq_items=max_seq_items, **kwds), - pprint_thing(v, _nest_lvl + 1, - max_seq_items=max_seq_items, **kwds))) + pairs.append( + pfmt.format( + key=pprint_thing(k, _nest_lvl + 1, + max_seq_items=max_seq_items, **kwds), + val=pprint_thing(v, _nest_lvl + 1, + max_seq_items=max_seq_items, **kwds))) if nitems < len(seq): - return fmt % (", ".join(pairs) + ", ...") + return fmt.format(things=", ".join(pairs) + ", ...") else: - return fmt % ", ".join(pairs) + return fmt.format(things=", ".join(pairs)) def pprint_thing(thing, _nest_lvl=0, escape_chars=None, default_escapes=False, @@ -221,10 +222,10 @@ def as_escaped_unicode(thing, escape_chars=escape_chars): max_seq_items=max_seq_items) elif isinstance(thing, compat.string_types) and quote_strings: if compat.PY3: - fmt = "'%s'" + fmt = u("'{thing}'") else: - fmt = "u'%s'" - result = fmt % as_escaped_unicode(thing) + fmt = u("u'{thing}'") + result = fmt.format(thing=as_escaped_unicode(thing)) else: result = as_escaped_unicode(thing) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 445fceb4b81467..87d672197be300 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -230,7 +230,7 @@ def format_attr(pair): # ... except maybe the last for columns.names name = self.data.columns.names[r] cs = [BLANK_CLASS if name is None else INDEX_NAME_CLASS, - "level%s" % r] + "level{lvl}".format(lvl=r)] name = BLANK_VALUE if name is None else name row_es.append({"type": "th", "value": name, @@ -240,7 +240,8 @@ def format_attr(pair): if clabels: for c, value in enumerate(clabels[r]): - cs = [COL_HEADING_CLASS, "level%s" % r, "col%s" % c] + cs = [COL_HEADING_CLASS, "level{lvl}".format(lvl=r), + "col{col}".format(col=c)] cs.extend(cell_context.get( "col_headings", {}).get(r, {}).get(c, [])) es = { @@ -264,7 +265,7 @@ def format_attr(pair): for c, name in enumerate(self.data.index.names): cs = [INDEX_NAME_CLASS, - "level%s" % c] + "level{lvl}".format(lvl=c)] name = '' if name is None else name index_header_row.append({"type": "th", "value": name, "class": " ".join(cs)}) @@ -281,7 +282,8 @@ def format_attr(pair): for r, idx in enumerate(self.data.index): row_es = [] for c, value in enumerate(rlabels[r]): - rid = [ROW_HEADING_CLASS, "level%s" % c, "row%s" % r] + rid = [ROW_HEADING_CLASS, "level{lvl}".format(lvl=c), + "row{row}".format(row=r)] es = { "type": "th", "is_visible": _is_visible(r, c, idx_lengths), @@ -298,7 +300,8 @@ def format_attr(pair): row_es.append(es) for c, col in enumerate(self.data.columns): - cs = [DATA_CLASS, "row%s" % r, "col%s" % c] + cs = [DATA_CLASS, "row{row}".format(row=r), + "col{col}".format(col=c)] cs.extend(cell_context.get("data", {}).get(r, {}).get(c, [])) formatter = self._display_funcs[(r, c)] value = self.data.iloc[r, c] @@ -317,7 +320,8 @@ def format_attr(pair): else: props.append(['', '']) cellstyle.append({'props': props, - 'selector': "row%s_col%s" % (r, c)}) + 'selector': "row{row}_col{col}" + .format(row=r, col=c)}) body.append(row_es) return dict(head=head, cellstyle=cellstyle, body=body, uuid=uuid, @@ -512,22 +516,23 @@ def _apply(self, func, axis=0, subset=None, **kwargs): result = 
func(data, **kwargs)
         if not isinstance(result, pd.DataFrame):
             raise TypeError(
-                "Function {!r} must return a DataFrame when "
-                "passed to `Styler.apply` with axis=None".format(func))
+                "Function {func!r} must return a DataFrame when "
+                "passed to `Styler.apply` with axis=None"
+                .format(func=func))
         if not (result.index.equals(data.index) and
                 result.columns.equals(data.columns)):
-            msg = ('Result of {!r} must have identical index and columns '
-                   'as the input'.format(func))
+            msg = ('Result of {func!r} must have identical index and '
+                   'columns as the input'.format(func=func))
             raise ValueError(msg)
 
         result_shape = result.shape
         expected_shape = self.data.loc[subset].shape
         if result_shape != expected_shape:
-            msg = ("Function {!r} returned the wrong shape.\n"
-                   "Result has shape: {}\n"
-                   "Expected shape: {}".format(func,
-                                               result.shape,
-                                               expected_shape))
+            msg = ("Function {func!r} returned the wrong shape.\n"
+                   "Result has shape: {res}\n"
+                   "Expected shape: {expect}".format(func=func,
+                                                     res=result.shape,
+                                                     expect=expected_shape))
             raise ValueError(msg)
         self._update_ctx(result)
         return self
@@ -771,7 +776,8 @@ def set_table_styles(self, table_styles):
 
     @staticmethod
     def _highlight_null(v, null_color):
-        return 'background-color: %s' % null_color if pd.isna(v) else ''
+        return ('background-color: {color}'.format(color=null_color)
+                if pd.isna(v) else '')
 
     def highlight_null(self, null_color='red'):
         """
@@ -839,7 +845,8 @@ def _background_gradient(s, cmap='PuBu', low=0, high=0):
             # https://github.com/matplotlib/matplotlib/issues/5427
             normed = norm(s.values)
             c = [colors.rgb2hex(x) for x in plt.cm.get_cmap(cmap)(normed)]
-            return ['background-color: %s' % color for color in c]
+            return ['background-color: {color}'.format(color=color)
+                    for color in c]
 
     def set_properties(self, subset=None, **kwargs):
         """
@@ -1182,6 +1189,6 @@ def _maybe_wrap_formatter(formatter):
     elif callable(formatter):
         return formatter
     else:
-        msg = "Expected a template string or callable, got {} instead".format(
-            formatter)
+        msg = ("Expected a template string or callable, got {formatter} "
+               "instead".format(formatter=formatter))
         raise TypeError(msg)
diff --git a/pandas/io/formats/terminal.py b/pandas/io/formats/terminal.py
index 30bd1d16b538a4..4bcb28fa59b864 100644
--- a/pandas/io/formats/terminal.py
+++ b/pandas/io/formats/terminal.py
@@ -124,4 +124,4 @@ def ioctl_GWINSZ(fd):
 
 if __name__ == "__main__":
     sizex, sizey = get_terminal_size()
-    print('width = %s height = %s' % (sizex, sizey))
+    print('width = {w} height = {h}'.format(w=sizex, h=sizey))

From 764cf2abca9ae3d0c730c98b5103fcde5b4fd88b Mon Sep 17 00:00:00 2001
From: Sylvia
Date: Thu, 31 Aug 2017 06:35:51 -0400
Subject: [PATCH 037/188] BUG: not correctly using OrderedDict in test_series_apply (#17384)

in Python versions <3.6 this syntax will result in an unordered dict
---
 pandas/tests/series/test_apply.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py
index e3be5427588b38..d0693984689a65 100644
--- a/pandas/tests/series/test_apply.py
+++ b/pandas/tests/series/test_apply.py
@@ -317,9 +317,9 @@ def test_non_callable_aggregates(self):
 
         # test when mixed w/ callable reducers
         result = s.agg(['size', 'count', 'mean'])
-        expected = Series(OrderedDict({'size': 3.0,
-                                       'count': 2.0,
-                                       'mean': 1.5}))
+        expected = Series(OrderedDict([('size', 3.0),
+                                       ('count', 2.0),
+                                       ('mean', 1.5)]))
         assert_series_equal(result[expected.index], expected)


From 
062f6f118fe4ea439ae255a8ff886a532e20ecdb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 31 Aug 2017 03:37:58 -0700 Subject: [PATCH 038/188] Remove boxplot from _dataframe_apply_whitelist (#17381) --- pandas/core/groupby.py | 11 ++++++----- pandas/tests/groupby/test_whitelist.py | 1 - 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index c23b00dc740a43..248f3b2095a785 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -63,6 +63,8 @@ import pandas.core.common as com from pandas.core.config import option_context +from pandas.plotting._core import boxplot_frame_groupby + from pandas._libs import lib, groupby as libgroupby, Timestamp, NaT, iNaT from pandas._libs.lib import count_level_2d @@ -168,8 +170,9 @@ {'nlargest', 'nsmallest'}) - {'boxplot'}) | frozenset(['dtype', 'unique']) -_dataframe_apply_whitelist = (_common_apply_whitelist | - frozenset(['dtypes', 'corrwith'])) +_dataframe_apply_whitelist = ((_common_apply_whitelist | + frozenset(['dtypes', 'corrwith'])) - + {'boxplot'}) _cython_transforms = frozenset(['cumprod', 'cumsum', 'shift', 'cummin', 'cummax']) @@ -4280,9 +4283,7 @@ def groupby_series(obj, col=None): results.index = _default_index(len(results)) return results - -from pandas.plotting._core import boxplot_frame_groupby # noqa -DataFrameGroupBy.boxplot = boxplot_frame_groupby + boxplot = boxplot_frame_groupby class PanelGroupBy(NDFrameGroupBy): diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 2c8bf57f20faea..1c5161d2ffb431 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -42,7 +42,6 @@ 'pct_change', 'skew', 'plot', - 'boxplot', 'hist', 'median', 'dtypes', From dad39d593eacd1ee2b2465dc2ac025b0cfaffe2a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 1 Sep 2017 05:19:30 -0700 Subject: [PATCH 039/188] API: Localize Series when calling to_datetime with utc=True (#6415) (#17109) --- doc/source/whatsnew/v0.21.0.txt | 30 +++++++++++ pandas/core/tools/datetimes.py | 2 +- pandas/io/sql.py | 13 ++--- pandas/tests/indexes/datetimes/test_tools.py | 56 ++++++++++++++++---- pandas/tests/io/test_sql.py | 23 +++++--- pandas/tests/test_multilevel.py | 2 +- 6 files changed, 101 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 273cbd8357f853..e0963a1908bbc6 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -298,6 +298,36 @@ length 2+ levels, so a :class:`MultiIndex` is always returned from all of the pd.MultiIndex.from_tuples([('a',), ('b',)]) +.. _whatsnew_0210.api.utc_localization_with_series: + +UTC Localization with Series +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, :func:`to_datetime` did not localize datetime ``Series`` data when ``utc=True`` was passed. Now, :func:`to_datetime` will correctly localize ``Series`` with a ``datetime64[ns, UTC]`` dtype to be consistent with how list-like and ``Index`` data are handled. (:issue:`6415`). + + Previous Behavior + + .. ipython:: python + + s = Series(['20130101 00:00:00'] * 3) + + .. code-block:: ipython + + In [12]: pd.to_datetime(s, utc=True) + Out[12]: + 0 2013-01-01 + 1 2013-01-01 + 2 2013-01-01 + dtype: datetime64[ns] + + New Behavior + + .. 
ipython:: python + + pd.to_datetime(s, utc=True) + +Additionally, DataFrames with datetime columns that were parsed by :func:`read_sql_table` and :func:`read_sql_query` will also be localized to UTC only if the original SQL columns were timezone aware datetime columns. + .. _whatsnew_0210.api: Other API Changes diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index c0f234a36803d7..9ff0275a7c3708 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -516,7 +516,7 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): result = arg elif isinstance(arg, ABCSeries): from pandas import Series - values = _convert_listlike(arg._values, False, format) + values = _convert_listlike(arg._values, True, format) result = Series(values, index=arg.index, name=arg.name) elif isinstance(arg, (ABCDataFrame, MutableMapping)): result = _assemble_from_unit_mappings(arg, errors=errors) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 9aa47e5c698503..9c6d01d236c576 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -99,24 +99,24 @@ def _convert_params(sql, params): return args -def _handle_date_column(col, format=None): +def _handle_date_column(col, utc=None, format=None): if isinstance(format, dict): return to_datetime(col, errors='ignore', **format) else: if format in ['D', 's', 'ms', 'us', 'ns']: - return to_datetime(col, errors='coerce', unit=format, utc=True) + return to_datetime(col, errors='coerce', unit=format, utc=utc) elif (issubclass(col.dtype.type, np.floating) or issubclass(col.dtype.type, np.integer)): # parse dates as timestamp format = 's' if format is None else format - return to_datetime(col, errors='coerce', unit=format, utc=True) + return to_datetime(col, errors='coerce', unit=format, utc=utc) elif is_datetime64tz_dtype(col): # coerce to UTC timezone # GH11216 return (to_datetime(col, errors='coerce') .astype('datetime64[ns, UTC]')) else: - return to_datetime(col, errors='coerce', format=format, utc=True) + return to_datetime(col, errors='coerce', format=format, utc=utc) def _parse_date_columns(data_frame, parse_dates): @@ -821,8 +821,9 @@ def _harmonize_columns(self, parse_dates=None): if (col_type is datetime or col_type is date or col_type is DatetimeTZDtype): - self.frame[col_name] = _handle_date_column(df_col) - + # Convert tz-aware Datetime SQL columns to UTC + utc = col_type is DatetimeTZDtype + self.frame[col_name] = _handle_date_column(df_col, utc=utc) elif col_type is float: # floats support NA, can always convert! 
self.frame[col_name] = df_col.astype(col_type, copy=False) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 50669ee357bbdc..089d74a1d69b8c 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -260,15 +260,53 @@ def test_to_datetime_tz_pytz(self): dtype='datetime64[ns, UTC]', freq=None) tm.assert_index_equal(result, expected) - def test_to_datetime_utc_is_true(self): - # See gh-11934 - start = pd.Timestamp('2014-01-01', tz='utc') - end = pd.Timestamp('2014-01-03', tz='utc') - date_range = pd.bdate_range(start, end) - - result = pd.to_datetime(date_range, utc=True) - expected = pd.DatetimeIndex(data=date_range) - tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("init_constructor, end_constructor, test_method", + [(Index, DatetimeIndex, tm.assert_index_equal), + (list, DatetimeIndex, tm.assert_index_equal), + (np.array, DatetimeIndex, tm.assert_index_equal), + (Series, Series, tm.assert_series_equal)]) + def test_to_datetime_utc_true(self, + init_constructor, + end_constructor, + test_method): + # See gh-11934 & gh-6415 + data = ['20100102 121314', '20100102 121315'] + expected_data = [pd.Timestamp('2010-01-02 12:13:14', tz='utc'), + pd.Timestamp('2010-01-02 12:13:15', tz='utc')] + + result = pd.to_datetime(init_constructor(data), + format='%Y%m%d %H%M%S', + utc=True) + expected = end_constructor(expected_data) + test_method(result, expected) + + # Test scalar case as well + for scalar, expected in zip(data, expected_data): + result = pd.to_datetime(scalar, format='%Y%m%d %H%M%S', utc=True) + assert result == expected + + def test_to_datetime_utc_true_with_series_single_value(self): + # GH 15760 UTC=True with Series + ts = 1.5e18 + result = pd.to_datetime(pd.Series([ts]), utc=True) + expected = pd.Series([pd.Timestamp(ts, tz='utc')]) + tm.assert_series_equal(result, expected) + + def test_to_datetime_utc_true_with_series_tzaware_string(self): + ts = '2013-01-01 00:00:00-01:00' + expected_ts = '2013-01-01 01:00:00' + data = pd.Series([ts] * 3) + result = pd.to_datetime(data, utc=True) + expected = pd.Series([pd.Timestamp(expected_ts, tz='utc')] * 3) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('date, dtype', + [('2013-01-01 01:00:00', 'datetime64[ns]'), + ('2013-01-01 01:00:00', 'datetime64[ns, UTC]')]) + def test_to_datetime_utc_true_with_series_datetime_ns(self, date, dtype): + expected = pd.Series([pd.Timestamp('2013-01-01 01:00:00', tz='UTC')]) + result = pd.to_datetime(pd.Series([date], dtype=dtype), utc=True) + tm.assert_series_equal(result, expected) def test_to_datetime_tz_psycopg2(self): diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index a7c42391effe66..93eb0ff0ac1f26 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -602,7 +602,7 @@ def test_execute_sql(self): tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, 'Iris-setosa']) def test_date_parsing(self): - # Test date parsing in read_sq + # Test date parsing in read_sql # No Parsing df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn) assert not issubclass(df.DateCol.dtype.type, np.datetime64) @@ -1271,11 +1271,13 @@ def check(col): # "2000-01-01 00:00:00-08:00" should convert to # "2000-01-01 08:00:00" - assert col[0] == Timestamp('2000-01-01 08:00:00', tz='UTC') - # "2000-06-01 00:00:00-07:00" should convert to # "2000-06-01 07:00:00" - assert col[1] == Timestamp('2000-06-01 07:00:00', tz='UTC') + # GH 6415 + 
expected_data = [Timestamp('2000-01-01 08:00:00', tz='UTC'), + Timestamp('2000-06-01 07:00:00', tz='UTC')] + expected = Series(expected_data, name=col.name) + tm.assert_series_equal(col, expected) else: raise AssertionError("DateCol loaded with incorrect type " @@ -1298,6 +1300,9 @@ def check(col): self.conn, parse_dates=['DateColWithTz']) if not hasattr(df, 'DateColWithTz'): pytest.skip("no column with datetime with time zone") + col = df.DateColWithTz + assert is_datetime64tz_dtype(col.dtype) + assert str(col.dt.tz) == 'UTC' check(df.DateColWithTz) df = pd.concat(list(pd.read_sql_query("select * from types_test_data", @@ -1307,9 +1312,9 @@ def check(col): assert is_datetime64tz_dtype(col.dtype) assert str(col.dt.tz) == 'UTC' expected = sql.read_sql_table("types_test_data", self.conn) - tm.assert_series_equal(df.DateColWithTz, - expected.DateColWithTz - .astype('datetime64[ns, UTC]')) + col = expected.DateColWithTz + assert is_datetime64tz_dtype(col.dtype) + tm.assert_series_equal(df.DateColWithTz, expected.DateColWithTz) # xref #7139 # this might or might not be converted depending on the postgres driver @@ -1388,8 +1393,10 @@ def test_datetime_date(self): df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) df.to_sql('test_date', self.conn, index=False) res = read_sql_table('test_date', self.conn) + result = res['a'] + expected = to_datetime(df['a']) # comes back as datetime64 - tm.assert_series_equal(res['a'], to_datetime(df['a'])) + tm.assert_series_equal(result, expected) def test_datetime_time(self): # test support for datetime.time diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index a765e2c4ca1bf7..6976fe162c5d5c 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2137,7 +2137,7 @@ def test_set_index_datetime(self): '2011-07-19 08:00:00', '2011-07-19 09:00:00'], 'value': range(6)}) df.index = pd.to_datetime(df.pop('datetime'), utc=True) - df.index = df.index.tz_localize('UTC').tz_convert('US/Pacific') + df.index = df.index.tz_convert('US/Pacific') expected = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00', From 9e425d637b0c635f1ec73407e6b45d1c53cd7fca Mon Sep 17 00:00:00 2001 From: jschendel Date: Fri, 1 Sep 2017 08:52:44 -0600 Subject: [PATCH 040/188] TST: Enable tests in test_tools.py (#17405) Enabled tests that currently aren't running. Small fix to make sure all tests pass. Verified that the raised messages match expectations for TestToDatetimeUnit::test_frame. 
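The small fix is the one-line change in pandas/core/tools/datetimes.py below:
the keyword argument had been attached to str.join instead of str.format, so
the old spelling raised a TypeError before the intended ValueError message
could ever be built. A minimal standalone sketch of the difference (the
`excess` value here is illustrative, not taken from the test suite):

    excess = ['foo', 'bar']

    # pre-fix spelling: str.join() accepts no keyword arguments,
    # so this raises TypeError instead of formatting the message
    try:
        "[{excess}]".format(','.join(excess=excess))
    except TypeError as exc:
        print(exc)

    # post-fix spelling: format() gets the keyword, join() gets the
    # iterable, and the message renders as intended
    print("[{excess}]".format(excess=','.join(excess)))  # -> [foo,bar]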
--- pandas/core/tools/datetimes.py | 2 +- pandas/tests/indexes/datetimes/test_tools.py | 24 +++++++++++++------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 9ff0275a7c3708..9dde26f43ad337 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -605,7 +605,7 @@ def f(value): if len(excess): raise ValueError("extra keys have been passed " "to the datetime assemblage: " - "[{excess}]".format(','.join(excess=excess))) + "[{excess}]".format(excess=','.join(excess))) def coerce(values): # we allow coercion to if errors allows diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 089d74a1d69b8c..5152c1019d8de0 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -25,7 +25,7 @@ compat) -class TimeConversionFormats(object): +class TestTimeConversionFormats(object): def test_to_datetime_format(self): values = ['1/1/2000', '1/2/2000', '1/3/2000'] @@ -372,7 +372,7 @@ def test_datetime_invalid_datatype(self): pd.to_datetime(pd.to_datetime) -class ToDatetimeUnit(object): +class TestToDatetimeUnit(object): def test_unit(self): # GH 11758 @@ -566,7 +566,10 @@ def test_dataframe(self): df2 = DataFrame({'year': [2015, 2016], 'month': [2, 20], 'day': [4, 5]}) - with pytest.raises(ValueError): + + msg = ("cannot assemble the datetimes: time data .+ does not " + "match format '%Y%m%d' \(match\)") + with tm.assert_raises_regex(ValueError, msg): to_datetime(df2) result = to_datetime(df2, errors='coerce') expected = Series([Timestamp('20150204 00:00:00'), @@ -574,26 +577,31 @@ def test_dataframe(self): assert_series_equal(result, expected) # extra columns - with pytest.raises(ValueError): + msg = ("extra keys have been passed to the datetime assemblage: " + "\[foo\]") + with tm.assert_raises_regex(ValueError, msg): df2 = df.copy() df2['foo'] = 1 to_datetime(df2) # not enough + msg = ('to assemble mappings requires at least that \[year, month, ' + 'day\] be specified: \[.+\] is missing') for c in [['year'], ['year', 'month'], ['year', 'month', 'second'], ['month', 'day'], ['year', 'day', 'second']]: - with pytest.raises(ValueError): + with tm.assert_raises_regex(ValueError, msg): to_datetime(df[c]) # duplicates + msg = 'cannot assemble with duplicate keys' df2 = DataFrame({'year': [2015, 2016], 'month': [2, 20], 'day': [4, 5]}) df2.columns = ['year', 'year', 'day'] - with pytest.raises(ValueError): + with tm.assert_raises_regex(ValueError, msg): to_datetime(df2) df2 = DataFrame({'year': [2015, 2016], @@ -601,7 +609,7 @@ def test_dataframe(self): 'day': [4, 5], 'hour': [4, 5]}) df2.columns = ['year', 'month', 'day', 'day'] - with pytest.raises(ValueError): + with tm.assert_raises_regex(ValueError, msg): to_datetime(df2) def test_dataframe_dtypes(self): @@ -632,7 +640,7 @@ def test_dataframe_dtypes(self): to_datetime(df) -class ToDatetimeMisc(object): +class TestToDatetimeMisc(object): def test_index_to_datetime(self): idx = Index(['1/1/2000', '1/2/2000', '1/3/2000']) From f7fe4295f84937bc0fa82c9718e62ec19fc36e6a Mon Sep 17 00:00:00 2001 From: topper-123 Date: Fri, 1 Sep 2017 17:36:00 +0100 Subject: [PATCH 041/188] TST: remove tests and docs for legacy (pre 0.12) hdf5 support (#17404) --- doc/source/io.rst | 38 ------------------ doc/source/whatsnew/v0.21.0.txt | 2 + .../tests/io/data/legacy_hdf/legacy_0.10.h5 | Bin 238321 -> 0 bytes .../io/data/legacy_hdf/legacy_table_0.11.h5 | Bin 
293877 -> 0 bytes pandas/tests/io/test_pytables.py | 34 +--------------- 5 files changed, 3 insertions(+), 71 deletions(-) delete mode 100644 pandas/tests/io/data/legacy_hdf/legacy_0.10.h5 delete mode 100644 pandas/tests/io/data/legacy_hdf/legacy_table_0.11.h5 diff --git a/doc/source/io.rst b/doc/source/io.rst index e3384073617054..f55c72bae5a20e 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4419,44 +4419,6 @@ Now you can import the ``DataFrame`` into R: starting point if you have stored multiple ``DataFrame`` objects to a single HDF5 file. -Backwards Compatibility -''''''''''''''''''''''' - -0.10.1 of ``HDFStore`` can read tables created in a prior version of pandas, -however query terms using the -prior (undocumented) methodology are unsupported. ``HDFStore`` will -issue a warning if you try to use a legacy-format file. You must -read in the entire file and write it out using the new format, using the -method ``copy`` to take advantage of the updates. The group attribute -``pandas_version`` contains the version information. ``copy`` takes a -number of options, please see the docstring. - - -.. ipython:: python - :suppress: - - import os - legacy_file_path = os.path.abspath('source/_static/legacy_0.10.h5') - -.. ipython:: python - :okwarning: - - # a legacy store - legacy_store = pd.HDFStore(legacy_file_path,'r') - legacy_store - - # copy (and return the new handle) - new_store = legacy_store.copy('store_new.h5') - new_store - new_store.close() - -.. ipython:: python - :suppress: - - legacy_store.close() - import os - os.remove('store_new.h5') - Performance ''''''''''' diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index e0963a1908bbc6..81e52266f972e5 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -371,6 +371,8 @@ Removal of prior version deprecations/changes - ``Categorical`` has dropped the ``.order()`` and ``.sort()`` methods in favor of ``.sort_values()`` (:issue:`12882`) - :func:`eval` and :func:`DataFrame.eval` have changed the default of ``inplace`` from ``None`` to ``False`` (:issue:`11149`) - The function ``get_offset_name`` has been dropped in favor of the ``.freqstr`` attribute for an offset (:issue:`11834`) +- pandas no longer tests for compatibility with hdf5-files created with pandas < 0.11 (:issue:`17404`). + .. 
_whatsnew_0210.performance:
diff --git a/pandas/tests/io/data/legacy_hdf/legacy_0.10.h5 b/pandas/tests/io/data/legacy_hdf/legacy_0.10.h5
deleted file mode 100644
index b1439ef16361abbc0756fbf7d344fd65d8a1a473..0000000000000000000000000000000000000000
Binary files a/pandas/tests/io/data/legacy_hdf/legacy_0.10.h5 and /dev/null differ
diff --git a/pandas/tests/io/data/legacy_hdf/legacy_table_0.11.h5 b/pandas/tests/io/data/legacy_hdf/legacy_table_0.11.h5
deleted file mode 100644
Binary files a/pandas/tests/io/data/legacy_hdf/legacy_table_0.11.h5 and /dev/null differ
diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index f33ba7627101e9..b5ecc4d34cd08b 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -4599,41 +4599,13 @@ def test_legacy_table_read(self): expected = df2[df2.index > df2.index[2]] assert_frame_equal(expected, result) - def test_legacy_0_10_read(self): - # legacy from 0.10 - with catch_warnings(record=True): - path = tm.get_data_path('legacy_hdf/legacy_0.10.h5') - with ensure_clean_store(path, mode='r') as store: - str(store) - for k in store.keys(): - store.select(k) - - def test_legacy_0_11_read(self): - # legacy from 0.11 - path = os.path.join('legacy_hdf', 'legacy_table_0.11.h5') - with ensure_clean_store(tm.get_data_path(path), mode='r') as store: - str(store) - assert 'df' in store - assert 'df1' in store - assert 'mi' in store - df = store.select('df') - df1 = store.select('df1') - mi = store.select('mi') - assert isinstance(df, DataFrame) - assert isinstance(df1, DataFrame) - assert isinstance(mi, DataFrame) - def test_copy(self): with catch_warnings(record=True): - def do_copy(f=None, new_f=None, keys=None, + def do_copy(f, new_f=None, keys=None, propindexes=True, **kwargs): try: - if f is None: - f = tm.get_data_path(os.path.join('legacy_hdf', - 'legacy_0.10.h5')) - store = HDFStore(f, 'r') if new_f is None: @@ -4671,10 +4643,6 @@ def do_copy(f=None, new_f=None, keys=None, pass safe_remove(new_f) - do_copy() - do_copy(keys=['/a', '/b', '/df1_mixed']) - do_copy(propindexes=False) - # new table df = tm.makeDataFrame() From 8351f86a0079b6b0cb95414807a2c2248530ef2c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 1 Sep 2017 10:11:40 -0700 Subject: [PATCH 042/188] Tslib unused (#17402) --- pandas/_libs/tslib.pyx | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 5dd30072fb7aa0..50e0b77c6d3a0d 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -2622,8 +2622,6 @@ cdef class _Timedelta(timedelta): int ndim if isinstance(other, _Timedelta): - if isinstance(other, _NaT): - return _cmp_nat_dt(other, self, _reverse_ops[op]) ots = other elif isinstance(other, timedelta): ots = Timedelta(other) @@ -3882,7 +3880,7 @@ fields = ['year', 'quarter', 'month', 'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond', 'nanosecond', 'week', 'dayofyear', 'weekofyear', 'days_in_month', 'daysinmonth', 'dayofweek', 'weekday_name', 'days', 'seconds', 'microseconds', - 'nanoseconds', 'qyear', 'quarter'] + 'nanoseconds', 'qyear'] for field in fields: prop = property(fget=lambda self: np.nan) setattr(NaTType, field, prop) @@ -4620,7 +4618,6 @@ def build_field_sarray(ndarray[int64_t] dtindex): """ cdef: Py_ssize_t i, count = 0 - int isleap pandas_datetimestruct dts ndarray[int32_t] years, months, days, hours, minutes, seconds, mus @@ -5270,7 +5267,6 @@ cpdef _isleapyear_arr(ndarray years): def monthrange(int64_t year, int64_t month): cdef: int64_t days - int64_t day_of_week if month < 1 or month > 12: raise ValueError("bad month number 0; must be 1-12") From 1981b679b0619de0765c2009684ce4abd886189d Mon Sep 17 00:00:00 2001 From: topper-123 Date: Sat, 2 Sep 2017 12:50:55 +0100 Subject: [PATCH 043/188] DOC: Cleaned references to pandas ` .. 
ipython:: python diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index 3c6572229802d8..4652ccbf0ad34e 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -73,7 +73,7 @@ index is passed, one will be created having values ``[0, ..., len(data) - 1]``. .. note:: - Starting in v0.8.0, pandas supports non-unique index values. If an operation + pandas supports non-unique index values. If an operation that does not support duplicate index values is attempted, an exception will be raised at that time. The reason for being lazy is nearly all performance-based (there are many instances in computations, like parts of GroupBy, where the index @@ -698,7 +698,7 @@ DataFrame in tabular form, though it won't always fit the console width: print(baseball.iloc[-20:, :12].to_string()) -New since 0.10.0, wide DataFrames will now be printed across multiple rows by +Wide DataFrames will be printed across multiple rows by default: .. ipython:: python @@ -845,19 +845,16 @@ DataFrame objects with mixed-type columns, all of the data will get upcasted to .. note:: - Unfortunately Panel, being less commonly used than Series and DataFrame, + Panel, being less commonly used than Series and DataFrame, has been slightly neglected feature-wise. A number of methods and options - available in DataFrame are not available in Panel. This will get worked - on, of course, in future releases. And faster if you join me in working on - the codebase. + available in DataFrame are not available in Panel. .. _dsintro.to_panel: From DataFrame using ``to_panel`` method ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -This method was introduced in v0.7 to replace ``LongPanel.to_long``, and converts -a DataFrame with a two-level index to a Panel. +``to_panel`` converts a DataFrame with a two-level index to a Panel. .. ipython:: python :okwarning: diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 937d682d238b37..53c0b771555f83 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -140,7 +140,7 @@ columns: In [5]: grouped = df.groupby(get_letter_type, axis=1) -Starting with 0.8, pandas Index objects now support duplicate values. If a +pandas Index objects support duplicate values. If a non-unique index is used as the group key in a groupby operation, all values for the same index value will be considered to be in one group and thus the output of aggregation functions will only contain unique index values: @@ -288,8 +288,6 @@ chosen level: s.sum(level='second') -.. versionadded:: 0.6 - Grouping with multiple levels is supported. .. ipython:: python diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 53a259ad6eb158..4687e464905627 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -66,8 +66,6 @@ See the :ref:`cookbook` for some advanced strategies Different Choices for Indexing ------------------------------ -.. versionadded:: 0.11.0 - Object selection has had a number of user-requested additions in order to support more explicit location based indexing. Pandas now supports three types of multi-axis indexing. diff --git a/doc/source/io.rst b/doc/source/io.rst index f55c72bae5a20e..f68358764a40e0 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -364,7 +364,7 @@ warn_bad_lines : boolean, default ``True`` Specifying column data types '''''''''''''''''''''''''''' -Starting with v0.10, you can indicate the data type for the whole DataFrame or +You can indicate the data type for the whole DataFrame or individual columns: .. 
ipython:: python @@ -3346,7 +3346,7 @@ Read/Write API '''''''''''''' ``HDFStore`` supports an top-level API using ``read_hdf`` for reading and ``to_hdf`` for writing, -similar to how ``read_csv`` and ``to_csv`` work. (new in 0.11.0) +similar to how ``read_csv`` and ``to_csv`` work. .. ipython:: python @@ -3791,7 +3791,7 @@ indexed dimension as the ``where``. .. note:: - Indexes are automagically created (starting ``0.10.1``) on the indexables + Indexes are automagically created on the indexables and any data columns you specify. This behavior can be turned off by passing ``index=False`` to ``append``. @@ -3878,7 +3878,7 @@ create a new table!) Iterator ++++++++ -Starting in ``0.11.0``, you can pass, ``iterator=True`` or ``chunksize=number_in_a_chunk`` +You can pass ``iterator=True`` or ``chunksize=number_in_a_chunk`` to ``select`` and ``select_as_multiple`` to return an iterator on the results. The default is 50,000 rows returned in a chunk. @@ -3986,8 +3986,8 @@ of rows in an object. Multiple Table Queries ++++++++++++++++++++++ -New in 0.10.1 are the methods ``append_to_multiple`` and -``select_as_multiple``, that can perform appending/selecting from +The methods ``append_to_multiple`` and +``select_as_multiple`` can perform appending/selecting from multiple tables at once. The idea is to have one table (call it the selector table) that you index most/all of the columns, and perform your queries. The other table(s) are data tables with an index matching the @@ -4291,7 +4291,7 @@ Pass ``min_itemsize`` on the first table creation to a-priori specify the minimu ``min_itemsize`` can be an integer, or a dict mapping a column name to an integer. You can pass ``values`` as a key to allow all *indexables* or *data_columns* to have this min_itemsize. -Starting in 0.11.0, passing a ``min_itemsize`` dict will cause all passed columns to be created as *data_columns* automatically. +Passing a ``min_itemsize`` dict will cause all passed columns to be created as *data_columns* automatically. .. note:: diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index d54288baa389b9..64a321d67a825f 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -67,9 +67,8 @@ arise and we wish to also consider that "missing" or "not available" or "NA". .. note:: - Prior to version v0.10.0 ``inf`` and ``-inf`` were also - considered to be "NA" in computations. This is no longer the case by - default; use the ``mode.use_inf_as_na`` option to recover it. + If you want to consider ``inf`` and ``-inf`` to be "NA" in computations, + you can set ``pandas.options.mode.use_inf_as_na = True``. .. _missing.isna: @@ -485,8 +484,8 @@ respectively: Replacing Generic Values ~~~~~~~~~~~~~~~~~~~~~~~~ -Often times we want to replace arbitrary values with other values. New in v0.8 -is the ``replace`` method in Series/DataFrame that provides an efficient yet +Often times we want to replace arbitrary values with other values. The +``replace`` method in Series/DataFrame provides an efficient yet flexible way to perform such replacements. For a Series, you can replace a single value or a list of values by another diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index ce4a920ad77b5f..aded5e4402df2b 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -1069,8 +1069,7 @@ Offset Aliases ~~~~~~~~~~~~~~ A number of string aliases are given to useful common time series -frequencies. 
We will refer to these aliases as *offset aliases* -(referred to as *time rules* prior to v0.8.0). +frequencies. We will refer to these aliases as *offset aliases*. .. csv-table:: :header: "Alias", "Description" diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index fb799c642131d7..c637246537ca1c 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -306,8 +306,6 @@ subplots: df.diff().hist(color='k', alpha=0.5, bins=50) -.. versionadded:: 0.10.0 - The ``by`` keyword can be specified to plot grouped histograms: .. ipython:: python @@ -831,8 +829,6 @@ and take a :class:`Series` or :class:`DataFrame` as an argument. Scatter Matrix Plot ~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 0.7.3 - You can create a scatter plot matrix using the ``scatter_matrix`` method in ``pandas.plotting``: @@ -859,8 +855,6 @@ You can create a scatter plot matrix using the Density Plot ~~~~~~~~~~~~ -.. versionadded:: 0.8.0 - You can create density plots using the :meth:`Series.plot.kde` and :meth:`DataFrame.plot.kde` methods. .. ipython:: python From c2d048137c7288644e8276fed3c5a7071a80221e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 4 Sep 2017 16:32:34 -0700 Subject: [PATCH 044/188] Remove unused _day and _month attrs (#17431) closes #17429 --- pandas/_libs/tslib.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 50e0b77c6d3a0d..8fbc606ccdfe24 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -829,8 +829,6 @@ class NaTType(_NaT): cdef _NaT base base = _NaT.__new__(cls, 1, 1, 1) - base._day = -1 - base._month = -1 base.value = NPY_NAT return base From 5bca6ce860f66ca6f92327086a954b9e0326a85f Mon Sep 17 00:00:00 2001 From: topper-123 Date: Tue, 5 Sep 2017 11:30:31 +0100 Subject: [PATCH 045/188] DOC: Clean-up references to v12 to v14 (both included) (#17420) --- doc/source/advanced.rst | 21 ++--------- doc/source/basics.rst | 10 +----- doc/source/comparison_with_r.rst | 4 --- doc/source/cookbook.rst | 2 +- doc/source/enhancingperf.rst | 36 ++++++------------- doc/source/groupby.rst | 19 ---------- doc/source/indexing.rst | 23 ++---------- doc/source/install.rst | 2 +- doc/source/io.rst | 61 +++++++++----------------------- doc/source/merging.rst | 2 -- doc/source/missing_data.rst | 9 ----- doc/source/options.rst | 2 +- doc/source/text.rst | 2 -- doc/source/timedeltas.rst | 2 -- doc/source/timeseries.rst | 10 +++--- doc/source/visualization.rst | 16 --------- 16 files changed, 43 insertions(+), 178 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 711c3e9a95d05d..4af476cd5a7e12 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -270,9 +270,6 @@ Passing a list of labels or tuples works similar to reindexing: Using slicers ~~~~~~~~~~~~~ -.. versionadded:: 0.14.0 - -In 0.14.0 we added a new way to slice multi-indexed objects. You can slice a multi-index by providing multiple indexers. You can provide any of the selectors as if you are indexing by label, see :ref:`Selection by Label `, @@ -384,7 +381,7 @@ selecting data at a particular level of a MultiIndex easier. .. ipython:: python - # using the slicers (new in 0.14.0) + # using the slicers df.loc[(slice(None),'one'),:] You can also select on the columns with :meth:`~pandas.MultiIndex.xs`, by @@ -397,7 +394,7 @@ providing the axis argument .. 
ipython:: python - # using the slicers (new in 0.14.0) + # using the slicers df.loc[:,(slice(None),'one')] :meth:`~pandas.MultiIndex.xs` also allows selection with multiple keys @@ -408,11 +405,9 @@ providing the axis argument .. ipython:: python - # using the slicers (new in 0.14.0) + # using the slicers df.loc[:,('bar','one')] -.. versionadded:: 0.13.0 - You can pass ``drop_level=False`` to :meth:`~pandas.MultiIndex.xs` to retain the level that was selected @@ -743,16 +738,6 @@ Prior to 0.18.0, the ``Int64Index`` would provide the default index for all ``ND Float64Index ~~~~~~~~~~~~ -.. note:: - - As of 0.14.0, ``Float64Index`` is backed by a native ``float64`` dtype - array. Prior to 0.14.0, ``Float64Index`` was backed by an ``object`` dtype - array. Using a ``float64`` dtype in the backend speeds up arithmetic - operations by about 30x and boolean indexing operations on the - ``Float64Index`` itself are about 2x as fast. - -.. versionadded:: 0.13.0 - By default a ``Float64Index`` will be automatically created when passing floating, or mixed-integer-floating values in index creation. This enables a pure label-based slicing paradigm that makes ``[],ix,loc`` for scalar indexing and slicing work exactly the same. diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 35eb14eda238fd..5880703b1d2711 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -347,7 +347,7 @@ That is because NaNs do not compare as equals: np.nan == np.nan -So, as of v0.13.1, NDFrames (such as Series, DataFrames, and Panels) +So, NDFrames (such as Series, DataFrames, and Panels) have an :meth:`~DataFrame.equals` method for testing equality, with NaNs in corresponding locations treated as equal. @@ -1104,10 +1104,6 @@ Applying with a ``Panel`` will pass a ``Series`` to the applied function. If the function returns a ``Series``, the result of the application will be a ``Panel``. If the applied function reduces to a scalar, the result of the application will be a ``DataFrame``. -.. note:: - - Prior to 0.13.1 ``apply`` on a ``Panel`` would only work on ``ufuncs`` (e.g. ``np.sum/np.max``). - .. ipython:: python import pandas.util.testing as tm @@ -1800,8 +1796,6 @@ Series has the :meth:`~Series.searchsorted` method, which works similar to smallest / largest values ~~~~~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 0.14.0 - ``Series`` has the :meth:`~Series.nsmallest` and :meth:`~Series.nlargest` methods which return the smallest or largest :math:`n` values. For a large ``Series`` this can be much faster than sorting the entire Series and calling ``head(n)`` on the result. @@ -2168,8 +2162,6 @@ Selecting columns based on ``dtype`` .. _basics.selectdtypes: -.. versionadded:: 0.14.1 - The :meth:`~DataFrame.select_dtypes` method implements subsetting of columns based on their ``dtype``. diff --git a/doc/source/comparison_with_r.rst b/doc/source/comparison_with_r.rst index 194e022e34c7c0..f895cdc25e6205 100644 --- a/doc/source/comparison_with_r.rst +++ b/doc/source/comparison_with_r.rst @@ -247,8 +247,6 @@ For more details and examples see :ref:`the reshaping documentation |subset|_ ~~~~~~~~~~ -.. versionadded:: 0.13 - The :meth:`~pandas.DataFrame.query` method is similar to the base R ``subset`` function. In R you might want to get the rows of a ``data.frame`` where one column's values are less than another column's values: @@ -277,8 +275,6 @@ For more details and examples see :ref:`the query documentation |with|_ ~~~~~~~~ -.. 
versionadded:: 0.13 - An expression using a data.frame called ``df`` in R with the columns ``a`` and ``b`` would be evaluated using ``with`` like so: diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 32e7a616fe8564..f51c3e679b36f3 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -818,7 +818,7 @@ The :ref:`Concat ` docs. The :ref:`Join ` d df1 = pd.DataFrame(np.random.randn(6, 3), index=rng, columns=['A', 'B', 'C']) df2 = df1.copy() -ignore_index is needed in pandas < v0.13, and depending on df construction +Depending on df construction, ``ignore_index`` may be needed .. ipython:: python diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst index 685a8690a53d55..264bd1de1fc774 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -213,17 +213,18 @@ the rows, applying our ``integrate_f_typed``, and putting this in the zeros arra .. warning:: - In 0.13.0 since ``Series`` has internaly been refactored to no longer sub-class ``ndarray`` - but instead subclass ``NDFrame``, you can **not pass** a ``Series`` directly as a ``ndarray`` typed parameter - to a cython function. Instead pass the actual ``ndarray`` using the ``.values`` attribute of the Series. + You can **not pass** a ``Series`` directly as a ``ndarray`` typed parameter + to a cython function. Instead pass the actual ``ndarray`` using the + ``.values`` attribute of the Series. The reason is that the cython + definition is specific to an ndarray and not the passed Series. - Prior to 0.13.0 + So, do not do this: .. code-block:: python apply_integrate_f(df['a'], df['b'], df['N']) - Use ``.values`` to get the underlying ``ndarray`` + But rather, use ``.values`` to get the underlying ``ndarray`` .. code-block:: python @@ -399,10 +400,8 @@ Read more in the `numba docs `__. .. _enhancingperf.eval: -Expression Evaluation via :func:`~pandas.eval` (Experimental) -------------------------------------------------------------- - -.. versionadded:: 0.13 +Expression Evaluation via :func:`~pandas.eval` +----------------------------------------------- The top-level function :func:`pandas.eval` implements expression evaluation of :class:`~pandas.Series` and :class:`~pandas.DataFrame` objects. @@ -539,10 +538,8 @@ Now let's do the same thing but with comparisons: of type ``bool`` or ``np.bool_``. Again, you should perform these kinds of operations in plain Python. -The ``DataFrame.eval`` method (Experimental) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. versionadded:: 0.13 +The ``DataFrame.eval`` method +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In addition to the top level :func:`pandas.eval` function you can also evaluate an expression in the "context" of a :class:`~pandas.DataFrame`. @@ -646,19 +643,6 @@ whether the query modifies the original frame. Local Variables ~~~~~~~~~~~~~~~ -In pandas version 0.14 the local variable API has changed. In pandas 0.13.x, -you could refer to local variables the same way you would in standard Python. -For example, - -.. code-block:: python - - df = pd.DataFrame(np.random.randn(5, 2), columns=['a', 'b']) - newcol = np.random.randn(len(df)) - df.eval('b + newcol') - - UndefinedVariableError: name 'newcol' is not defined - -As you can see from the exception generated, this syntax is no longer allowed. You must *explicitly reference* any local variable that you want to use in an expression by placing the ``@`` character in front of the name. 
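A minimal sketch of the ``@`` syntax just described (the names ``df`` and ``newcol`` here are illustrative, not part of the patch):

.. code-block:: python

   import numpy as np
   import pandas as pd

   df = pd.DataFrame(np.random.randn(5, 2), columns=['a', 'b'])
   newcol = np.random.randn(len(df))

   # prefix local (non-column) names with '@' so the parser can resolve them
   df.eval('b + @newcol')
   df.query('b < @newcol')
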
For example, diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 53c0b771555f83..e1231b9a4a2007 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -766,8 +766,6 @@ missing values with the ``ffill()`` method. Filtration ---------- -.. versionadded:: 0.12 - The ``filter`` method returns a subset of the original object. Suppose we want to take only elements that belong to groups with a group sum greater than 2. @@ -858,8 +856,6 @@ In this example, we chopped the collection of time series into yearly chunks then independently called :ref:`fillna ` on the groups. -.. versionadded:: 0.14.1 - The ``nlargest`` and ``nsmallest`` methods work on ``Series`` style groupbys: .. ipython:: python @@ -1048,19 +1044,6 @@ Just like for a DataFrame or Series you can call head and tail on a groupby: This shows the first or last n rows from each group. -.. warning:: - - Before 0.14.0 this was implemented with a fall-through apply, - so the result would incorrectly respect the as_index flag: - - .. code-block:: python - - >>> g.head(1): # was equivalent to g.apply(lambda x: x.head(1)) - A B - A - 1 0 1 2 - 5 2 5 6 - .. _groupby.nth: Taking the nth row of each group @@ -1113,8 +1096,6 @@ You can also select multiple rows from each group by specifying multiple nth val Enumerate group items ~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 0.13.0 - To see the order in which each row appears within its group, use the ``cumcount`` method: diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 4687e464905627..a6e7df57be4e50 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -248,8 +248,6 @@ as an attribute: - In any of these cases, standard indexing will still work, e.g. ``s['1']``, ``s['min']``, and ``s['index']`` will access the corresponding element or column. - - The ``Series/Panel`` accesses are available starting in 0.13.0. - If you are using the IPython environment, you may also use tab-completion to see these accessible attributes. @@ -529,7 +527,6 @@ Out of range slice indexes are handled gracefully just as in Python/Numpy. .. ipython:: python # these are allowed in python/numpy. - # Only works in Pandas starting from v0.14.0. x = list('abcdef') x x[4:10] @@ -539,14 +536,8 @@ Out of range slice indexes are handled gracefully just as in Python/Numpy. s.iloc[4:10] s.iloc[8:10] -.. note:: - - Prior to v0.14.0, ``iloc`` would not accept out of bounds indexers for - slices, e.g. a value that exceeds the length of the object being indexed. - - -Note that this could result in an empty axis (e.g. an empty DataFrame being -returned) +Note that using slices that go out of bounds can result in +an empty axis (e.g. an empty DataFrame being returned) .. ipython:: python @@ -745,8 +736,6 @@ Finally, one can also set a seed for ``sample``'s random number generator using Setting With Enlargement ------------------------ -.. versionadded:: 0.13 - The ``.loc/[]`` operations can perform enlargement when setting a non-existant key for that axis. In the ``Series`` case this is effectively an appending operation @@ -1020,8 +1009,6 @@ partial setting via ``.loc`` (but on the contents rather than the axis labels) df2[ df2[1:4] > 0 ] = 3 df2 -.. versionadded:: 0.13 - Where can also accept ``axis`` and ``level`` parameters to align the input when performing the ``where``. @@ -1064,8 +1051,6 @@ as condition and ``other`` argument. The :meth:`~pandas.DataFrame.query` Method (Experimental) --------------------------------------------------------- -.. 
versionadded:: 0.13 - :class:`~pandas.DataFrame` objects have a :meth:`~pandas.DataFrame.query` method that allows selection using an expression. @@ -1506,8 +1491,6 @@ The name, if set, will be shown in the console display: Setting metadata ~~~~~~~~~~~~~~~~ -.. versionadded:: 0.13.0 - Indexes are "mostly immutable", but it is possible to set and change their metadata, like the index ``name`` (or, for ``MultiIndex``, ``levels`` and ``labels``). @@ -1790,7 +1773,7 @@ Evaluation order matters Furthermore, in chained expressions, the order may determine whether a copy is returned or not. If an expression will set values on a copy of a slice, then a ``SettingWithCopy`` -exception will be raised (this raise/warn behavior is new starting in 0.13.0) +warning will be issued. You can control the action of a chained assignment via the option ``mode.chained_assignment``, which can take the values ``['raise','warn',None]``, where showing a warning is the default. diff --git a/doc/source/install.rst b/doc/source/install.rst index f92c43839ee317..8dc8224ea6cb24 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -107,7 +107,7 @@ following command:: To install a specific pandas version:: - conda install pandas=0.13.1 + conda install pandas=0.20.3 To install other packages, IPython for example:: diff --git a/doc/source/io.rst b/doc/source/io.rst index f68358764a40e0..33523ea171f3ae 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1310,8 +1310,6 @@ column widths for contiguous columns: The parser will take care of extra white spaces around the columns so it's ok to have extra separation between the columns in the file. -.. versionadded:: 0.13.0 - By default, ``read_fwf`` will try to infer the file's ``colspecs`` by using the first 100 rows of the file. It can do it only in cases when the columns are aligned and correctly separated by the provided ``delimiter`` (default delimiter @@ -1407,8 +1405,7 @@ Reading columns with a ``MultiIndex`` By specifying list of row locations for the ``header`` argument, you can read in a ``MultiIndex`` for the columns. Specifying non-consecutive -rows will skip the intervening rows. In order to have the pre-0.13 behavior -of tupleizing columns, specify ``tupleize_cols=True``. +rows will skip the intervening rows. .. ipython:: python @@ -1418,7 +1415,7 @@ of tupleizing columns, specify ``tupleize_cols=True``. print(open('mi.csv').read()) pd.read_csv('mi.csv',header=[0,1,2,3],index_col=[0,1]) -Starting in 0.13.0, ``read_csv`` will be able to interpret a more common format +``read_csv`` is also able to interpret a more common format of multi-columns indices. .. ipython:: python @@ -2012,8 +2009,6 @@ The speedup is less noticeable for smaller datasets: Normalization ''''''''''''' -.. versionadded:: 0.13.0 - pandas provides a utility function to take a dict or list of dicts and *normalize* this semi-structured data into a flat table. @@ -2198,8 +2193,6 @@ Reading HTML Content We **highly encourage** you to read the :ref:`HTML Table Parsing gotchas ` below regarding the issues surrounding the BeautifulSoup4/html5lib/lxml parsers. -.. versionadded:: 0.12.0 - The top-level :func:`~pandas.io.html.read_html` function can accept an HTML string/file/URL and will parse HTML tables into list of pandas DataFrames. Let's look at a few examples. 
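To make the call pattern concrete, a minimal sketch (assuming an HTML parser such as ``lxml``, or ``beautifulsoup4`` plus ``html5lib``, is installed; the table markup is invented for the example):

.. code-block:: python

   import pandas as pd

   html = """<table>
     <tr><th>a</th><th>b</th></tr>
     <tr><td>1</td><td>x</td></tr>
     <tr><td>2</td><td>y</td></tr>
   </table>"""

   # read_html returns a list of DataFrames, one per <table> found
   dfs = pd.read_html(html)
   df = dfs[0]
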
@@ -2653,10 +2646,6 @@ of sheet names can simply be passed to ``read_excel`` with no loss in performanc # equivalent using the read_excel function data = read_excel('path_to_file.xls', ['Sheet1', 'Sheet2'], index_col=None, na_values=['NA']) -.. versionadded:: 0.12 - -``ExcelFile`` has been moved to the top level namespace. - .. versionadded:: 0.17 ``read_excel`` can take an ``ExcelFile`` object as input @@ -2716,9 +2705,6 @@ Using a list to get multiple sheets: ``read_excel`` can read more than one sheet, by setting ``sheet_name`` to either a list of sheet names, a list of sheet positions, or ``None`` to read all sheets. - -.. versionadded:: 0.13 - Sheets can be specified by sheet index or sheet name, using an integer or string, respectively. @@ -2866,9 +2852,9 @@ Files with a ``.xls`` extension will be written using ``xlwt`` and those with a ``.xlsx`` extension will be written using ``xlsxwriter`` (if available) or ``openpyxl``. -The DataFrame will be written in a way that tries to mimic the REPL output. One -difference from 0.12.0 is that the ``index_label`` will be placed in the second -row instead of the first. You can get the previous behaviour by setting the +The DataFrame will be written in a way that tries to mimic the REPL output. +The ``index_label`` will be placed in the second +row instead of the first. You can place it in the first row by setting the ``merge_cells`` option in ``to_excel()`` to ``False``: .. code-block:: python @@ -2945,8 +2931,6 @@ Added support for Openpyxl >= 2.2 Excel writer engines '''''''''''''''''''' -.. versionadded:: 0.13 - ``pandas`` chooses an Excel writer via two methods: 1. the ``engine`` keyword argument @@ -3074,14 +3058,19 @@ any pickled pandas object (or any other pickled object) from file: Loading pickled data received from untrusted sources can be unsafe. - See: http://docs.python.org/2.7/library/pickle.html + See: https://docs.python.org/3.6/library/pickle.html .. warning:: - Several internal refactorings, 0.13 (:ref:`Series Refactoring `), and 0.15 (:ref:`Index Refactoring `), - preserve compatibility with pickles created prior to these versions. However, these must - be read with ``pd.read_pickle``, rather than the default python ``pickle.load``. - See `this question `__ + Several internal refactorings have been done while still preserving + compatibility with pickles created with older versions of pandas. However, + for such cases, pickled dataframes, series etc, must be read with + ``pd.read_pickle``, rather than ``pickle.load``. + + See `here `__ + and `here `__ + for some examples of compatibility-breaking changes. See + `this question `__ for a detailed explanation. .. _io.pickle.compression: @@ -3150,9 +3139,7 @@ The default is to 'infer msgpack ------- -.. versionadded:: 0.13.0 - -Starting in 0.13.0, pandas is supporting the ``msgpack`` format for +pandas supports the ``msgpack`` format for object serialization. This is a lightweight portable binary format, similar to binary JSON, that is highly space efficient, and provides good performance both on the writing (serialization), and reading (deserialization). @@ -3424,10 +3411,6 @@ This is also true for the major axis of a ``Panel``: Fixed Format '''''''''''' -.. note:: - - This was prior to 0.13.0 the ``Storer`` format. - The examples above show storing using ``put``, which write the HDF5 to ``PyTables`` in a fixed array format, called the ``fixed`` format. These types of stores are **not** appendable once written (though you can simply remove them and rewrite). 
Nor are they **queryable**; they must be @@ -3460,8 +3443,6 @@ other sessions. In addition, delete & query type operations are supported. This format is specified by ``format='table'`` or ``format='t'`` to ``append`` or ``put`` or ``to_hdf`` -.. versionadded:: 0.13 - This format can be set as an option as well ``pd.set_option('io.hdf.default_format','table')`` to enable ``put/append/to_hdf`` to by default store in the ``table`` format. @@ -3765,9 +3746,7 @@ space. These are in terms of the total number of rows in a table. Using timedelta64[ns] +++++++++++++++++++++ -.. versionadded:: 0.13 - -Beginning in 0.13.0, you can store and query using the ``timedelta64[ns]`` type. Terms can be +You can store and query using the ``timedelta64[ns]`` type. Terms can be specified in the format: ``()``, where float may be signed (and fractional), and unit can be ``D,s,ms,us,ns`` for the timedelta. Here's an example: @@ -3889,8 +3868,6 @@ The default is 50,000 rows returned in a chunk. .. note:: - .. versionadded:: 0.12.0 - You can also use the iterator with ``read_hdf`` which will open, then automatically close the store when finished iterating. @@ -4603,8 +4580,6 @@ included in Python's standard library by default. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs `__. -.. versionadded:: 0.14.0 - If SQLAlchemy is not installed, a fallback is only provided for sqlite (and for mysql for backwards compatibility, but this is deprecated and will be removed in a future version). @@ -4937,8 +4912,6 @@ Full documentation can be found `here `__ Stata Format ------------ -.. versionadded:: 0.12.0 - .. _io.stata_writer: Writing to Stata format diff --git a/doc/source/merging.rst b/doc/source/merging.rst index d956f1ca54e6b8..a5ee1b1a9384cc 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -1053,8 +1053,6 @@ As you can see, this drops any rows where there was no match. Joining a single Index to a Multi-index ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 0.14.0 - You can join a singly-indexed ``DataFrame`` with a level of a multi-indexed ``DataFrame``. The level will match on the name of the index of the singly-indexed frame against a level name of the multi-indexed frame. diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index 64a321d67a825f..65b411ccd4af26 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -263,8 +263,6 @@ and ``bfill()`` is equivalent to ``fillna(method='bfill')`` Filling with a PandasObject ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 0.12 - You can also fillna using a dict or Series that is alignable. The labels of the dict or index of the Series must match the columns of the frame you wish to fill. The use case of this is to fill a DataFrame with the mean of that column. @@ -280,8 +278,6 @@ use case of this is to fill a DataFrame with the mean of that column. dff.fillna(dff.mean()) dff.fillna(dff.mean()['B':'C']) -.. versionadded:: 0.13 - Same result as above, but is aligning the 'fill' value which is a Series in this case. @@ -320,11 +316,6 @@ examined :ref:`in the API `. Interpolation ~~~~~~~~~~~~~ -.. versionadded:: 0.13.0 - - :meth:`~pandas.DataFrame.interpolate`, and :meth:`~pandas.Series.interpolate` have - revamped interpolation methods and functionality. - .. versionadded:: 0.17.0 The ``limit_direction`` keyword argument was added. 
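A minimal sketch of the aligned ``fillna`` and the ``interpolate`` behavior described in the missing-data hunks above (the frame and its labels are illustrative):

.. code-block:: python

   import numpy as np
   import pandas as pd

   dff = pd.DataFrame(np.random.randn(10, 3), columns=list('ABC'))
   dff.iloc[3:5, 0] = np.nan
   dff.iloc[4:6, 1] = np.nan

   # a Series of column means aligns on the column labels
   dff.fillna(dff.mean())

   # restrict the fill to a subset of columns by slicing that Series
   dff.fillna(dff.mean()['B':'C'])

   # interpolate, allowing a capped fill to extend in both directions
   dff.interpolate(limit=1, limit_direction='both')
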
diff --git a/doc/source/options.rst b/doc/source/options.rst index 51d02bc89692a6..1592caf90546c7 100644 --- a/doc/source/options.rst +++ b/doc/source/options.rst @@ -306,7 +306,7 @@ display.float_format None The callable should accept a fl See core.format.EngFormatter for an example. display.large_repr truncate For DataFrames exceeding max_rows/max_cols, the repr (and HTML repr) can show - a truncated table (the default from 0.13), + a truncated table (the default), or switch to the view from df.info() (the behaviour in earlier versions of pandas). allowable settings, ['truncate', 'info'] diff --git a/doc/source/text.rst b/doc/source/text.rst index e3e4b24d17f448..85b8aa6aa18578 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -211,8 +211,6 @@ Extracting Substrings Extract first match in each subject (extract) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. versionadded:: 0.13.0 - .. warning:: In version 0.18.0, ``extract`` gained the ``expand`` argument. When diff --git a/doc/source/timedeltas.rst b/doc/source/timedeltas.rst index 07effcfdff33b7..daa2c262c8c860 100644 --- a/doc/source/timedeltas.rst +++ b/doc/source/timedeltas.rst @@ -242,8 +242,6 @@ Numeric reduction operation for ``timedelta64[ns]`` will return ``Timedelta`` ob Frequency Conversion -------------------- -.. versionadded:: 0.13 - Timedelta Series, ``TimedeltaIndex``, and ``Timedelta`` scalars can be converted to other 'frequencies' by dividing by another timedelta, or by astyping to a specific timedelta type. These operations yield Series and propagate ``NaT`` -> ``nan``. Note that division by the numpy scalar is true division, while astyping is equivalent of floor division. diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index aded5e4402df2b..c86c58c3183f6f 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -177,7 +177,7 @@ you can pass the ``dayfirst`` flag: .. note:: Specifying a ``format`` argument will potentially speed up the conversion - considerably and on versions later then 0.13.0 explicitly specifying + considerably and explicitly specifying a format string of '%Y%m%d' takes a faster path still. If you pass a single string to ``to_datetime``, it returns single ``Timestamp``. @@ -1946,9 +1946,11 @@ These can easily be converted to a ``PeriodIndex`` Time Zone Handling ------------------ -Pandas provides rich support for working with timestamps in different time zones using ``pytz`` and ``dateutil`` libraries. -``dateutil`` support is new in 0.14.1 and currently only supported for fixed offset and tzfile zones. The default library is ``pytz``. -Support for ``dateutil`` is provided for compatibility with other applications e.g. if you use ``dateutil`` in other python packages. +Pandas provides rich support for working with timestamps in different time +zones using ``pytz`` and ``dateutil`` libraries. ``dateutil`` currently is only +supported for fixed offset and tzfile zones. The default library is ``pytz``. +Support for ``dateutil`` is provided for compatibility with other +applications e.g. if you use ``dateutil`` in other python packages. Working with Time Zones ~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index c637246537ca1c..839390c8778aa1 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -512,8 +512,6 @@ Compare to: Area Plot ~~~~~~~~~ -.. versionadded:: 0.14 - You can create area plots with :meth:`Series.plot.area` and :meth:`DataFrame.plot.area`. 
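For context, a minimal sketch (random data and a matplotlib backend are assumed):

.. code-block:: python

   import numpy as np
   import pandas as pd

   df = pd.DataFrame(np.abs(np.random.randn(10, 4)), columns=list('abcd'))
   df.plot.area()                # stacking behavior is described just below
   df.plot.area(stacked=False)
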
Area plots are stacked by default. To produce stacked area plot, each column must be either all positive or all negative values. @@ -550,8 +548,6 @@ To produce an unstacked plot, pass ``stacked=False``. Alpha value is set to 0.5 Scatter Plot ~~~~~~~~~~~~ -.. versionadded:: 0.13 - Scatter plot can be drawn by using the :meth:`DataFrame.plot.scatter` method. Scatter plot requires numeric columns for x and y axis. These can be specified by ``x`` and ``y`` keywords each. @@ -619,8 +615,6 @@ See the :meth:`scatter ` method and the Hexagonal Bin Plot ~~~~~~~~~~~~~~~~~~ -.. versionadded:: 0.14 - You can create hexagonal bin plots with :meth:`DataFrame.plot.hexbin`. Hexbin plots can be a useful alternative to scatter plots if your data are too dense to plot each point individually. @@ -682,8 +676,6 @@ See the :meth:`hexbin ` method and the Pie plot ~~~~~~~~ -.. versionadded:: 0.14 - You can create a pie plot with :meth:`DataFrame.plot.pie` or :meth:`Series.plot.pie`. If your data includes any ``NaN``, they will be automatically filled with 0. A ``ValueError`` will be raised if there are any negative values in your data. @@ -1365,8 +1357,6 @@ Another option is passing an ``ax`` argument to :meth:`Series.plot` to plot on a Plotting With Error Bars ~~~~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 0.14 - Plotting with error bars is now supported in the :meth:`DataFrame.plot` and :meth:`Series.plot` Horizontal and vertical errorbars can be supplied to the ``xerr`` and ``yerr`` keyword arguments to :meth:`~DataFrame.plot()`. The error values can be specified using a variety of formats. @@ -1407,8 +1397,6 @@ Here is an example of one way to easily plot group means with standard deviation Plotting Tables ~~~~~~~~~~~~~~~ -.. versionadded:: 0.14 - Plotting with matplotlib table is now supported in :meth:`DataFrame.plot` and :meth:`Series.plot` with a ``table`` keyword. The ``table`` keyword can accept ``bool``, :class:`DataFrame` or :class:`Series`. The simple way to draw a table is to specify ``table=True``. Data will be transposed to meet matplotlib's default layout. .. ipython:: python @@ -1585,10 +1573,6 @@ available in matplotlib. Although this formatting does not provide the same level of refinement you would get when plotting via pandas, it can be faster when plotting a large number of points. -.. note:: - - The speed up for large data sets only applies to pandas 0.14.0 and later. - .. 
ipython:: python :suppress: From 25d529905521c4710c13b9a2c189a39479c529cb Mon Sep 17 00:00:00 2001 From: s-weigand Date: Wed, 6 Sep 2017 14:03:39 +0200 Subject: [PATCH 046/188] BUG: Plotting Timedelta on y-axis #16953 (#17430) * implemented fix for GH issue #16953 * added tests for fix of issue #16953 * changed comments for git issue to pandas style GH# * changed linelength in tests, so all lines are less than 80 characters * added whatsnew entry * swaped conversion and filtering of values, for plot to also work with object dtypes * refomated code, so len(line) < 80 * changed whatsnew with timedelta and datetime dtypes * added support for datetimetz and extended tests * added reason to pytest.mark.xfail --- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/plotting/_core.py | 8 ++- pandas/tests/plotting/test_frame.py | 76 +++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 81e52266f972e5..1f3bf00c877670 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -432,7 +432,7 @@ I/O Plotting ^^^^^^^^ - Bug in plotting methods using ``secondary_y`` and ``fontsize`` not setting secondary axis font size (:issue:`12565`) - +- Bug when plotting ``timedelta`` and ``datetime`` dtypes on y-axis (:issue:`16953`) Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index e5b9497993172d..a0b7e93efd05cb 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -342,7 +342,13 @@ def _compute_plot_data(self): label = 'None' data = data.to_frame(name=label) - numeric_data = data._convert(datetime=True)._get_numeric_data() + # GH16953, _convert is needed as fallback, for ``Series`` + # with ``dtype == object`` + data = data._convert(datetime=True, timedelta=True) + numeric_data = data.select_dtypes(include=[np.number, + "datetime", + "datetimetz", + "timedelta"]) try: is_empty = numeric_data.empty diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 67098529a01119..f3b287a8889c37 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -380,6 +380,82 @@ def test_subplots_timeseries(self): self._check_ticks_props(ax, xlabelsize=7, xrot=45, ylabelsize=7) + def test_subplots_timeseries_y_axis(self): + # GH16953 + data = {"numeric": np.array([1, 2, 5]), + "timedelta": [pd.Timedelta(-10, unit="s"), + pd.Timedelta(10, unit="m"), + pd.Timedelta(10, unit="h")], + "datetime_no_tz": [pd.to_datetime("2017-08-01 00:00:00"), + pd.to_datetime("2017-08-01 02:00:00"), + pd.to_datetime("2017-08-02 00:00:00")], + "datetime_all_tz": [pd.to_datetime("2017-08-01 00:00:00", + utc=True), + pd.to_datetime("2017-08-01 02:00:00", + utc=True), + pd.to_datetime("2017-08-02 00:00:00", + utc=True)], + "text": ["This", "should", "fail"]} + testdata = DataFrame(data) + + ax_numeric = testdata.plot(y="numeric") + assert (ax_numeric.get_lines()[0].get_data()[1] == + testdata["numeric"].values).all() + ax_timedelta = testdata.plot(y="timedelta") + assert (ax_timedelta.get_lines()[0].get_data()[1] == + testdata["timedelta"].values).all() + ax_datetime_no_tz = testdata.plot(y="datetime_no_tz") + assert (ax_datetime_no_tz.get_lines()[0].get_data()[1] == + testdata["datetime_no_tz"].values).all() + ax_datetime_all_tz = testdata.plot(y="datetime_all_tz") + assert (ax_datetime_all_tz.get_lines()[0].get_data()[1] == + testdata["datetime_all_tz"].values).all() + 
with pytest.raises(TypeError): + testdata.plot(y="text") + + @pytest.mark.xfail(reason='not support for period, categorical, ' + 'datetime_mixed_tz') + def test_subplots_timeseries_y_axis_not_supported(self): + """ + This test will fail for: + period: + since period isn't yet implemented in ``select_dtypes`` + and because it will need a custom value converter + + tick formater (as was done for x-axis plots) + + categorical: + because it will need a custom value converter + + tick formater (also doesn't work for x-axis, as of now) + + datetime_mixed_tz: + because of the way how pandas handels ``Series`` of + ``datetime`` objects with different timezone, + generally converting ``datetime`` objects in a tz-aware + form could help with this problem + """ + data = {"numeric": np.array([1, 2, 5]), + "period": [pd.Period('2017-08-01 00:00:00', freq='H'), + pd.Period('2017-08-01 02:00', freq='H'), + pd.Period('2017-08-02 00:00:00', freq='H')], + "categorical": pd.Categorical(["c", "b", "a"], + categories=["a", "b", "c"], + ordered=False), + "datetime_mixed_tz": [pd.to_datetime("2017-08-01 00:00:00", + utc=True), + pd.to_datetime("2017-08-01 02:00:00"), + pd.to_datetime("2017-08-02 00:00:00")]} + testdata = pd.DataFrame(data) + ax_period = testdata.plot(x="numeric", y="period") + assert (ax_period.get_lines()[0].get_data()[1] == + testdata["period"].values).all() + ax_categorical = testdata.plot(x="numeric", y="categorical") + assert (ax_categorical.get_lines()[0].get_data()[1] == + testdata["categorical"].values).all() + ax_datetime_mixed_tz = testdata.plot(x="numeric", + y="datetime_mixed_tz") + assert (ax_datetime_mixed_tz.get_lines()[0].get_data()[1] == + testdata["datetime_mixed_tz"].values).all() + @pytest.mark.slow def test_subplots_layout(self): # GH 6667 From 84a39f99013f238a2e1df9ba63bdaa8a3fd00c08 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 6 Sep 2017 08:23:06 -0400 Subject: [PATCH 047/188] COMPAT: handle pyarrow deprecation of timestamps_to_ms in .from_pandas with pyarrow < 0.6.0 (#17447) closes #17438 --- ci/requirements-3.5.sh | 2 +- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/io/parquet.py | 18 ++++++++++++++---- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/ci/requirements-3.5.sh b/ci/requirements-3.5.sh index 33db9c28c78a9c..d694ad3679ac12 100644 --- a/ci/requirements-3.5.sh +++ b/ci/requirements-3.5.sh @@ -8,4 +8,4 @@ echo "install 35" conda remove -n pandas python-dateutil --force pip install python-dateutil -conda install -n pandas -c conda-forge feather-format pyarrow=0.4.1 +conda install -n pandas -c conda-forge feather-format pyarrow=0.5.0 diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 1f3bf00c877670..b24a6f067cee4c 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -125,7 +125,7 @@ Other Enhancements - :func:`DataFrame.select_dtypes` now accepts scalar values for include/exclude as well as list-like. (:issue:`16855`) - :func:`date_range` now accepts 'YS' in addition to 'AS' as an alias for start of year (:issue:`9313`) - :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`) -- Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here `. +- Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here `. 
(:issue:`15838`, :issue:`17438`) - :func:`DataFrame.add_prefix` and :func:`DataFrame.add_suffix` now accept strings containing the '%' character. (:issue:`17151`) - `read_*` methods can now infer compression from non-string paths, such as ``pathlib.Path`` objects (:issue:`17206`). - :func:`pd.read_sas()` now recognizes much more of the most frequently used date (datetime) formats in SAS7BDAT files (:issue:`15871`). diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 09603fd6fdcce7..4b507b7f5df6f7 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -58,13 +58,23 @@ def __init__(self): "\nor via pip\n" "pip install -U pyarrow\n") + self._pyarrow_lt_050 = LooseVersion(pyarrow.__version__) < '0.5.0' + self._pyarrow_lt_060 = LooseVersion(pyarrow.__version__) < '0.6.0' self.api = pyarrow - def write(self, df, path, compression='snappy', **kwargs): + def write(self, df, path, compression='snappy', + coerce_timestamps='ms', **kwargs): path, _, _ = get_filepath_or_buffer(path) - table = self.api.Table.from_pandas(df, timestamps_to_ms=True) - self.api.parquet.write_table( - table, path, compression=compression, **kwargs) + if self._pyarrow_lt_060: + table = self.api.Table.from_pandas(df, timestamps_to_ms=True) + self.api.parquet.write_table( + table, path, compression=compression, **kwargs) + + else: + table = self.api.Table.from_pandas(df) + self.api.parquet.write_table( + table, path, compression=compression, + coerce_timestamps=coerce_timestamps, **kwargs) def read(self, path): path, _, _ = get_filepath_or_buffer(path) From d4577911c750f2f48f760ce451d413116bed72da Mon Sep 17 00:00:00 2001 From: topper-123 Date: Wed, 6 Sep 2017 15:55:12 +0100 Subject: [PATCH 048/188] DOC/TST: Add examples to MultiIndex.get_level_values + related changes (#17414) --- pandas/core/indexes/base.py | 12 ++++++++++-- pandas/core/indexes/multi.py | 23 +++++++++++++++++++++-- pandas/tests/indexes/test_base.py | 6 ++++++ 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6a30eaefaaae76..a9098126a38e3d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2529,15 +2529,23 @@ def set_value(self, arr, key, value): def _get_level_values(self, level): """ Return an Index of values for requested level, equal to the length - of the index + of the index. Parameters ---------- - level : int + level : int or str + ``level`` is either the integer position of the level in the + MultiIndex, or the name of the level. Returns ------- values : Index + ``self``, as there is only one level in the Index. + + See also + --------- + pandas.MultiIndex.get_level_values : get values for a level of a + MultiIndex """ self._validate_index_level(level) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index d7d5b6d128a2c1..8b2cf0e7c0b407 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -882,15 +882,34 @@ def _get_level_values(self, level): def get_level_values(self, level): """ Return vector of label values for requested level, - equal to the length of the index + equal to the length of the index. Parameters ---------- - level : int or level name + level : int or str + ``level`` is either the integer position of the level in the + MultiIndex, or the name of the level. Returns ------- values : Index + ``values`` is a level of this MultiIndex converted to + a single :class:`Index` (or subclass thereof). 
+ + Examples + --------- + + Create a MultiIndex: + + >>> mi = pd.MultiIndex.from_arrays((list('abc'), list('def'))) + >>> mi.names = ['level_1', 'level_2'] + + Get level values by supplying level as either integer or name: + + >>> mi.get_level_values(0) + Index(['a', 'b', 'c'], dtype='object', name='level_1') + >>> mi.get_level_values('level_2') + Index(['d', 'e', 'f'], dtype='object', name='level_2') """ level = self._get_level_number(level) values = self._get_level_values(level) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index aa32e75ba0d585..f96dbdcfb8acfe 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1438,6 +1438,12 @@ def test_get_level_values(self): result = self.strIndex.get_level_values(0) tm.assert_index_equal(result, self.strIndex) + # test for name (GH 17414) + index_with_name = self.strIndex.copy() + index_with_name.name = 'a' + result = index_with_name.get_level_values('a') + tm.assert_index_equal(result, index_with_name) + def test_slice_keep_name(self): idx = Index(['a', 'b'], name='asdf') assert idx.name == idx[1:].name From b8694460dbe3d4d82adb757a37e5f515356b1cde Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 6 Sep 2017 17:14:05 -0700 Subject: [PATCH 049/188] Dont re-pin total_seconds as it is already implemented (#17432) --- pandas/_libs/tslib.pyx | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 8fbc606ccdfe24..962c2ef3956a10 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -856,6 +856,9 @@ class NaTType(_NaT): return (__nat_unpickle, (None, )) def total_seconds(self): + """ + Total duration of timedelta in seconds (to ns precision) + """ # GH 10939 return np.nan @@ -3890,8 +3893,9 @@ for field in fields: _nat_methods = ['date', 'now', 'replace', 'to_pydatetime', 'today', 'round', 'floor', 'ceil', 'tz_convert', 'tz_localize'] -_nan_methods = ['weekday', 'isoweekday', 'total_seconds'] -_implemented_methods = ['to_datetime', 'to_datetime64', 'isoformat'] +_nan_methods = ['weekday', 'isoweekday'] +_implemented_methods = [ + 'to_datetime', 'to_datetime64', 'isoformat', 'total_seconds'] _implemented_methods.extend(_nat_methods) _implemented_methods.extend(_nan_methods) From 3a12687c4e91501d805fc71c37e9ce0a496b48bf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 6 Sep 2017 17:46:50 -0700 Subject: [PATCH 050/188] BUG: Return local Timestamp.weekday_name attribute (#17354) (#17377) --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/_libs/tslib.pyx | 22 ++++++++++++++++++---- pandas/tests/scalar/test_timestamp.py | 8 ++++++++ 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index b24a6f067cee4c..553e622b8560eb 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -399,6 +399,7 @@ Conversion - Fixed the return type of ``IntervalIndex.is_non_overlapping_monotonic`` to be a Python ``bool`` for consistency with similar attributes/methods. Previously returned a ``numpy.bool_``. 
(:issue:`17237`) - Bug in ``IntervalIndex.is_non_overlapping_monotonic`` when intervals are closed on both sides and overlap at a point (:issue:`16560`) - Bug in :func:`Series.fillna` returns frame when ``inplace=True`` and ``value`` is dict (:issue:`16156`) +- Bug in :attr:`Timestamp.weekday_name` returning a UTC-based weekday name when localized to a timezone (:issue:`17354`) Indexing ^^^^^^^^ diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 962c2ef3956a10..f31be9502499f1 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -532,9 +532,7 @@ class Timestamp(_Timestamp): @property def weekday_name(self): - out = get_date_name_field( - np.array([self.value], dtype=np.int64), 'weekday_name') - return out[0] + return self._get_named_field('weekday_name') @property def dayofyear(self): @@ -1269,13 +1267,29 @@ cdef class _Timestamp(datetime): # same timezone if specified) return datetime.__sub__(self, other) - cpdef _get_field(self, field): + cdef int64_t _maybe_convert_value_to_local(self): + """Convert UTC i8 value to local i8 value if tz exists""" + cdef: + int64_t val val = self.value if self.tz is not None and not _is_utc(self.tz): val = tz_convert_single(self.value, 'UTC', self.tz) + return val + + cpdef _get_field(self, field): + cdef: + int64_t val + val = self._maybe_convert_value_to_local() out = get_date_field(np.array([val], dtype=np.int64), field) return int(out[0]) + cpdef _get_named_field(self, field): + cdef: + int64_t val + val = self._maybe_convert_value_to_local() + out = get_date_name_field(np.array([val], dtype=np.int64), field) + return out[0] + cpdef _get_start_end_field(self, field): month_kw = self.freq.kwds.get( 'startingMonth', self.freq.kwds.get( diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py index 7cd1a7db0f9fe9..8d47ce4802ac65 100644 --- a/pandas/tests/scalar/test_timestamp.py +++ b/pandas/tests/scalar/test_timestamp.py @@ -555,6 +555,14 @@ def check(value, equal): for end in ends: assert getattr(ts, end) + @pytest.mark.parametrize('data, expected', + [(Timestamp('2017-08-28 23:00:00'), 'Monday'), + (Timestamp('2017-08-28 23:00:00', tz='EST'), + 'Monday')]) + def test_weekday_name(self, data, expected): + # GH 17354 + assert data.weekday_name == expected + def test_pprint(self): # GH12622 import pprint From fd137f537051ad98ca9a9c069827c72a8b9e7543 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Thu, 7 Sep 2017 02:47:43 +0200 Subject: [PATCH 051/188] BUG: intersection of decreasing RangeIndexes (#17374) closes #17296 --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/indexes/range.py | 22 +++++++++++++--------- pandas/tests/indexes/test_range.py | 15 +++++++++++++++ 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 553e622b8560eb..f7cd8230c8b9b5 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -417,6 +417,7 @@ Indexing - Bug in ``.iloc`` when used with inplace addition or assignment and an int indexer on a ``MultiIndex`` causing the wrong indexes to be read from and written to (:issue:`17148`) - Bug in ``.isin()`` in which checking membership in empty ``Series`` objects raised an error (:issue:`16991`) - Bug in ``CategoricalIndex`` reindexing in which specified indices containing duplicates were not being respected (:issue:`17323`) +- Bug in intersection of ``RangeIndex`` with negative step (:issue:`17296`) I/O ^^^ diff --git a/pandas/core/indexes/range.py 
b/pandas/core/indexes/range.py index 82412d3a7ef57a..b759abaed4e564 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -324,12 +324,13 @@ def intersection(self, other): if not len(self) or not len(other): return RangeIndex._simple_new(None) + first = self[::-1] if self._step < 0 else self + second = other[::-1] if other._step < 0 else other + # check whether intervals intersect # deals with in- and decreasing ranges - int_low = max(min(self._start, self._stop + 1), - min(other._start, other._stop + 1)) - int_high = min(max(self._stop, self._start + 1), - max(other._stop, other._start + 1)) + int_low = max(first._start, second._start) + int_high = min(first._stop, second._stop) if int_high <= int_low: return RangeIndex._simple_new(None) @@ -337,21 +338,24 @@ def intersection(self, other): # solve intersection problem # performance hint: for identical step sizes, could use # cheaper alternative - gcd, s, t = self._extended_gcd(self._step, other._step) + gcd, s, t = first._extended_gcd(first._step, second._step) # check whether element sets intersect - if (self._start - other._start) % gcd: + if (first._start - second._start) % gcd: return RangeIndex._simple_new(None) # calculate parameters for the RangeIndex describing the # intersection disregarding the lower bounds - tmp_start = self._start + (other._start - self._start) * \ - self._step // gcd * s - new_step = self._step * other._step // gcd + tmp_start = first._start + (second._start - first._start) * \ + first._step // gcd * s + new_step = first._step * second._step // gcd new_index = RangeIndex(tmp_start, int_high, new_step, fastpath=True) # adjust index to limiting interval new_index._start = new_index._min_fitting_element(int_low) + + if (self._step < 0 and other._step < 0) is not (new_index._step < 0): + new_index = new_index[::-1] return new_index def _min_fitting_element(self, lower_limit): diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 5ecf467b57fc5c..06c8f0ee392c77 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -610,6 +610,21 @@ def test_intersection(self): other.values))) tm.assert_index_equal(result, expected) + # reversed (GH 17296) + result = other.intersection(self.index) + tm.assert_index_equal(result, expected) + + # GH 17296: intersect two decreasing RangeIndexes + first = RangeIndex(10, -2, -2) + other = RangeIndex(5, -4, -1) + expected = first.astype(int).intersection(other.astype(int)) + result = first.intersection(other).astype(int) + tm.assert_index_equal(result, expected) + + # reversed + result = other.intersection(first).astype(int) + tm.assert_index_equal(result, expected) + index = RangeIndex(5) # intersect of non-overlapping indices From 93e23a71f583920b46b4bb20e99a9a5e73685c47 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 6 Sep 2017 17:51:50 -0700 Subject: [PATCH 052/188] Remove property that re-computed microsecond (#17331) --- asv_bench/benchmarks/timestamp.py | 60 +++++++++++++++++++++++++++++++ doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/_libs/period.pyx | 1 + pandas/_libs/tslib.pyx | 4 --- 4 files changed, 62 insertions(+), 5 deletions(-) create mode 100644 asv_bench/benchmarks/timestamp.py diff --git a/asv_bench/benchmarks/timestamp.py b/asv_bench/benchmarks/timestamp.py new file mode 100644 index 00000000000000..066479b22739a8 --- /dev/null +++ b/asv_bench/benchmarks/timestamp.py @@ -0,0 +1,60 @@ +from .pandas_vb_common import * +from pandas import to_timedelta, Timestamp + + 
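+# asv discovers benchmark classes in this module and repeatedly calls each
+# time_* method, so each body should contain only the attribute access
+# being measured.
+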
+class TimestampProperties(object): + goal_time = 0.2 + + def setup(self): + self.ts = Timestamp('2017-08-25 08:16:14') + + def time_tz(self): + self.ts.tz + + def time_offset(self): + self.ts.offset + + def time_dayofweek(self): + self.ts.dayofweek + + def time_weekday_name(self): + self.ts.weekday_name + + def time_dayofyear(self): + self.ts.dayofyear + + def time_week(self): + self.ts.week + + def time_quarter(self): + self.ts.quarter + + def time_days_in_month(self): + self.ts.days_in_month + + def time_freqstr(self): + self.ts.freqstr + + def time_is_month_start(self): + self.ts.is_month_start + + def time_is_month_end(self): + self.ts.is_month_end + + def time_is_quarter_start(self): + self.ts.is_quarter_start + + def time_is_quarter_end(self): + self.ts.is_quarter_end + + def time_is_year_start(self): + self.ts.is_quarter_end + + def time_is_year_end(self): + self.ts.is_quarter_end + + def time_is_leap_year(self): + self.ts.is_quarter_end + + def time_microsecond(self): + self.ts.microsecond diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index f7cd8230c8b9b5..33a6db18db3cad 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -382,7 +382,7 @@ Performance Improvements - Improved performance of instantiating :class:`SparseDataFrame` (:issue:`16773`) - :attr:`Series.dt` no longer performs frequency inference, yielding a large speedup when accessing the attribute (:issue:`17210`) - +- :attr:`Timestamp.microsecond` no longer re-computes on attribute access (:issue:`17331`) .. _whatsnew_0210.bug_fixes: diff --git a/pandas/_libs/period.pyx b/pandas/_libs/period.pyx index 816b7ebfff86de..0ade8f9a6dde5b 100644 --- a/pandas/_libs/period.pyx +++ b/pandas/_libs/period.pyx @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from datetime import datetime, date, timedelta import operator diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index f31be9502499f1..a7b33c669a8b8c 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -544,10 +544,6 @@ class Timestamp(_Timestamp): weekofyear = week - @property - def microsecond(self): - return self._get_field('us') - @property def quarter(self): return self._get_field('q') From 20fee85ede7f2052f855b8f3445cd1ffc17ee0c3 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Thu, 7 Sep 2017 02:00:49 +0100 Subject: [PATCH 053/188] cleaned references to pandas v0.15 and v0.16 in docs (#17442) --- doc/source/10min.rst | 2 +- doc/source/advanced.rst | 15 ++------ doc/source/basics.rst | 8 ++-- doc/source/categorical.rst | 64 +++++--------------------------- doc/source/comparison_with_r.rst | 2 - doc/source/computation.rst | 7 +--- doc/source/cookbook.rst | 6 --- doc/source/dsintro.rst | 2 - doc/source/gotchas.rst | 4 +- doc/source/indexing.rst | 14 ------- doc/source/install.rst | 20 ++++------ doc/source/io.rst | 41 ++++---------------- doc/source/remote_data.rst | 11 +++--- doc/source/reshaping.rst | 4 +- doc/source/sparse.rst | 2 - doc/source/timedeltas.rst | 26 +++++-------- doc/source/visualization.rst | 4 -- doc/source/whatsnew/v0.21.0.txt | 1 + 18 files changed, 53 insertions(+), 180 deletions(-) diff --git a/doc/source/10min.rst b/doc/source/10min.rst index def49a641a0ff8..ef6b2d6ef2c904 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -655,7 +655,7 @@ the quarter end: Categoricals ------------ -Since version 0.15, pandas can include categorical data in a ``DataFrame``. For full docs, see the +pandas can include categorical data in a ``DataFrame``. 
For full docs, see the
 :ref:`categorical introduction ` and the :ref:`API documentation `.
 
 .. ipython:: python
 
diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst
index 4af476cd5a7e12..3f145cf9556645 100644
--- a/doc/source/advanced.rst
+++ b/doc/source/advanced.rst
@@ -26,12 +26,6 @@ See the :ref:`Indexing and Selecting Data ` for general indexing docum
    should be avoided. See :ref:`Returning a View versus Copy `
 
-.. warning::
-
-   In 0.15.0 ``Index`` has internally been refactored to no longer sub-class ``ndarray``
-   but instead subclass ``PandasObject``, similarly to the rest of the pandas objects. This should be
-   a transparent change with only very limited API implications (See the :ref:`Internal Refactoring `)
-
 See the :ref:`cookbook` for some advanced strategies
 
 .. _advanced.hierarchical:
 
@@ -638,12 +632,9 @@ In the following sub-sections we will highlight some other index types.
 
 CategoricalIndex
 ~~~~~~~~~~~~~~~~
 
-.. versionadded:: 0.16.1
-
-We introduce a ``CategoricalIndex``, a new type of index object that is useful for supporting
-indexing with duplicates. This is a container around a ``Categorical`` (introduced in v0.15.0)
-and allows efficient indexing and storage of an index with a large number of duplicated elements. Prior to 0.16.1,
-setting the index of a ``DataFrame/Series`` with a ``category`` dtype would convert this to regular object-based ``Index``.
+``CategoricalIndex`` is a type of index that is useful for supporting
+indexing with duplicates. This is a container around a ``Categorical``
+and allows efficient indexing and storage of an index with a large number of duplicated elements.
 
 .. ipython:: python
 
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
index 5880703b1d2711..42c28df3a6030f 100644
--- a/doc/source/basics.rst
+++ b/doc/source/basics.rst
@@ -719,8 +719,6 @@ on an entire ``DataFrame`` or ``Series``, row- or column-wise, or elementwise.
 Tablewise Function Application
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. versionadded:: 0.16.2
-
 ``DataFrames`` and ``Series`` can of course just be passed into functions.
 However, if the function needs to be called in a chain, consider using
 the :meth:`~DataFrame.pipe` method. Compare the following
 
@@ -1860,8 +1858,10 @@ dtypes
 ------
 
 The main types stored in pandas objects are ``float``, ``int``, ``bool``,
-``datetime64[ns]`` and ``datetime64[ns, tz]`` (in >= 0.17.0), ``timedelta[ns]``, ``category`` (in >= 0.15.0), and ``object``. In addition these dtypes
-have item sizes, e.g. ``int64`` and ``int32``. See :ref:`Series with TZ ` for more detail on ``datetime64[ns, tz]`` dtypes.
+``datetime64[ns]`` and ``datetime64[ns, tz]`` (in >= 0.17.0), ``timedelta[ns]``,
+``category`` and ``object``. In addition these dtypes have item sizes, e.g.
+``int64`` and ``int32``. See :ref:`Series with TZ `
+for more detail on ``datetime64[ns, tz]`` dtypes.
 
 A convenient :attr:`~DataFrame.dtypes` attribute for DataFrames returns a Series
 with the data type of each column.
 
diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
index 02d7920bc4a84e..8835c4a1533d0c 100644
--- a/doc/source/categorical.rst
+++ b/doc/source/categorical.rst
@@ -16,13 +16,6 @@ Categorical Data
 ****************
 
-.. versionadded:: 0.15
-
-.. note::
-    While there was `pandas.Categorical` in earlier versions, the ability to use
-    categorical data in `Series` and `DataFrame` is new.
-
-
 This is an introduction to pandas categorical data type, including a short comparison
 with R's ``factor``.
 
@@ -295,10 +288,6 @@ Sorting and Order
 
 .. _categorical.sort:
 
-.. warning::
-
-    The default for construction has changed in v0.16.0 to ``ordered=False``, from the prior implicit ``ordered=True``
-
 If categorical data is ordered (``s.cat.ordered == True``), then the order of the categories has a
 meaning and certain operations are possible. If the categorical is unordered, ``.min()/.max()`` will raise a `TypeError`.
 
@@ -803,13 +792,11 @@ Following table summarizes the results of ``Categoricals`` related concatenation
 Getting Data In/Out
 -------------------
 
-.. versionadded:: 0.15.2
+You can write data that contains ``category`` dtypes to a ``HDFStore``.
+See :ref:`here ` for an example and caveats.
 
-Writing data (`Series`, `Frames`) to a HDF store that contains a ``category`` dtype was implemented
-in 0.15.2. See :ref:`here ` for an example and caveats.
-
-Writing data to and reading data from *Stata* format files was implemented in
-0.15.2. See :ref:`here ` for an example and caveats.
+It is also possible to write data to and read data from *Stata* format files.
+See :ref:`here ` for an example and caveats.
 
 Writing to a CSV file will convert the data, effectively removing any
 information about the categorical (categories and ordering). So if you read back the CSV file you have to convert the
 
@@ -928,32 +915,6 @@ an ``object`` dtype is a constant times the length of the data.
 
    s.astype('category').nbytes
 
-Old style constructor usage
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-In earlier versions than pandas 0.15, a `Categorical` could be constructed by passing in precomputed
-`codes` (called then `labels`) instead of values with categories. The `codes` were interpreted as
-pointers to the categories with `-1` as `NaN`. This type of constructor usage is replaced by
-the special constructor :func:`Categorical.from_codes`.
-
-Unfortunately, in some special cases, using code which assumes the old style constructor usage
-will work with the current pandas version, resulting in subtle bugs:
-
-.. code-block:: python
-
-    >>> cat = pd.Categorical([1,2], [1,2,3])
-    >>> # old version
-    >>> cat.get_values()
-    array([2, 3], dtype=int64)
-    >>> # new version
-    >>> cat.get_values()
-    array([1, 2], dtype=int64)
-
-.. warning::
-    If you used `Categoricals` with older versions of pandas, please audit your code before
-    upgrading and change your code to use the :func:`~pandas.Categorical.from_codes`
-    constructor.
-
 `Categorical` is not a `numpy` array
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -982,8 +943,7 @@ Dtype comparisons work:
 
    dtype == np.str_
    np.str_ == dtype
 
-To check if a Series contains Categorical data, with pandas 0.16 or later, use
-``hasattr(s, 'cat')``:
+To check if a Series contains Categorical data, use ``hasattr(s, 'cat')``:
 
 .. ipython:: python
 
@@ -1023,13 +983,13 @@ basic type) and applying along columns will also convert to object.
 Categorical Index
 ~~~~~~~~~~~~~~~~~
 
-.. versionadded:: 0.16.1
-
-A new ``CategoricalIndex`` index type is introduced in version 0.16.1. See the
-:ref:`advanced indexing docs ` for a more detailed
+``CategoricalIndex`` is a type of index that is useful for supporting
+indexing with duplicates. This is a container around a ``Categorical``
+and allows efficient indexing and storage of an index with a large number of duplicated elements.
+See the :ref:`advanced indexing docs ` for a more detailed
 explanation.
 
-Setting the index, will create create a ``CategoricalIndex``
+Setting the index will create a ``CategoricalIndex``
 
 .. 
ipython:: python @@ -1041,10 +1001,6 @@ Setting the index, will create create a ``CategoricalIndex`` # This now sorts by the categories order df.sort_index() -In previous versions (<0.16.1) there is no index of type ``category``, so -setting the index to categorical column will convert the categorical data to a -"normal" dtype first and therefore remove any custom ordering of the categories. - Side Effects ~~~~~~~~~~~~ diff --git a/doc/source/comparison_with_r.rst b/doc/source/comparison_with_r.rst index f895cdc25e6205..eb97aeeb7e6962 100644 --- a/doc/source/comparison_with_r.rst +++ b/doc/source/comparison_with_r.rst @@ -505,8 +505,6 @@ For more details and examples see :ref:`the reshaping documentation |factor|_ ~~~~~~~~~ -.. versionadded:: 0.15 - pandas has a data type for categorical data. .. code-block:: r diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 76a030d355e332..23699393958cfe 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -924,15 +924,12 @@ EWM has a ``min_periods`` argument, which has the same meaning it does for all the ``.expanding`` and ``.rolling`` methods: no output values will be set until at least ``min_periods`` non-null values are encountered in the (expanding) window. -(This is a change from versions prior to 0.15.0, in which the ``min_periods`` -argument affected only the ``min_periods`` consecutive entries starting at the -first non-null value.) -EWM also has an ``ignore_na`` argument, which deterines how +EWM also has an ``ignore_na`` argument, which determines how intermediate null values affect the calculation of the weights. When ``ignore_na=False`` (the default), weights are calculated based on absolute positions, so that intermediate null values affect the result. -When ``ignore_na=True`` (which reproduces the behavior in versions prior to 0.15.0), +When ``ignore_na=True``, weights are calculated by ignoring intermediate null values. For example, assuming ``adjust=True``, if ``ignore_na=False``, the weighted average of ``3, NaN, 5`` would be calculated as diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index f51c3e679b36f3..5bb3ba75fe51bc 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -256,12 +256,6 @@ Panels pf = pd.Panel({'df1':df1,'df2':df2,'df3':df3});pf - #Assignment using Transpose (pandas < 0.15) - pf = pf.transpose(2,0,1) - pf['E'] = pd.DataFrame(data, rng, cols) - pf = pf.transpose(1,2,0);pf - - #Direct assignment (pandas > 0.15) pf.loc[:,:,'F'] = pd.DataFrame(data, rng, cols);pf `Mask a panel by using np.where and then reconstructing the panel with the new masked values diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index 4652ccbf0ad34e..ec0a1c7a00bf74 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -453,8 +453,6 @@ available to insert at a particular location in the columns: Assigning New Columns in Method Chains ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 0.16.0 - Inspired by `dplyr's `__ ``mutate`` verb, DataFrame has an :meth:`~pandas.DataFrame.assign` diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst index a3062b4086673b..9e6f98923fca6c 100644 --- a/doc/source/gotchas.rst +++ b/doc/source/gotchas.rst @@ -22,8 +22,8 @@ Frequently Asked Questions (FAQ) DataFrame memory usage ---------------------- -As of pandas version 0.15.0, the memory usage of a dataframe (including -the index) is shown when accessing the ``info`` method of a dataframe. 
A +The memory usage of a dataframe (including the index) +is shown when accessing the ``info`` method of a dataframe. A configuration option, ``display.memory_usage`` (see :ref:`options`), specifies if the dataframe's memory usage will be displayed when invoking the ``df.info()`` method. diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index a6e7df57be4e50..88e62b5d301a38 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -47,12 +47,6 @@ advanced indexing. should be avoided. See :ref:`Returning a View versus Copy ` -.. warning:: - - In 0.15.0 ``Index`` has internally been refactored to no longer subclass ``ndarray`` - but instead subclass ``PandasObject``, similarly to the rest of the pandas objects. This should be - a transparent change with only very limited API implications (See the :ref:`Internal Refactoring `) - .. warning:: Indexing on an integer-based Index with floats has been clarified in 0.18.0, for a summary of the changes, see :ref:`here `. @@ -660,7 +654,6 @@ For getting *multiple* indexers, using ``.get_indexer`` Selecting Random Samples ------------------------ -.. versionadded::0.16.1 A random selection of rows or columns from a Series, DataFrame, or Panel with the :meth:`~DataFrame.sample` method. The method will sample rows by default, and accepts a specific number of rows/columns to return, or a fraction of rows. @@ -1510,8 +1503,6 @@ See :ref:`Advanced Indexing ` for usage of MultiIndexes. ind.name = "bob" ind -.. versionadded:: 0.15.0 - ``set_names``, ``set_levels``, and ``set_labels`` also take an optional `level`` argument @@ -1527,11 +1518,6 @@ Set operations on Index objects .. _indexing.set_ops: -.. warning:: - - In 0.15.0. the set operations ``+`` and ``-`` were deprecated in order to provide these for numeric type operations on certain - index types. ``+`` can be replace by ``.union()`` or ``|``, and ``-`` by ``.difference()``. - The two main operations are ``union (|)``, ``intersection (&)`` These can be directly called as instance methods or used via overloaded operators. Difference is provided via the ``.difference()`` method. diff --git a/doc/source/install.rst b/doc/source/install.rst index 8dc8224ea6cb24..c805f84d0faaa7 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -18,7 +18,7 @@ Instructions for installing from source, Python version support ---------------------- -Officially Python 2.7, 3.4, 3.5, and 3.6 +Officially Python 2.7, 3.5, and 3.6. Installing pandas ----------------- @@ -183,21 +183,17 @@ installed), make sure you have `pytest >>> import pandas as pd >>> pd.test() - Running unit tests for pandas - pandas version 0.18.0 - numpy version 1.10.2 - pandas is installed in pandas - Python version 2.7.11 |Continuum Analytics, Inc.| - (default, Dec 6 2015, 18:57:58) [GCC 4.2.1 (Apple Inc. build 5577)] - nose version 1.3.7 + running: pytest --skip-slow --skip-network C:\Users\TP\Anaconda3\envs\py36\lib\site-packages\pandas + ============================= test session starts ============================= + platform win32 -- Python 3.6.2, pytest-3.2.1, py-1.4.34, pluggy-0.4.0 + rootdir: C:\Users\TP\Documents\Python\pandasdev\pandas, inifile: setup.cfg + collected 12145 items / 3 skipped + ..................................................................S...... ........S................................................................ ......................................................................... 
-
-    ----------------------------------------------------------------------
-    Ran 9252 tests in 368.339s
-
-    OK (SKIP=117)
+    ==================== 12130 passed, 12 skipped in 368.339 seconds =====================
 
 Dependencies
 ------------
 
diff --git a/doc/source/io.rst b/doc/source/io.rst
index 33523ea171f3ae..de3150035c446b 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -592,8 +592,7 @@ Ignoring line comments and empty lines
 ++++++++++++++++++++++++++++++++++++++
 
 If the ``comment`` parameter is specified, then completely commented lines will
-be ignored. By default, completely blank lines will be ignored as well. Both of
-these are API changes introduced in version 0.15.
+be ignored. By default, completely blank lines will be ignored as well.
 
 .. ipython:: python
 
@@ -2701,8 +2700,6 @@ Using a list to get multiple sheets:
    # Returns the 1st and 4th sheet, as a dictionary of DataFrames.
    read_excel('path_to_file.xls',sheet_name=['Sheet1',3])
 
-.. versionadded:: 0.16
-
 ``read_excel`` can read more than one sheet, by setting ``sheet_name`` to either
 a list of sheet names, a list of sheet positions, or ``None`` to read all sheets.
 Sheets can be specified by sheet index or sheet name, using an integer or string,
 
@@ -3241,11 +3238,10 @@ for some advanced strategies
 
 .. warning::
 
-   As of version 0.15.0, pandas requires ``PyTables`` >= 3.0.0. Stores written with prior versions of pandas / ``PyTables`` >= 2.3 are fully compatible (this was the previous minimum ``PyTables`` required version).
-
-.. warning::
-
-   There is a ``PyTables`` indexing bug which may appear when querying stores using an index. If you see a subset of results being returned, upgrade to ``PyTables`` >= 3.2. Stores created previously will need to be rewritten using the updated version.
+   pandas requires ``PyTables`` >= 3.0.0.
+   There is an indexing bug in ``PyTables`` < 3.2 which may appear when querying stores using an index.
+   If you see a subset of results being returned, upgrade to ``PyTables`` >= 3.2.
+   Stores created previously will need to be rewritten using the updated version.
 
 .. warning::
 
@@ -4210,10 +4206,8 @@ object : ``strings`` ``np.nan``
 Categorical Data
 ++++++++++++++++
 
-.. versionadded:: 0.15.2
-
-Writing data to a ``HDFStore`` that contains a ``category`` dtype was implemented
-in 0.15.2. Queries work the same as if it was an object array. However, the ``category`` dtyped data is
+You can write data that contains ``category`` dtypes to a ``HDFStore``.
+Queries work the same as if it were an object array. However, the ``category`` dtyped data is
 stored in a more efficient manner.
 
 .. ipython:: python
 
@@ -4228,21 +4222,6 @@ stored in a more efficient manner.
    result
    result.dtypes
 
-.. warning::
-
-   The format of the ``Categorical`` is readable by prior versions of pandas (< 0.15.2), but will retrieve
-   the data as an integer based column (e.g. the ``codes``). However, the ``categories`` *can* be retrieved
-   but require the user to select them manually using the explicit meta path.
-
-   The data is stored like so:
-
-   .. ipython:: python
-
-      cstore
-
-      # to get the categories
-      cstore.select('dfcat/meta/A/meta')
-
 .. ipython:: python
    :suppress:
    :okexcept:
 
@@ -4746,8 +4725,6 @@ You can check if a table exists using :func:`~pandas.io.sql.has_table`
 Schema support
 ''''''''''''''
 
-.. versionadded:: 0.15.0
-
 Reading from and writing to different schema's is supported through the ``schema``
 keyword in the :func:`~pandas.read_sql_table` and :func:`~pandas.DataFrame.to_sql`
 functions. 
Note however that this depends on the database flavor (sqlite does not
 have schema's). For example:
 
 .. code-block:: python
 
diff --git a/doc/source/remote_data.rst b/doc/source/remote_data.rst
index 7980133582125e..9af66058a7aaa3 100644
--- a/doc/source/remote_data.rst
+++ b/doc/source/remote_data.rst
@@ -11,14 +11,13 @@ Remote Data Access
 DataReader
 ----------
 
-The sub-package ``pandas.io.data`` is removed in favor of a separately
-installable `pandas-datareader package
+The sub-package ``pandas.io.data`` was deprecated in v.0.17 and removed in
+`v.0.19 `__.
+ Instead, use the separately installable `pandas-datareader package
 `_. This will allow the data
-modules to be independently updated to your pandas installation. The API for
-``pandas-datareader v0.1.1`` is the same as in ``pandas v0.16.1``.
-(:issue:`8961`)
+modules to be updated independently of your pandas installation.
 
- You should replace the imports of the following:
+ For pandas versions older than 0.19, you should replace the following imports:
 
 .. code-block:: python
 
diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst
index 3dce73b302c7ca..fab83222b313f1 100644
--- a/doc/source/reshaping.rst
+++ b/doc/source/reshaping.rst
@@ -569,8 +569,6 @@ This function is often used along with discretization functions like ``cut``:
 
 See also :func:`Series.str.get_dummies `.
 
-.. versionadded:: 0.15.0
-
 :func:`get_dummies` also accepts a DataFrame. By default all categorical
 variables (categorical in the statistical sense, those with `object` or
 `categorical` dtype) are encoded as dummy variables.
 
@@ -675,4 +673,4 @@ handling of NaN:
    you can use ``df["cat_col"] = pd.Categorical(df["col"])`` or
    ``df["cat_col"] = df["col"].astype("category")``. For full docs on :class:`~pandas.Categorical`,
    see the :ref:`Categorical introduction ` and the
-   :ref:`API documentation `. This feature was introduced in version 0.15.
+   :ref:`API documentation `.
 
diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst
index b4884cf1c4141b..cf16cee501a3e5 100644
--- a/doc/source/sparse.rst
+++ b/doc/source/sparse.rst
@@ -216,8 +216,6 @@ To convert a ``SparseDataFrame`` back to sparse SciPy matrix in COO format, you
 SparseSeries
 ~~~~~~~~~~~~
 
-.. versionadded:: 0.16.0
-
 A :meth:`SparseSeries.to_coo` method is implemented for transforming a
 ``SparseSeries`` indexed by a ``MultiIndex`` to a ``scipy.sparse.coo_matrix``.
 
 The method requires a ``MultiIndex`` with two or more levels.
 
diff --git a/doc/source/timedeltas.rst b/doc/source/timedeltas.rst
index daa2c262c8c860..d055c49dc4721e 100644
--- a/doc/source/timedeltas.rst
+++ b/doc/source/timedeltas.rst
@@ -23,13 +23,12 @@ Time Deltas
 ***********
 
-.. 
note::
-
-    Starting in v0.15.0, we introduce a new scalar type ``Timedelta``, which is a subclass of ``datetime.timedelta``, and behaves in a similar manner,
-    but allows compatibility with ``np.timedelta64`` types as well as a host of custom representation, parsing, and attributes.
+Timedeltas are differences in times, expressed in units such as days, hours, minutes, and
+seconds. They can be both positive and negative.
 
-Timedeltas are differences in times, expressed in difference units, e.g. days, hours, minutes, seconds.
-They can be both positive and negative.
+``Timedelta`` is a subclass of ``datetime.timedelta``, and behaves in a similar manner,
+but allows compatibility with ``np.timedelta64`` types as well as a host of custom representation,
+parsing, and attributes.
 
 Parsing
 -------
 
@@ -78,15 +77,10 @@ Further, operations among the scalars yield another scalar ``Timedelta``.
 to_timedelta
 ~~~~~~~~~~~~
 
-.. warning::
-
-    Prior to 0.15.0 ``pd.to_timedelta`` would return a ``Series`` for list-like/Series input, and a ``np.timedelta64`` for scalar input.
-    It will now return a ``TimedeltaIndex`` for list-like input, ``Series`` for Series input, and ``Timedelta`` for scalar input.
-
-    The arguments to ``pd.to_timedelta`` are now ``(arg, unit='ns', box=True)``, previously were ``(arg, box=True, unit='ns')`` as these are more logical.
-
-Using the top-level ``pd.to_timedelta``, you can convert a scalar, array, list, or Series from a recognized timedelta format / value into a ``Timedelta`` type.
-It will construct Series if the input is a Series, a scalar if the input is scalar-like, otherwise will output a ``TimedeltaIndex``.
+Using the top-level ``pd.to_timedelta``, you can convert a scalar, array, list,
+or Series from a recognized timedelta format / value into a ``Timedelta`` type.
+It will construct Series if the input is a Series, a scalar if the input is
+scalar-like, otherwise it will output a ``TimedeltaIndex``.
 
 You can parse a single string to a Timedelta:
 
@@ -328,8 +322,6 @@ You can convert a ``Timedelta`` to an `ISO 8601 Duration`_ string with the
 TimedeltaIndex
 --------------
 
-.. versionadded:: 0.15.0
-
 To generate an index with time delta, you can use either the ``TimedeltaIndex`` or
 the ``timedelta_range`` constructor.
 
diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst
index 839390c8778aa1..b5a261e3acac5d 100644
--- a/doc/source/visualization.rst
+++ b/doc/source/visualization.rst
@@ -229,8 +229,6 @@ To get horizontal bar plots, use the ``barh`` method:
 Histograms
 ~~~~~~~~~~
 
-.. versionadded:: 0.15.0
-
 Histogram can be drawn by using the :meth:`DataFrame.plot.hist` and :meth:`Series.plot.hist` methods.
 
 .. ipython:: python
 
@@ -328,8 +326,6 @@ The ``by`` keyword can be specified to plot grouped histograms:
 Box Plots
 ~~~~~~~~~
 
-.. versionadded:: 0.15.0
-
 Boxplot can be drawn calling :meth:`Series.plot.box` and :meth:`DataFrame.plot.box`,
 or :meth:`DataFrame.boxplot` to visualize the distribution of values within each column.
 
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index 33a6db18db3cad..636bb2dc3e60ea 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -485,3 +485,4 @@ Other
 ^^^^^
 - Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`)
 - Several ``NaT`` method docstrings (e.g. 
:func:`NaT.ctime`) were incorrect (:issue:`17327`)
+- The documentation has had references to versions < v0.16 removed and cleaned up (:issue:`17442` & :issue:`17404`)

From 24b440e67abb3b14856f0fd920141f5a6dcf83fd Mon Sep 17 00:00:00 2001
From: Dillon Niederhut
Date: Thu, 7 Sep 2017 05:52:11 -0500
Subject: [PATCH 054/188] BUG: revert collision warning (#17298)

---
 doc/source/indexing.rst             | 15 ---------------
 doc/source/whatsnew/v0.21.0.txt     | 24 +++---------------------
 pandas/core/generic.py              |  8 ++------
 pandas/tests/dtypes/test_generic.py |  5 -----
 pandas/tests/io/test_pytables.py    |  4 ++--
 5 files changed, 7 insertions(+), 49 deletions(-)

diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
index 88e62b5d301a38..8474116c380825 100644
--- a/doc/source/indexing.rst
+++ b/doc/source/indexing.rst
@@ -269,21 +269,6 @@ new column. In 0.21.0 and later, this will raise a ``UserWarning``:
     1  2.0
     2  3.0
 
-Similarly, it is possible to create a column with a name which collides with one of Pandas's
-built-in methods or attributes, which can cause confusion later when attempting to access
-that column as an attribute. This behavior now warns:
-
-.. code-block:: ipython
-
-    In[4]: df['sum'] = [5., 7., 9.]
-    UserWarning: Column name 'sum' collides with a built-in method, which will cause unexpected attribute behavior
-    In[5]: df.sum
-    Out[5]:
-    
-
 Slicing ranges
 --------------
 
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index 636bb2dc3e60ea..fa00140fb4abda 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -67,8 +67,8 @@ Improved warnings when attempting to create columns
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 New users are often flummoxed by the relationship between column operations and attribute
-access on ``DataFrame`` instances (:issue:`5904` & :issue:`7175`). Two specific instances
-of this confusion include attempting to create a new column by setting into an attribute:
+access on ``DataFrame`` instances (:issue:`7175`). One specific instance
+of this confusion is attempting to create a new column by setting into an attribute:
 
 .. code-block:: ipython
 
@@ -86,25 +86,7 @@ This does not raise any obvious exceptions, but also does not create a new colum
     1  2.0
     2  3.0
 
-The second source of confusion is creating a column whose name collides with a method or
-attribute already in the instance namespace:
-
-.. code-block:: ipython
-
-    In[4]: df['sum'] = [5., 7., 9.]
-
-This does not permit that column to be accessed as an attribute:
-
-.. code-block:: ipython
-
-    In[5]: df.sum
-    Out[5]:
-    
-
-Both of these now raise a ``UserWarning`` about the potential for unexpected behavior. See :ref:`Attribute Access `.
+Setting a list-like data structure into a new attribute now raises a ``UserWarning`` about the potential for unexpected behavior. See :ref:`Attribute Access `.
 
 .. 
_whatsnew_0210.enhancements.other: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index cdb08d8887e05b..df5f1a8326acd3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1905,10 +1905,6 @@ def _slice(self, slobj, axis=0, kind=None): return result def _set_item(self, key, value): - if isinstance(key, str) and callable(getattr(self, key, None)): - warnings.warn("Column name '{key}' collides with a built-in " - "method, which will cause unexpected attribute " - "behavior".format(key=key), stacklevel=3) self._data.set(key, value) self._clear_item_cache() @@ -3441,8 +3437,8 @@ def __setattr__(self, name, value): object.__setattr__(self, name, value) except (AttributeError, TypeError): if isinstance(self, ABCDataFrame) and (is_list_like(value)): - warnings.warn("Pandas doesn't allow Series to be assigned " - "into nonexistent columns - see " + warnings.warn("Pandas doesn't allow columns to be " + "created via a new attribute name - see " "https://pandas.pydata.org/pandas-docs/" "stable/indexing.html#attribute-access", stacklevel=2) diff --git a/pandas/tests/dtypes/test_generic.py b/pandas/tests/dtypes/test_generic.py index 82444d6c941576..bd365f9c3281f8 100644 --- a/pandas/tests/dtypes/test_generic.py +++ b/pandas/tests/dtypes/test_generic.py @@ -48,7 +48,6 @@ def test_abc_types(self): def test_setattr_warnings(): - # GH5904 - Suggestion: Warning for DataFrame colname-methodname clash # GH7175 - GOTCHA: You can't use dot notation to add a column... d = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']), 'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])} @@ -78,7 +77,3 @@ def test_setattr_warnings(): # warn when setting column to nonexistent name df.four = df.two + 2 assert df.four.sum() > df.two.sum() - - with tm.assert_produces_warning(UserWarning): - # warn when column has same name as method - df['sum'] = df.two diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index b5ecc4d34cd08b..9c488cb2389bed 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -2011,7 +2011,7 @@ def check(obj, comparator): df['string'] = 'foo' df['float322'] = 1. df['float322'] = df['float322'].astype('float32') - df['boolean'] = df['float322'] > 0 + df['bool'] = df['float322'] > 0 df['time1'] = Timestamp('20130101') df['time2'] = Timestamp('20130102') check(df, tm.assert_frame_equal) @@ -2141,7 +2141,7 @@ def test_table_values_dtypes_roundtrip(self): df1['string'] = 'foo' df1['float322'] = 1. 
df1['float322'] = df1['float322'].astype('float32') - df1['boolean'] = df1['float32'] > 0 + df1['bool'] = df1['float32'] > 0 df1['time1'] = Timestamp('20130101') df1['time2'] = Timestamp('20130102') From 8a8a4fd74dc1dd2804d5f605fcad47e6f0fd4b60 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 7 Sep 2017 04:28:12 -0700 Subject: [PATCH 055/188] cdef out dtype for _Timestamp._get_field (#17457) --- pandas/_libs/tslib.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index a7b33c669a8b8c..7e009652f7f0c6 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -1275,6 +1275,7 @@ cdef class _Timestamp(datetime): cpdef _get_field(self, field): cdef: int64_t val + ndarray[int32_t] out val = self._maybe_convert_value_to_local() out = get_date_field(np.array([val], dtype=np.int64), field) return int(out[0]) @@ -1282,6 +1283,7 @@ cdef class _Timestamp(datetime): cpdef _get_named_field(self, field): cdef: int64_t val + ndarray[object] out val = self._maybe_convert_value_to_local() out = get_date_name_field(np.array([val], dtype=np.int64), field) return out[0] @@ -1291,9 +1293,7 @@ cdef class _Timestamp(datetime): 'startingMonth', self.freq.kwds.get( 'month', 12)) if self.freq else 12 freqstr = self.freqstr if self.freq else None - val = self.value - if self.tz is not None and not _is_utc(self.tz): - val = tz_convert_single(self.value, 'UTC', self.tz) + val = self._maybe_convert_value_to_local() out = get_start_end_field( np.array([val], dtype=np.int64), field, freqstr, month_kw) return out[0] From 9dc01c4f9142908c4a7db5a3a0300685f6d43308 Mon Sep 17 00:00:00 2001 From: Sam Foo Date: Thu, 7 Sep 2017 07:35:40 -0400 Subject: [PATCH 056/188] DOC: Add Timestamp, Period, Timedelta, and Interval to api.rst (#17424) --- doc/source/api.rst | 195 ++++++++++++++++++++++++++++++++++++++++ pandas/_libs/period.pyx | 2 +- 2 files changed, 196 insertions(+), 1 deletion(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 12e6c7ad7f6305..d34cec86638fba 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1599,6 +1599,201 @@ Conversion TimedeltaIndex.floor TimedeltaIndex.ceil +.. currentmodule:: pandas + +Scalars +------- + +Period +~~~~~~ +.. autosummary:: + :toctree: generated/ + + Period + +Attributes +~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Period.day + Period.dayofweek + Period.dayofyear + Period.days_in_month + Period.daysinmonth + Period.end_time + Period.freq + Period.freqstr + Period.hour + Period.is_leap_year + Period.minute + Period.month + Period.now + Period.ordinal + Period.quarter + Period.qyear + Period.second + Period.start_time + Period.strftime + Period.week + Period.weekday + Period.weekofyear + Period.year + +Methods +~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Period.asfreq + Period.strftime + Period.to_timestamp + +Timestamp +~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Timestamp + +Properties +~~~~~~~~~~ +.. 
autosummary::
+   :toctree: generated/
+
+   Timestamp.asm8
+   Timestamp.day
+   Timestamp.dayofweek
+   Timestamp.dayofyear
+   Timestamp.days_in_month
+   Timestamp.daysinmonth
+   Timestamp.hour
+   Timestamp.is_leap_year
+   Timestamp.is_month_end
+   Timestamp.is_month_start
+   Timestamp.is_quarter_end
+   Timestamp.is_quarter_start
+   Timestamp.is_year_end
+   Timestamp.is_year_start
+   Timestamp.max
+   Timestamp.microsecond
+   Timestamp.min
+   Timestamp.month
+   Timestamp.nanosecond
+   Timestamp.quarter
+   Timestamp.resolution
+   Timestamp.second
+   Timestamp.tz
+   Timestamp.tzinfo
+   Timestamp.value
+   Timestamp.weekday_name
+   Timestamp.weekofyear
+   Timestamp.year
+
+Methods
+~~~~~~~
+.. autosummary::
+   :toctree: generated/
+
+   Timestamp.astimezone
+   Timestamp.ceil
+   Timestamp.combine
+   Timestamp.ctime
+   Timestamp.date
+   Timestamp.dst
+   Timestamp.floor
+   Timestamp.freq
+   Timestamp.freqstr
+   Timestamp.from_ordinal
+   Timestamp.fromtimestamp
+   Timestamp.isocalendar
+   Timestamp.isoformat
+   Timestamp.isoweekday
+   Timestamp.normalize
+   Timestamp.now
+   Timestamp.replace
+   Timestamp.round
+   Timestamp.strftime
+   Timestamp.strptime
+   Timestamp.time
+   Timestamp.timetuple
+   Timestamp.timetz
+   Timestamp.to_datetime64
+   Timestamp.to_julian_date
+   Timestamp.to_period
+   Timestamp.to_pydatetime
+   Timestamp.today
+   Timestamp.toordinal
+   Timestamp.tz_convert
+   Timestamp.tz_localize
+   Timestamp.tzname
+   Timestamp.utcfromtimestamp
+   Timestamp.utcnow
+   Timestamp.utcoffset
+   Timestamp.utctimetuple
+   Timestamp.weekday
+
+Interval
+~~~~~~~~
+.. autosummary::
+   :toctree: generated/
+
+   Interval
+
+Properties
+~~~~~~~~~~
+.. autosummary::
+   :toctree: generated/
+
+   Interval.closed
+   Interval.closed_left
+   Interval.closed_right
+   Interval.left
+   Interval.mid
+   Interval.open_left
+   Interval.open_right
+   Interval.right
+
+Timedelta
+~~~~~~~~~
+.. autosummary::
+   :toctree: generated/
+
+   Timedelta
+
+Properties
+~~~~~~~~~~
+.. autosummary::
+   :toctree: generated/
+
+   Timedelta.asm8
+   Timedelta.components
+   Timedelta.days
+   Timedelta.delta
+   Timedelta.freq
+   Timedelta.is_populated
+   Timedelta.max
+   Timedelta.microseconds
+   Timedelta.min
+   Timedelta.nanoseconds
+   Timedelta.resolution
+   Timedelta.seconds
+   Timedelta.value
+
+Methods
+~~~~~~~
+.. autosummary::
+   :toctree: generated/
+
+   Timedelta.ceil
+   Timedelta.floor
+   Timedelta.isoformat
+   Timedelta.round
+   Timdelta.to_pytimedelta
+   Timedelta.to_timedelta64
+   Timedelta.total_seconds
+   Timedelta.view
+
 Window
 ------
 .. 
currentmodule:: pandas.core.window diff --git a/pandas/_libs/period.pyx b/pandas/_libs/period.pyx index 0ade8f9a6dde5b..8f89b812fec04f 100644 --- a/pandas/_libs/period.pyx +++ b/pandas/_libs/period.pyx @@ -1102,7 +1102,7 @@ cdef class _Period(object): class Period(_Period): """ - Represents an period of time + Represents a period of time Parameters ---------- From aee2ae086e0972aabcb43d05fa2a404153e3b3b5 Mon Sep 17 00:00:00 2001 From: majiang Date: Thu, 7 Sep 2017 20:41:24 +0900 Subject: [PATCH 057/188] DOC: to_json (#17461) --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index df5f1a8326acd3..8d16b079ba2c8d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1265,7 +1265,7 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None, Parameters ---------- path_or_buf : the path or buffer to write the result string - if this is None, return a StringIO of the converted string + if this is None, return the converted string orient : string * Series From 3a291bb7170ca900cb1b886a3c0b39976a9870ef Mon Sep 17 00:00:00 2001 From: jschendel Date: Thu, 7 Sep 2017 05:49:27 -0600 Subject: [PATCH 058/188] BUG: Index._searchsorted_monotonic(..., side='right') returns the left side position for monotonic decreasing indexes (#17272) --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/indexes/base.py | 2 +- pandas/tests/indexes/common.py | 59 +++++++++++++++++-- .../indexes/datetimes/test_datetimelike.py | 4 +- pandas/tests/indexes/period/test_period.py | 4 +- pandas/tests/indexes/test_base.py | 3 +- pandas/tests/indexes/test_numeric.py | 12 ++-- pandas/tests/indexes/test_range.py | 3 +- pandas/tests/indexing/test_interval.py | 56 +++++++++++------- 9 files changed, 111 insertions(+), 33 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index fa00140fb4abda..d3c61adccc7a61 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -400,6 +400,7 @@ Indexing - Bug in ``.isin()`` in which checking membership in empty ``Series`` objects raised an error (:issue:`16991`) - Bug in ``CategoricalIndex`` reindexing in which specified indices containing duplicates were not being respected (:issue:`17323`) - Bug in intersection of ``RangeIndex`` with negative step (:issue:`17296`) +- Bug in ``IntervalIndex`` where performing a scalar lookup fails for included right endpoints of non-overlapping monotonic decreasing indexes (:issue:`16417`, :issue:`17271`) I/O ^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a9098126a38e3d..ef5f68936044a8 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3465,7 +3465,7 @@ def _searchsorted_monotonic(self, label, side='left'): # everything for it to work (element ordering, search side and # resulting value). 
pos = self[::-1].searchsorted(label, side='right' if side == 'left' - else 'right') + else 'left') return len(self) - pos raise ValueError('index must be monotonic increasing or decreasing') diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 1fdc08d68eb268..90618cd6e235f6 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -11,6 +11,7 @@ RangeIndex, MultiIndex, CategoricalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex, IntervalIndex, notna, isna) +from pandas.core.indexes.base import InvalidIndexError from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin from pandas.core.dtypes.common import needs_i8_conversion from pandas._libs.tslib import iNaT @@ -138,9 +139,14 @@ def test_get_indexer_consistency(self): if isinstance(index, IntervalIndex): continue - indexer = index.get_indexer(index[0:2]) - assert isinstance(indexer, np.ndarray) - assert indexer.dtype == np.intp + if index.is_unique or isinstance(index, CategoricalIndex): + indexer = index.get_indexer(index[0:2]) + assert isinstance(indexer, np.ndarray) + assert indexer.dtype == np.intp + else: + e = "Reindexing only valid with uniquely valued Index objects" + with tm.assert_raises_regex(InvalidIndexError, e): + indexer = index.get_indexer(index[0:2]) indexer, _ = index.get_indexer_non_unique(index[0:2]) assert isinstance(indexer, np.ndarray) @@ -632,7 +638,8 @@ def test_difference_base(self): pass elif isinstance(idx, (DatetimeIndex, TimedeltaIndex)): assert result.__class__ == answer.__class__ - tm.assert_numpy_array_equal(result.asi8, answer.asi8) + tm.assert_numpy_array_equal(result.sort_values().asi8, + answer.sort_values().asi8) else: result = first.difference(case) assert tm.equalContents(result, answer) @@ -954,3 +961,47 @@ def test_join_self_unique(self, how): if index.is_unique: joined = index.join(index, how=how) assert (index == joined).all() + + def test_searchsorted_monotonic(self): + # GH17271 + for index in self.indices.values(): + # not implemented for tuple searches in MultiIndex + # or Intervals searches in IntervalIndex + if isinstance(index, (MultiIndex, IntervalIndex)): + continue + + # nothing to test if the index is empty + if index.empty: + continue + value = index[0] + + # determine the expected results (handle dupes for 'right') + expected_left, expected_right = 0, (index == value).argmin() + if expected_right == 0: + # all values are the same, expected_right should be length + expected_right = len(index) + + # test _searchsorted_monotonic in all cases + # test searchsorted only for increasing + if index.is_monotonic_increasing: + ssm_left = index._searchsorted_monotonic(value, side='left') + assert expected_left == ssm_left + + ssm_right = index._searchsorted_monotonic(value, side='right') + assert expected_right == ssm_right + + ss_left = index.searchsorted(value, side='left') + assert expected_left == ss_left + + ss_right = index.searchsorted(value, side='right') + assert expected_right == ss_right + elif index.is_monotonic_decreasing: + ssm_left = index._searchsorted_monotonic(value, side='left') + assert expected_left == ssm_left + + ssm_right = index._searchsorted_monotonic(value, side='right') + assert expected_right == ssm_right + else: + # non-monotonic should raise. 
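+                # (bisection assumes a sorted sequence; an index that is
+                # neither increasing nor decreasing has no well-defined
+                # insertion point, so ValueError is expected here)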
+ with pytest.raises(ValueError): + index._searchsorted_monotonic(value, side='left') diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py index 3b970ee3825212..538e10e6011ec4 100644 --- a/pandas/tests/indexes/datetimes/test_datetimelike.py +++ b/pandas/tests/indexes/datetimes/test_datetimelike.py @@ -12,7 +12,9 @@ class TestDatetimeIndex(DatetimeLike): _holder = DatetimeIndex def setup_method(self, method): - self.indices = dict(index=tm.makeDateIndex(10)) + self.indices = dict(index=tm.makeDateIndex(10), + index_dec=date_range('20130110', periods=10, + freq='-1D')) self.setup_indices() def create_index(self): diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index e24e2ad936e2c2..51f7d13cb0638f 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -18,7 +18,9 @@ class TestPeriodIndex(DatetimeLike): _multiprocess_can_split_ = True def setup_method(self, method): - self.indices = dict(index=tm.makePeriodIndex(10)) + self.indices = dict(index=tm.makePeriodIndex(10), + index_dec=period_range('20130101', periods=10, + freq='D')[::-1]) self.setup_indices() def create_index(self): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index f96dbdcfb8acfe..d69fbbcdf4bf60 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -46,7 +46,8 @@ def setup_method(self, method): catIndex=tm.makeCategoricalIndex(100), empty=Index([]), tuples=MultiIndex.from_tuples(lzip( - ['foo', 'bar', 'baz'], [1, 2, 3]))) + ['foo', 'bar', 'baz'], [1, 2, 3])), + repeats=Index([0, 0, 1, 1, 2, 2])) self.setup_indices() def create_index(self): diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 1a0a38c1732843..7e7e10e4aeabee 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -181,7 +181,9 @@ class TestFloat64Index(Numeric): def setup_method(self, method): self.indices = dict(mixed=Float64Index([1.5, 2, 3, 4, 5]), - float=Float64Index(np.arange(5) * 2.5)) + float=Float64Index(np.arange(5) * 2.5), + mixed_dec=Float64Index([5, 4, 3, 2, 1.5]), + float_dec=Float64Index(np.arange(4, -1, -1) * 2.5)) self.setup_indices() def create_index(self): @@ -654,7 +656,8 @@ class TestInt64Index(NumericInt): _holder = Int64Index def setup_method(self, method): - self.indices = dict(index=Int64Index(np.arange(0, 20, 2))) + self.indices = dict(index=Int64Index(np.arange(0, 20, 2)), + index_dec=Int64Index(np.arange(19, -1, -1))) self.setup_indices() def create_index(self): @@ -949,8 +952,9 @@ class TestUInt64Index(NumericInt): _holder = UInt64Index def setup_method(self, method): - self.indices = dict(index=UInt64Index([2**63, 2**63 + 10, 2**63 + 15, - 2**63 + 20, 2**63 + 25])) + vals = [2**63, 2**63 + 10, 2**63 + 15, 2**63 + 20, 2**63 + 25] + self.indices = dict(index=UInt64Index(vals), + index_dec=UInt64Index(reversed(vals))) self.setup_indices() def create_index(self): diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 06c8f0ee392c77..d206c36ee51c95 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -25,7 +25,8 @@ class TestRangeIndex(Numeric): _compat_props = ['shape', 'ndim', 'size', 'itemsize'] def setup_method(self, method): - self.indices = dict(index=RangeIndex(0, 20, 2, name='foo')) + self.indices = dict(index=RangeIndex(0, 20, 2, 
name='foo'), + index_dec=RangeIndex(18, -1, -2, name='bar')) self.setup_indices() def create_index(self): diff --git a/pandas/tests/indexing/test_interval.py b/pandas/tests/indexing/test_interval.py index be6e5e1cffb2e5..31a94abcd99a59 100644 --- a/pandas/tests/indexing/test_interval.py +++ b/pandas/tests/indexing/test_interval.py @@ -3,6 +3,7 @@ import pandas as pd from pandas import Series, DataFrame, IntervalIndex, Interval +from pandas.compat import product import pandas.util.testing as tm @@ -14,16 +15,6 @@ def setup_method(self, method): def test_loc_with_scalar(self): s = self.s - expected = 0 - - result = s.loc[0.5] - assert result == expected - - result = s.loc[1] - assert result == expected - - with pytest.raises(KeyError): - s.loc[0] expected = s.iloc[:3] tm.assert_series_equal(expected, s.loc[:3]) @@ -42,16 +33,6 @@ def test_loc_with_scalar(self): def test_getitem_with_scalar(self): s = self.s - expected = 0 - - result = s[0.5] - assert result == expected - - result = s[1] - assert result == expected - - with pytest.raises(KeyError): - s[0] expected = s.iloc[:3] tm.assert_series_equal(expected, s[:3]) @@ -67,6 +48,41 @@ def test_getitem_with_scalar(self): expected = s.iloc[2:5] tm.assert_series_equal(expected, s[s >= 2]) + @pytest.mark.parametrize('direction, closed', + product(('increasing', 'decreasing'), + ('left', 'right', 'neither', 'both'))) + def test_nonoverlapping_monotonic(self, direction, closed): + tpls = [(0, 1), (2, 3), (4, 5)] + if direction == 'decreasing': + tpls = reversed(tpls) + + idx = IntervalIndex.from_tuples(tpls, closed=closed) + s = Series(list('abc'), idx) + + for key, expected in zip(idx.left, s): + if idx.closed_left: + assert s[key] == expected + assert s.loc[key] == expected + else: + with pytest.raises(KeyError): + s[key] + with pytest.raises(KeyError): + s.loc[key] + + for key, expected in zip(idx.right, s): + if idx.closed_right: + assert s[key] == expected + assert s.loc[key] == expected + else: + with pytest.raises(KeyError): + s[key] + with pytest.raises(KeyError): + s.loc[key] + + for key, expected in zip(idx.mid, s): + assert s[key] == expected + assert s.loc[key] == expected + def test_with_interval(self): s = self.s From ee6185e2fb9461632949f3ba52a28b37a1f7296e Mon Sep 17 00:00:00 2001 From: Matti Picus Date: Thu, 7 Sep 2017 14:56:33 +0300 Subject: [PATCH 059/188] COMPAT: Pypy tweaks (#17351) --- doc/source/whatsnew/v0.21.0.txt | 11 ++++- pandas/_libs/src/ujson/python/JSONtoObj.c | 16 +++---- pandas/io/parsers.py | 1 + pandas/tests/indexes/test_base.py | 16 +++++-- pandas/tests/indexes/test_multi.py | 13 +++++- pandas/tests/io/parser/test_parsers.py | 52 ++++++++++++++++++++++- 6 files changed, 92 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index d3c61adccc7a61..f50052347cfb56 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -371,13 +371,11 @@ Performance Improvements Bug Fixes ~~~~~~~~~ - Conversion ^^^^^^^^^^ - Bug in assignment against datetime-like data with ``int`` may incorrectly convert to datetime-like (:issue:`14145`) - Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`) -- Fix :func:`DataFrame.memory_usage` to support PyPy. 
Objects on PyPy do not have a fixed size, so an approximation is used instead (:issue:`17228`) - Fixed the return type of ``IntervalIndex.is_non_overlapping_monotonic`` to be a Python ``bool`` for consistency with similar attributes/methods. Previously returned a ``numpy.bool_``. (:issue:`17237`) - Bug in ``IntervalIndex.is_non_overlapping_monotonic`` when intervals are closed on both sides and overlap at a point (:issue:`16560`) - Bug in :func:`Series.fillna` returns frame when ``inplace=True`` and ``value`` is dict (:issue:`16156`) @@ -463,6 +461,15 @@ Categorical the ``.categories`` to be an empty ``Float64Index`` rather than an empty ``Index`` with object dtype (:issue:`17248`) +PyPy +^^^^ + +- Compatibility with PyPy in :func:`read_csv` with ``usecols=[]`` and + :func:`read_json` (:issue:`17351`) +- Split tests into cases for CPython and PyPy where needed, which highlights the fragility + of index matching with ``float('nan')``, ``np.nan`` and ``NAT`` (:issue:`17351`) +- Fix :func:`DataFrame.memory_usage` to support PyPy. Objects on PyPy do not have a fixed size, + so an approximation is used instead (:issue:`17228`) Other ^^^^^ diff --git a/pandas/_libs/src/ujson/python/JSONtoObj.c b/pandas/_libs/src/ujson/python/JSONtoObj.c index b0132532c16af7..85cf1d5e5e7a1a 100644 --- a/pandas/_libs/src/ujson/python/JSONtoObj.c +++ b/pandas/_libs/src/ujson/python/JSONtoObj.c @@ -409,7 +409,7 @@ JSOBJ Object_npyEndObject(void *prv, JSOBJ obj) { } int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) { - PyObject *label; + PyObject *label, *labels; npy_intp labelidx; // add key to label array, value to values array NpyArrContext *npyarr = (NpyArrContext *)obj; @@ -424,11 +424,11 @@ int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) { if (!npyarr->labels[labelidx]) { npyarr->labels[labelidx] = PyList_New(0); } - + labels = npyarr->labels[labelidx]; // only fill label array once, assumes all column labels are the same // for 2-dimensional arrays. - if (PyList_GET_SIZE(npyarr->labels[labelidx]) <= npyarr->elcount) { - PyList_Append(npyarr->labels[labelidx], label); + if (PyList_Check(labels) && PyList_GET_SIZE(labels) <= npyarr->elcount) { + PyList_Append(labels, label); } if (((JSONObjectDecoder *)npyarr->dec)->arrayAddItem(prv, obj, value)) { @@ -439,16 +439,16 @@ int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) { } int Object_objectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) { - PyDict_SetItem(obj, name, value); + int ret = PyDict_SetItem(obj, name, value); Py_DECREF((PyObject *)name); Py_DECREF((PyObject *)value); - return 1; + return ret == 0 ? 1 : 0; } int Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value) { - PyList_Append(obj, value); + int ret = PyList_Append(obj, value); Py_DECREF((PyObject *)value); - return 1; + return ret == 0 ? 1 : 0; } JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end) { diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8b1a921536a1dd..6adf154aabba7f 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1716,6 +1716,7 @@ def _set_noconvert_columns(self): # A set of integers will be converted to a list in # the correct order every single time. 
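+            # (CPython happens to iterate small integer sets in sorted
+            # order, but other implementations such as PyPy do not, hence
+            # the explicit sort added below.)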
usecols = list(self.usecols) + usecols.sort() elif (callable(self.usecols) or self.usecols_dtype not in ('empty', None)): # The names attribute should have the correct columns diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index d69fbbcdf4bf60..fa73c9fc7b7225 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -9,7 +9,7 @@ from pandas.tests.indexes.common import Base from pandas.compat import (range, lrange, lzip, u, - text_type, zip, PY3, PY36) + text_type, zip, PY3, PY36, PYPY) import operator import numpy as np @@ -1370,13 +1370,21 @@ def test_isin(self): assert len(result) == 0 assert result.dtype == np.bool_ - def test_isin_nan(self): + @pytest.mark.skipif(PYPY, reason="np.nan is float('nan') on PyPy") + def test_isin_nan_not_pypy(self): + tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([float('nan')]), + np.array([False, False])) + + @pytest.mark.skipif(not PYPY, reason="np.nan is float('nan') on PyPy") + def test_isin_nan_pypy(self): + tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([float('nan')]), + np.array([False, True])) + + def test_isin_nan_common(self): tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([np.nan]), np.array([False, True])) tm.assert_numpy_array_equal(Index(['a', pd.NaT]).isin([pd.NaT]), np.array([False, True])) - tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([float('nan')]), - np.array([False, False])) tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([pd.NaT]), np.array([False, False])) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 798d2444689615..86308192c91665 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -14,7 +14,7 @@ from pandas import (CategoricalIndex, DataFrame, Index, MultiIndex, compat, date_range, period_range) -from pandas.compat import PY3, long, lrange, lzip, range, u +from pandas.compat import PY3, long, lrange, lzip, range, u, PYPY from pandas.errors import PerformanceWarning, UnsortedIndexError from pandas.core.indexes.base import InvalidIndexError from pandas._libs import lib @@ -2571,13 +2571,22 @@ def test_isin(self): assert len(result) == 0 assert result.dtype == np.bool_ - def test_isin_nan(self): + @pytest.mark.skipif(PYPY, reason="tuples cmp recursively on PyPy") + def test_isin_nan_not_pypy(self): idx = MultiIndex.from_arrays([['foo', 'bar'], [1.0, np.nan]]) tm.assert_numpy_array_equal(idx.isin([('bar', np.nan)]), np.array([False, False])) tm.assert_numpy_array_equal(idx.isin([('bar', float('nan'))]), np.array([False, False])) + @pytest.mark.skipif(not PYPY, reason="tuples cmp recursively on PyPy") + def test_isin_nan_pypy(self): + idx = MultiIndex.from_arrays([['foo', 'bar'], [1.0, np.nan]]) + tm.assert_numpy_array_equal(idx.isin([('bar', np.nan)]), + np.array([False, True])) + tm.assert_numpy_array_equal(idx.isin([('bar', float('nan'))]), + np.array([False, True])) + def test_isin_level_kwarg(self): idx = MultiIndex.from_arrays([['qux', 'baz', 'foo', 'bar'], np.arange( 4)]) diff --git a/pandas/tests/io/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py index 2fee2451c5e36f..0ea4757b10e942 100644 --- a/pandas/tests/io/parser/test_parsers.py +++ b/pandas/tests/io/parser/test_parsers.py @@ -3,8 +3,10 @@ import os import pandas.util.testing as tm -from pandas import read_csv, read_table +from pandas import read_csv, read_table, DataFrame from pandas.core.common import AbstractMethodError +from pandas._libs.lib import Timestamp +from 
pandas.compat import StringIO
 
 from .common import ParserTests
 from .header import HeaderTests
 
@@ -100,3 +102,51 @@ def read_table(self, *args, **kwds):
         kwds = kwds.copy()
         kwds['engine'] = self.engine
         return read_table(*args, **kwds)
+
+
+class TestUnsortedUsecols(object):
+    def test_override__set_noconvert_columns(self):
+        # GH 17351 - usecols needs to be sorted in _set_noconvert_columns
+        # based on the test_usecols_with_parse_dates test from usecols.py
+        from pandas.io.parsers import CParserWrapper, TextFileReader
+
+        s = """a,b,c,d,e
+        0,1,20140101,0900,4
+        0,1,20140102,1000,4"""
+
+        parse_dates = [[1, 2]]
+        cols = {
+            'a': [0, 0],
+            'c_d': [
+                Timestamp('2014-01-01 09:00:00'),
+                Timestamp('2014-01-02 10:00:00')
+            ]
+        }
+        expected = DataFrame(cols, columns=['c_d', 'a'])
+
+        class MyTextFileReader(TextFileReader):
+            def __init__(self):
+                self._currow = 0
+                self.squeeze = False
+
+        class MyCParserWrapper(CParserWrapper):
+            def _set_noconvert_columns(self):
+                if self.usecols_dtype == 'integer':
+                    # self.usecols is a set, which is documented as unordered
+                    # but in practice, a CPython set of integers is sorted.
+                    # In other implementations this assumption does not hold.
+                    # The following code simulates a different order, which
+                    # before GH 17351 would cause the wrong columns to be
+                    # converted via the parse_dates parameter
+                    self.usecols = list(self.usecols)
+                    self.usecols.reverse()
+                return CParserWrapper._set_noconvert_columns(self)
+
+        parser = MyTextFileReader()
+        parser.options = {'usecols': [0, 2, 3],
+                          'parse_dates': parse_dates,
+                          'delimiter': ','}
+        parser._engine = MyCParserWrapper(StringIO(s), **parser.options)
+        df = parser.read()
+
+        tm.assert_frame_equal(df, expected)

From 46832ac8f465aa911ba79ebc1b1a4d0f6baf46f9 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Thu, 7 Sep 2017 17:46:12 -0700
Subject: [PATCH 060/188] Replace * imports with explicit imports; remove
 unused declared constants (#17470)

---
 pandas/_libs/src/skiplist.pyx |  1 -
 pandas/_libs/window.pyx       | 38 ++++++-----------------------------
 2 files changed, 6 insertions(+), 33 deletions(-)

diff --git a/pandas/_libs/src/skiplist.pyx b/pandas/_libs/src/skiplist.pyx
index 559b529822a69f..1524dca38d0e07 100644
--- a/pandas/_libs/src/skiplist.pyx
+++ b/pandas/_libs/src/skiplist.pyx
@@ -15,7 +15,6 @@ cdef double Log2(double x):
     return log(x) / log(2.)
 
cimport numpy as np -from numpy cimport * import numpy as np from random import random diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 9fb3d0662eb4f2..b6bd6f92f61998 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1,55 +1,29 @@ # cython: profile=False # cython: boundscheck=False, wraparound=False, cdivision=True -from numpy cimport * +from cython cimport Py_ssize_t + cimport numpy as np import numpy as np cimport cython -import_array() +np.import_array() cimport util from libc.stdlib cimport malloc, free -from numpy cimport NPY_INT8 as NPY_int8 -from numpy cimport NPY_INT16 as NPY_int16 -from numpy cimport NPY_INT32 as NPY_int32 -from numpy cimport NPY_INT64 as NPY_int64 -from numpy cimport NPY_FLOAT16 as NPY_float16 -from numpy cimport NPY_FLOAT32 as NPY_float32 -from numpy cimport NPY_FLOAT64 as NPY_float64 - -from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, - uint32_t, uint64_t, float16_t, float32_t, float64_t) - -int8 = np.dtype(np.int8) -int16 = np.dtype(np.int16) -int32 = np.dtype(np.int32) -int64 = np.dtype(np.int64) -float16 = np.dtype(np.float16) -float32 = np.dtype(np.float32) -float64 = np.dtype(np.float64) - -cdef np.int8_t MINint8 = np.iinfo(np.int8).min -cdef np.int16_t MINint16 = np.iinfo(np.int16).min -cdef np.int32_t MINint32 = np.iinfo(np.int32).min -cdef np.int64_t MINint64 = np.iinfo(np.int64).min -cdef np.float16_t MINfloat16 = np.NINF + +from numpy cimport ndarray, double_t, int64_t, float64_t + cdef np.float32_t MINfloat32 = np.NINF cdef np.float64_t MINfloat64 = np.NINF -cdef np.int8_t MAXint8 = np.iinfo(np.int8).max -cdef np.int16_t MAXint16 = np.iinfo(np.int16).max -cdef np.int32_t MAXint32 = np.iinfo(np.int32).max -cdef np.int64_t MAXint64 = np.iinfo(np.int64).max -cdef np.float16_t MAXfloat16 = np.inf cdef np.float32_t MAXfloat32 = np.inf cdef np.float64_t MAXfloat64 = np.inf cdef double NaN = np.NaN -cdef double nan = NaN cdef inline int int_max(int a, int b): return a if a >= b else b cdef inline int int_min(int a, int b): return a if a <= b else b From 9c4e4c8959853c7cda554d8e9b530efdd8ef9cb1 Mon Sep 17 00:00:00 2001 From: Sam Foo Date: Thu, 7 Sep 2017 20:47:52 -0400 Subject: [PATCH 061/188] Removed Timedelta.is_populated and fixed spelling errors (#17469) --- doc/source/api.rst | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index d34cec86638fba..c32a541d196057 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1704,7 +1704,7 @@ Methods Timestamp.floor Timestamp.freq Timestamp.freqstr - Timestamp.from_ordinal + Timestamp.fromordinal Timestamp.fromtimestamp Timestamp.isocalendar Timestamp.isoformat @@ -1769,9 +1769,7 @@ Properties Timedelta.asm8 Timedelta.components Timedelta.days - Timedelta.delta Timedelta.freq - Timedelta.is_populated Timedelta.max Timedelta.microseconds Timedelta.min @@ -1789,10 +1787,9 @@ Methods Timedelta.floor Timedelta.isoformat Timedelta.round - Timdelta.to_pytimedelta + Timedelta.to_pytimedelta Timedelta.to_timedelta64 Timedelta.total_seconds - Timedelta.view Window ------ From 7e4e8acf5b5d68b3dfadecd3ba816d4f0b9be0ce Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 7 Sep 2017 18:00:05 -0700 Subject: [PATCH 062/188] PERF: Implement get_freq_code in cython frequencies (#17422) --- asv_bench/benchmarks/period.py | 29 ++++ pandas/_libs/tslibs/__init__.py | 0 pandas/_libs/tslibs/frequencies.pyx | 201 ++++++++++++++++++++++++++++ pandas/tseries/frequencies.py | 79 +---------- setup.py | 4 + 
 5 files changed, 235 insertions(+), 78 deletions(-)
 create mode 100644 pandas/_libs/tslibs/__init__.py
 create mode 100644 pandas/_libs/tslibs/frequencies.pyx

diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py
index f9837191a7bae8..78d66295f28cc6 100644
--- a/asv_bench/benchmarks/period.py
+++ b/asv_bench/benchmarks/period.py
@@ -2,6 +2,35 @@
 from pandas import Series, Period, PeriodIndex, date_range
 
+
+class PeriodProperties(object):
+    def setup(self):
+        self.per = Period('2012-06-01', freq='M')
+
+    def time_year(self):
+        self.per.year
+
+    def time_month(self):
+        self.per.month
+
+    def time_quarter(self):
+        self.per.quarter
+
+    def time_day(self):
+        self.per.day
+
+    def time_hour(self):
+        self.per.hour
+
+    def time_minute(self):
+        self.per.minute
+
+    def time_second(self):
+        self.per.second
+
+    def time_leap_year(self):
+        self.per.is_leapyear
+
+
 class Constructor(object):
     goal_time = 0.2

diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/pandas/_libs/tslibs/frequencies.pyx b/pandas/_libs/tslibs/frequencies.pyx
new file mode 100644
index 00000000000000..35429e8ae87f00
--- /dev/null
+++ b/pandas/_libs/tslibs/frequencies.pyx
@@ -0,0 +1,201 @@
+# -*- coding: utf-8 -*-
+# cython: profile=False
+import re
+
+cimport cython
+
+import numpy as np
+cimport numpy as np
+np.import_array()
+
+from util cimport is_integer_object
+
+
+cpdef get_freq_code(freqstr):
+    """
+    Return freq str or tuple to freq code and stride (mult)
+
+    Parameters
+    ----------
+    freqstr : str or tuple
+
+    Returns
+    -------
+    return : tuple of base frequency code and stride (mult)
+
+    Example
+    -------
+    >>> get_freq_code('3D')
+    (6000, 3)
+
+    >>> get_freq_code('D')
+    (6000, 1)
+
+    >>> get_freq_code(('D', 3))
+    (6000, 3)
+    """
+    if getattr(freqstr, '_typ', None) == 'dateoffset':
+        freqstr = (freqstr.rule_code, freqstr.n)
+
+    if isinstance(freqstr, tuple):
+        if (is_integer_object(freqstr[0]) and
+                is_integer_object(freqstr[1])):
+            # e.g., freqstr = (2000, 1)
+            return freqstr
+        else:
+            # e.g., freqstr = ('T', 5)
+            try:
+                code = _period_str_to_code(freqstr[0])
+                stride = freqstr[1]
+            except:
+                if is_integer_object(freqstr[1]):
+                    raise
+                code = _period_str_to_code(freqstr[1])
+                stride = freqstr[0]
+            return code, stride
+
+    if is_integer_object(freqstr):
+        return (freqstr, 1)
+
+    base, stride = _base_and_stride(freqstr)
+    code = _period_str_to_code(base)
+
+    return code, stride
+
+
+# hack to handle WOM-1MON
+opattern = re.compile(
+    r'([\-]?\d*|[\-]?\d*\.\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)'
+)
+
+
+cpdef _base_and_stride(freqstr):
+    """
+    Return base freq and stride info from string representation
+
+    Examples
+    --------
+    _freq_and_stride('5Min') -> 'Min', 5
+    """
+    groups = opattern.match(freqstr)
+
+    if not groups:
+        raise ValueError("Could not evaluate {freq}".format(freq=freqstr))
+
+    stride = groups.group(1)
+
+    if len(stride):
+        stride = int(stride)
+    else:
+        stride = 1
+
+    base = groups.group(2)
+
+    return (base, stride)
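To see what ``_base_and_stride`` above extracts, here is a minimal sketch that reuses the same regex from plain Python; only the scaffolding around the pattern is invented here::

    import re

    opattern = re.compile(
        r'([\-]?\d*|[\-]?\d*\.\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)')

    m = opattern.match('3D')
    assert (m.group(1), m.group(2)) == ('3', 'D')

    # the "WOM-1MON" hack: the stride group matches empty and the
    # digit stays embedded in the base alias
    m = opattern.match('WOM-1MON')
    assert (m.group(1), m.group(2)) == ('', 'WOM-1MON')

+
+
+# ---------------------------------------------------------------------
+# Period codes
+
+# period frequency constants corresponding to scikits timeseries
+# originals
+_period_code_map = {
+    # Annual freqs with various fiscal year ends.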
+ # eg, 2005 for A-FEB runs Mar 1, 2004 to Feb 28, 2005 + "A-DEC": 1000, # Annual - December year end + "A-JAN": 1001, # Annual - January year end + "A-FEB": 1002, # Annual - February year end + "A-MAR": 1003, # Annual - March year end + "A-APR": 1004, # Annual - April year end + "A-MAY": 1005, # Annual - May year end + "A-JUN": 1006, # Annual - June year end + "A-JUL": 1007, # Annual - July year end + "A-AUG": 1008, # Annual - August year end + "A-SEP": 1009, # Annual - September year end + "A-OCT": 1010, # Annual - October year end + "A-NOV": 1011, # Annual - November year end + + # Quarterly frequencies with various fiscal year ends. + # eg, Q42005 for Q-OCT runs Aug 1, 2005 to Oct 31, 2005 + "Q-DEC": 2000, # Quarterly - December year end + "Q-JAN": 2001, # Quarterly - January year end + "Q-FEB": 2002, # Quarterly - February year end + "Q-MAR": 2003, # Quarterly - March year end + "Q-APR": 2004, # Quarterly - April year end + "Q-MAY": 2005, # Quarterly - May year end + "Q-JUN": 2006, # Quarterly - June year end + "Q-JUL": 2007, # Quarterly - July year end + "Q-AUG": 2008, # Quarterly - August year end + "Q-SEP": 2009, # Quarterly - September year end + "Q-OCT": 2010, # Quarterly - October year end + "Q-NOV": 2011, # Quarterly - November year end + + "M": 3000, # Monthly + + "W-SUN": 4000, # Weekly - Sunday end of week + "W-MON": 4001, # Weekly - Monday end of week + "W-TUE": 4002, # Weekly - Tuesday end of week + "W-WED": 4003, # Weekly - Wednesday end of week + "W-THU": 4004, # Weekly - Thursday end of week + "W-FRI": 4005, # Weekly - Friday end of week + "W-SAT": 4006, # Weekly - Saturday end of week + + "B": 5000, # Business days + "D": 6000, # Daily + "H": 7000, # Hourly + "T": 8000, # Minutely + "S": 9000, # Secondly + "L": 10000, # Millisecondly + "U": 11000, # Microsecondly + "N": 12000, # Nanosecondly +} + +# Yearly aliases; careful not to put these in _reverse_period_code_map +_period_code_map.update({'Y' + key[1:]: _period_code_map[key] + for key in _period_code_map + if key.startswith('A-')}) + +_period_code_map.update({ + "Q": 2000, # Quarterly - December year end (default quarterly) + "A": 1000, # Annual + "W": 4000, # Weekly + "C": 5000, # Custom Business Day + }) + +_dont_uppercase = set(('MS', 'ms')) + +_lite_rule_alias = { + 'W': 'W-SUN', + 'Q': 'Q-DEC', + + 'A': 'A-DEC', # YearEnd(month=12), + 'Y': 'A-DEC', + 'AS': 'AS-JAN', # YearBegin(month=1), + 'YS': 'AS-JAN', + 'BA': 'BA-DEC', # BYearEnd(month=12), + 'BY': 'BA-DEC', + 'BAS': 'BAS-JAN', # BYearBegin(month=1), + 'BYS': 'BAS-JAN', + + 'Min': 'T', + 'min': 'T', + 'ms': 'L', + 'us': 'U', + 'ns': 'N'} + +_INVALID_FREQ_ERROR = "Invalid frequency: {0}" + + +cpdef _period_str_to_code(freqstr): + freqstr = _lite_rule_alias.get(freqstr, freqstr) + + if freqstr not in _dont_uppercase: + lower = freqstr.lower() + freqstr = _lite_rule_alias.get(lower, freqstr) + + if freqstr not in _dont_uppercase: + freqstr = freqstr.upper() + try: + return _period_code_map[freqstr] + except KeyError: + raise ValueError(_INVALID_FREQ_ERROR.format(freqstr)) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 7f34bcaf52926e..6644a33245a849 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -8,7 +8,6 @@ from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.common import ( - is_integer, is_period_arraylike, is_timedelta64_dtype, is_datetime64_dtype) @@ -21,6 +20,7 @@ from pandas._libs import lib, tslib from pandas._libs.tslib import Timedelta +from 
pandas._libs.tslibs.frequencies import get_freq_code, _base_and_stride from pytz import AmbiguousTimeError @@ -298,58 +298,6 @@ def get_freq(freq): return freq -def get_freq_code(freqstr): - """ - Return freq str or tuple to freq code and stride (mult) - - Parameters - ---------- - freqstr : str or tuple - - Returns - ------- - return : tuple of base frequency code and stride (mult) - - Example - ------- - >>> get_freq_code('3D') - (6000, 3) - - >>> get_freq_code('D') - (6000, 1) - - >>> get_freq_code(('D', 3)) - (6000, 3) - """ - if isinstance(freqstr, DateOffset): - freqstr = (freqstr.rule_code, freqstr.n) - - if isinstance(freqstr, tuple): - if (is_integer(freqstr[0]) and - is_integer(freqstr[1])): - # e.g., freqstr = (2000, 1) - return freqstr - else: - # e.g., freqstr = ('T', 5) - try: - code = _period_str_to_code(freqstr[0]) - stride = freqstr[1] - except: - if is_integer(freqstr[1]): - raise - code = _period_str_to_code(freqstr[1]) - stride = freqstr[0] - return code, stride - - if is_integer(freqstr): - return (freqstr, 1) - - base, stride = _base_and_stride(freqstr) - code = _period_str_to_code(base) - - return code, stride - - def _get_freq_str(base, mult=1): code = _reverse_period_code_map.get(base) if mult == 1: @@ -577,31 +525,6 @@ def to_offset(freq): ) -def _base_and_stride(freqstr): - """ - Return base freq and stride info from string representation - - Examples - -------- - _freq_and_stride('5Min') -> 'Min', 5 - """ - groups = opattern.match(freqstr) - - if not groups: - raise ValueError("Could not evaluate {freq}".format(freq=freqstr)) - - stride = groups.group(1) - - if len(stride): - stride = int(stride) - else: - stride = 1 - - base = groups.group(2) - - return (base, stride) - - def get_base_alias(freqstr): """ Returns the base frequency alias, e.g., '5D' -> 'D' diff --git a/setup.py b/setup.py index 444db5bc4d275e..4e326beefa9081 100755 --- a/setup.py +++ b/setup.py @@ -341,6 +341,7 @@ class CheckSDist(sdist_class): 'pandas/_libs/window.pyx', 'pandas/_libs/sparse.pyx', 'pandas/_libs/parsers.pyx', + 'panads/_libs/tslibs/frequencies.pyx', 'pandas/io/sas/sas.pyx'] def initialize_options(self): @@ -492,6 +493,8 @@ def pxd(name): 'sources': ['pandas/_libs/src/datetime/np_datetime.c', 'pandas/_libs/src/datetime/np_datetime_strings.c', 'pandas/_libs/src/period_helper.c']}, + '_libs.tslibs.frequencies': {'pyxfile': '_libs/tslibs/frequencies', + 'pxdfiles': ['_libs/src/util']}, '_libs.index': {'pyxfile': '_libs/index', 'sources': ['pandas/_libs/src/datetime/np_datetime.c', 'pandas/_libs/src/datetime/np_datetime_strings.c'], @@ -653,6 +656,7 @@ def pxd(name): 'pandas.io.formats', 'pandas.io.clipboard', 'pandas._libs', + 'pandas._libs.tslibs', 'pandas.plotting', 'pandas.stats', 'pandas.types', From 3ccb88c912d898b2fd8decd3d988aca264e4e820 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 8 Sep 2017 03:05:05 -0700 Subject: [PATCH 063/188] Fix typo in setup.py introduced by 17422 (#17473) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4e326beefa9081..3269fe7972cf0f 100755 --- a/setup.py +++ b/setup.py @@ -341,7 +341,7 @@ class CheckSDist(sdist_class): 'pandas/_libs/window.pyx', 'pandas/_libs/sparse.pyx', 'pandas/_libs/parsers.pyx', - 'panads/_libs/tslibs/frequencies.pyx', + 'pandas/_libs/tslibs/frequencies.pyx', 'pandas/io/sas/sas.pyx'] def initialize_options(self): From d6df8ea99f2574480e934aae01a1e142f935145e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 8 Sep 2017 03:16:13 -0700 Subject: [PATCH 064/188] 
Follow up to #17422 (#17472) --- pandas/_libs/period.pyx | 55 ++++++------ pandas/_libs/tslibs/frequencies.pxd | 4 + pandas/_libs/tslibs/frequencies.pyx | 3 + pandas/tseries/frequencies.py | 128 +--------------------------- 4 files changed, 38 insertions(+), 152 deletions(-) create mode 100644 pandas/_libs/tslibs/frequencies.pxd diff --git a/pandas/_libs/period.pyx b/pandas/_libs/period.pyx index 8f89b812fec04f..e2a3baa8d6e8ba 100644 --- a/pandas/_libs/period.pyx +++ b/pandas/_libs/period.pyx @@ -10,17 +10,16 @@ from cpython cimport ( from numpy cimport (int8_t, int32_t, int64_t, import_array, ndarray, NPY_INT64, NPY_DATETIME, NPY_TIMEDELTA) import numpy as np +import_array() from libc.stdlib cimport free -from pandas import compat from pandas.compat import PY2 cimport cython from datetime cimport ( is_leapyear, - PyDateTime_IMPORT, pandas_datetimestruct, pandas_datetimestruct_to_datetime, pandas_datetime_to_datetimestruct, @@ -29,6 +28,7 @@ from datetime cimport ( cimport util, lib +from util cimport is_period_object, is_string_object from lib cimport is_null_datetimelike, is_period from pandas._libs import tslib, lib @@ -41,6 +41,8 @@ from tslib cimport ( _get_dst_info, _nat_scalar_rules) +from tslibs.frequencies cimport get_freq_code + from pandas.tseries import offsets from pandas.core.tools.datetimes import parse_time_string from pandas.tseries import frequencies @@ -329,8 +331,6 @@ cdef list str_extra_fmts = ["^`AB`^", "^`CD`^", "^`EF`^", "^`GH`^", "^`IJ`^", "^`KL`^"] cdef object _period_strftime(int64_t value, int freq, object fmt): - import sys - cdef: Py_ssize_t i date_info dinfo @@ -683,7 +683,7 @@ cdef class _Period(object): def _maybe_convert_freq(cls, object freq): if isinstance(freq, (int, tuple)): - code, stride = frequencies.get_freq_code(freq) + code, stride = get_freq_code(freq) freq = frequencies._get_freq_str(code, stride) freq = frequencies.to_offset(freq) @@ -707,7 +707,7 @@ cdef class _Period(object): return self def __richcmp__(self, other, op): - if isinstance(other, Period): + if is_period_object(other): if other.freq != self.freq: msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) @@ -753,7 +753,7 @@ cdef class _Period(object): return NotImplemented def __add__(self, other): - if isinstance(self, Period): + if is_period_object(self): if isinstance(other, (timedelta, np.timedelta64, offsets.DateOffset, Timedelta)): @@ -765,13 +765,13 @@ cdef class _Period(object): return Period(ordinal=ordinal, freq=self.freq) else: # pragma: no cover return NotImplemented - elif isinstance(other, Period): + elif is_period_object(other): return other + self else: return NotImplemented def __sub__(self, other): - if isinstance(self, Period): + if is_period_object(self): if isinstance(other, (timedelta, np.timedelta64, offsets.DateOffset, Timedelta)): @@ -780,7 +780,7 @@ cdef class _Period(object): elif lib.is_integer(other): ordinal = self.ordinal - other * self.freq.n return Period(ordinal=ordinal, freq=self.freq) - elif isinstance(other, Period): + elif is_period_object(other): if other.freq != self.freq: msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) @@ -789,7 +789,7 @@ cdef class _Period(object): return -other.__sub__(self) else: # pragma: no cover return NotImplemented - elif isinstance(other, Period): + elif is_period_object(other): if self is NaT: return NaT return NotImplemented @@ -813,8 +813,8 @@ cdef class _Period(object): """ freq = self._maybe_convert_freq(freq) how = 
_validate_end_alias(how) - base1, mult1 = frequencies.get_freq_code(self.freq) - base2, mult2 = frequencies.get_freq_code(freq) + base1, mult1 = get_freq_code(self.freq) + base2, mult2 = get_freq_code(freq) # mult1 can't be negative or 0 end = how == 'E' @@ -860,17 +860,17 @@ cdef class _Period(object): how = _validate_end_alias(how) if freq is None: - base, mult = frequencies.get_freq_code(self.freq) + base, mult = get_freq_code(self.freq) freq = frequencies.get_to_timestamp_base(base) - base, mult = frequencies.get_freq_code(freq) + base, mult = get_freq_code(freq) val = self.asfreq(freq, how) dt64 = period_ordinal_to_dt64(val.ordinal, base) return Timestamp(dt64, tz=tz) cdef _field(self, alias): - base, mult = frequencies.get_freq_code(self.freq) + base, mult = get_freq_code(self.freq) return get_period_field(alias, self.ordinal, base) property year: @@ -935,7 +935,7 @@ cdef class _Period(object): return self.freq.freqstr def __repr__(self): - base, mult = frequencies.get_freq_code(self.freq) + base, mult = get_freq_code(self.freq) formatted = period_format(self.ordinal, base) return "Period('%s', '%s')" % (formatted, self.freqstr) @@ -946,7 +946,7 @@ cdef class _Period(object): Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3. """ - base, mult = frequencies.get_freq_code(self.freq) + base, mult = get_freq_code(self.freq) formatted = period_format(self.ordinal, base) value = ("%s" % formatted) return value @@ -1096,7 +1096,7 @@ cdef class _Period(object): >>> a.strftime('%b. %d, %Y was a %A') 'Jan. 01, 2001 was a Monday' """ - base, mult = frequencies.get_freq_code(self.freq) + base, mult = get_freq_code(self.freq) return period_format(self.ordinal, base, fmt) @@ -1161,10 +1161,10 @@ class Period(_Period): ordinal = _ordinal_from_fields(year, month, quarter, day, hour, minute, second, freq) - elif isinstance(value, Period): + elif is_period_object(value): other = value - if freq is None or frequencies.get_freq_code( - freq) == frequencies.get_freq_code(other.freq): + if freq is None or get_freq_code( + freq) == get_freq_code(other.freq): ordinal = other.ordinal freq = other.freq else: @@ -1174,7 +1174,7 @@ class Period(_Period): elif is_null_datetimelike(value) or value in tslib._nat_strings: ordinal = iNaT - elif isinstance(value, compat.string_types) or lib.is_integer(value): + elif is_string_object(value) or lib.is_integer(value): if lib.is_integer(value): value = str(value) value = value.upper() @@ -1191,7 +1191,7 @@ class Period(_Period): dt = value if freq is None: raise ValueError('Must supply freq for datetime value') - elif isinstance(value, np.datetime64): + elif util.is_datetime64_object(value): dt = Timestamp(value) if freq is None: raise ValueError('Must supply freq for datetime value') @@ -1204,7 +1204,7 @@ class Period(_Period): raise ValueError(msg) if ordinal is None: - base, mult = frequencies.get_freq_code(freq) + base, mult = get_freq_code(freq) ordinal = get_period_ordinal(dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second, dt.microsecond, 0, base) @@ -1214,7 +1214,7 @@ class Period(_Period): def _ordinal_from_fields(year, month, quarter, day, hour, minute, second, freq): - base, mult = frequencies.get_freq_code(freq) + base, mult = get_freq_code(freq) if quarter is not None: year, month = _quarter_to_myear(year, quarter, freq) @@ -1227,8 +1227,7 @@ def _quarter_to_myear(year, quarter, freq): if quarter <= 0 or quarter > 4: raise ValueError('Quarter must be 1 <= q <= 4') - mnum = frequencies._month_numbers[ - 
frequencies._get_rule_month(freq)] + 1 + mnum = tslib._MONTH_NUMBERS[tslib._get_rule_month(freq)] + 1 month = (mnum + (quarter - 1) * 3) % 12 + 1 if month > mnum: year -= 1 diff --git a/pandas/_libs/tslibs/frequencies.pxd b/pandas/_libs/tslibs/frequencies.pxd new file mode 100644 index 00000000000000..974eb4ab45df0c --- /dev/null +++ b/pandas/_libs/tslibs/frequencies.pxd @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- +# cython: profile=False + +cpdef get_freq_code(freqstr) diff --git a/pandas/_libs/tslibs/frequencies.pyx b/pandas/_libs/tslibs/frequencies.pyx index 35429e8ae87f00..f7889d76abbc71 100644 --- a/pandas/_libs/tslibs/frequencies.pyx +++ b/pandas/_libs/tslibs/frequencies.pyx @@ -150,6 +150,9 @@ _period_code_map = { "N": 12000, # Nanosecondly } +_reverse_period_code_map = { + _period_code_map[key]: key for key in _period_code_map} + # Yearly aliases; careful not to put these in _reverse_period_code_map _period_code_map.update({'Y' + key[1:]: _period_code_map[key] for key in _period_code_map diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 6644a33245a849..085a3a784557ba 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -20,7 +20,10 @@ from pandas._libs import lib, tslib from pandas._libs.tslib import Timedelta -from pandas._libs.tslibs.frequencies import get_freq_code, _base_and_stride +from pandas._libs.tslibs.frequencies import ( # noqa + get_freq_code, _base_and_stride, _period_str_to_code, + _INVALID_FREQ_ERROR, opattern, _lite_rule_alias, _dont_uppercase, + _period_code_map, _reverse_period_code_map) from pytz import AmbiguousTimeError @@ -375,27 +378,6 @@ def get_period_alias(offset_str): return _offset_to_period_map.get(offset_str, None) -_lite_rule_alias = { - 'W': 'W-SUN', - 'Q': 'Q-DEC', - - 'A': 'A-DEC', # YearEnd(month=12), - 'Y': 'A-DEC', - 'AS': 'AS-JAN', # YearBegin(month=1), - 'YS': 'AS-JAN', - 'BA': 'BA-DEC', # BYearEnd(month=12), - 'BY': 'BA-DEC', - 'BAS': 'BAS-JAN', # BYearBegin(month=1), - 'BYS': 'BAS-JAN', - - 'Min': 'T', - 'min': 'T', - 'ms': 'L', - 'us': 'U', - 'ns': 'N' -} - - _name_to_offset_map = {'days': Day(1), 'hours': Hour(1), 'minutes': Minute(1), @@ -405,9 +387,6 @@ def get_period_alias(offset_str): 'nanoseconds': Nano(1)} -_INVALID_FREQ_ERROR = "Invalid frequency: {0}" - - @deprecate_kwarg(old_arg_name='freqstr', new_arg_name='freq') def to_offset(freq): """ @@ -519,12 +498,6 @@ def to_offset(freq): return delta -# hack to handle WOM-1MON -opattern = re.compile( - r'([\-]?\d*|[\-]?\d*\.\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)' -) - - def get_base_alias(freqstr): """ Returns the base frequency alias, e.g., '5D' -> 'D' @@ -532,9 +505,6 @@ def get_base_alias(freqstr): return _base_and_stride(freqstr)[0] -_dont_uppercase = set(('MS', 'ms')) - - def get_offset(name): """ Return DateOffset object associated with rule name @@ -583,96 +553,6 @@ def get_standard_freq(freq): # --------------------------------------------------------------------- # Period codes -# period frequency constants corresponding to scikits timeseries -# originals -_period_code_map = { - # Annual freqs with various fiscal year ends. 
-    # eg, 2005 for A-FEB runs Mar 1, 2004 to Feb 28, 2005
-    "A-DEC": 1000,  # Annual - December year end
-    "A-JAN": 1001,  # Annual - January year end
-    "A-FEB": 1002,  # Annual - February year end
-    "A-MAR": 1003,  # Annual - March year end
-    "A-APR": 1004,  # Annual - April year end
-    "A-MAY": 1005,  # Annual - May year end
-    "A-JUN": 1006,  # Annual - June year end
-    "A-JUL": 1007,  # Annual - July year end
-    "A-AUG": 1008,  # Annual - August year end
-    "A-SEP": 1009,  # Annual - September year end
-    "A-OCT": 1010,  # Annual - October year end
-    "A-NOV": 1011,  # Annual - November year end
-
-    # Quarterly frequencies with various fiscal year ends.
-    # eg, Q42005 for Q-OCT runs Aug 1, 2005 to Oct 31, 2005
-    "Q-DEC": 2000,  # Quarterly - December year end
-    "Q-JAN": 2001,  # Quarterly - January year end
-    "Q-FEB": 2002,  # Quarterly - February year end
-    "Q-MAR": 2003,  # Quarterly - March year end
-    "Q-APR": 2004,  # Quarterly - April year end
-    "Q-MAY": 2005,  # Quarterly - May year end
-    "Q-JUN": 2006,  # Quarterly - June year end
-    "Q-JUL": 2007,  # Quarterly - July year end
-    "Q-AUG": 2008,  # Quarterly - August year end
-    "Q-SEP": 2009,  # Quarterly - September year end
-    "Q-OCT": 2010,  # Quarterly - October year end
-    "Q-NOV": 2011,  # Quarterly - November year end
-
-    "M": 3000,  # Monthly
-
-    "W-SUN": 4000,  # Weekly - Sunday end of week
-    "W-MON": 4001,  # Weekly - Monday end of week
-    "W-TUE": 4002,  # Weekly - Tuesday end of week
-    "W-WED": 4003,  # Weekly - Wednesday end of week
-    "W-THU": 4004,  # Weekly - Thursday end of week
-    "W-FRI": 4005,  # Weekly - Friday end of week
-    "W-SAT": 4006,  # Weekly - Saturday end of week
-
-    "B": 5000,  # Business days
-    "D": 6000,  # Daily
-    "H": 7000,  # Hourly
-    "T": 8000,  # Minutely
-    "S": 9000,  # Secondly
-    "L": 10000,  # Millisecondly
-    "U": 11000,  # Microsecondly
-    "N": 12000,  # Nanosecondly
-}
-
-_reverse_period_code_map = {}
-for _k, _v in compat.iteritems(_period_code_map):
-    _reverse_period_code_map[_v] = _k
-
-# Yearly aliases
-year_aliases = {}
-
-for k, v in compat.iteritems(_period_code_map):
-    if k.startswith("A-"):
-        alias = "Y" + k[1:]
-        year_aliases[alias] = v
-
-_period_code_map.update(**year_aliases)
-del year_aliases
-
-_period_code_map.update({
-    "Q": 2000,  # Quarterly - December year end (default quarterly)
-    "A": 1000,  # Annual
-    "W": 4000,  # Weekly
-    "C": 5000,  # Custom Business Day
-})
-
-
-def _period_str_to_code(freqstr):
-    freqstr = _lite_rule_alias.get(freqstr, freqstr)
-
-    if freqstr not in _dont_uppercase:
-        lower = freqstr.lower()
-        freqstr = _lite_rule_alias.get(lower, freqstr)
-
-    if freqstr not in _dont_uppercase:
-        freqstr = freqstr.upper()
-    try:
-        return _period_code_map[freqstr]
-    except KeyError:
-        raise ValueError(_INVALID_FREQ_ERROR.format(freqstr))
-
 
 def infer_freq(index, warn=True):
     """

From fdbc6b8f4b36f07da62fc901b19754f922ae3952 Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Sat, 9 Sep 2017 12:09:08 -0700
Subject: [PATCH 065/188] MAINT: calcurate --> calculate in _doctools.py

---
 pandas/util/_doctools.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/pandas/util/_doctools.py b/pandas/util/_doctools.py
index cbc9518b96416f..d654c78b8b13fe 100644
--- a/pandas/util/_doctools.py
+++ b/pandas/util/_doctools.py
@@ -15,12 +15,18 @@ def __init__(self, cell_width=0.37, cell_height=0.25, font_size=7.5):
         self.font_size = font_size
 
     def _shape(self, df):
-        """Calcurate table chape considering index levels"""
+        """
+        Calculate table shape considering index levels.
+ """ + row, col = df.shape return row + df.columns.nlevels, col + df.index.nlevels def _get_cells(self, left, right, vertical): - """Calcurate appropriate figure size based on left and right data""" + """ + Calculate appropriate figure size based on left and right data. + """ + if vertical: # calcurate required number of cells vcells = max(sum([self._shape(l)[0] for l in left]), From 23050dca1b404d23527132c0277f3d40dc41cab8 Mon Sep 17 00:00:00 2001 From: Matt Bark Date: Sun, 10 Sep 2017 03:30:48 -0400 Subject: [PATCH 066/188] BUG: Fix TypeError caused by GH13374 (#17465) --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/io/parsers.py | 4 +++- pandas/tests/io/parser/python_parser_only.py | 19 +++++++++++++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index f50052347cfb56..bfe7d974a60972 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -411,6 +411,7 @@ I/O - Bug in :func:`read_csv` when called with a single-element list ``header`` would return a ``DataFrame`` of all NaN values (:issue:`7757`) - Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`) - Bug in :func:`read_html` where import check fails when run in multiple threads (:issue:`16928`) +- Bug in :func:`read_csv` where automatic delimiter detection caused a ``TypeError`` to be thrown when a bad line was encountered rather than the correct error message (:issue:`13374`) Plotting ^^^^^^^^ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 6adf154aabba7f..d9e83176d0d6e3 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2836,7 +2836,9 @@ def _rows_to_cols(self, content): for row_num, actual_len in bad_lines: msg = ('Expected %d fields in line %d, saw %d' % (col_len, row_num + 1, actual_len)) - if len(self.delimiter) > 1 and self.quoting != csv.QUOTE_NONE: + if (self.delimiter and + len(self.delimiter) > 1 and + self.quoting != csv.QUOTE_NONE): # see gh-13374 reason = ('Error could possibly be due to quotes being ' 'ignored when a multi-char delimiter is used.') diff --git a/pandas/tests/io/parser/python_parser_only.py b/pandas/tests/io/parser/python_parser_only.py index a0784d3aeae2d0..c3dc91b3f188c4 100644 --- a/pandas/tests/io/parser/python_parser_only.py +++ b/pandas/tests/io/parser/python_parser_only.py @@ -218,6 +218,25 @@ def test_multi_char_sep_quotes(self): self.read_csv(StringIO(data), sep=',,', quoting=csv.QUOTE_NONE) + def test_none_delimiter(self): + # see gh-13374 and gh-17465 + + data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9" + expected = DataFrame({'a': [0, 7], + 'b': [1, 8], + 'c': [2, 9]}) + + # We expect the third line in the data to be + # skipped because it is malformed, + # but we do not expect any errors to occur. 
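As an aside on the ``_rows_to_cols`` guard above: with ``sep=None``, delimiter sniffing leaves the delimiter as ``None``, which is exactly the case the fix protects against. A tiny sketch (the ``delimiter`` variable here is hypothetical, standing in for ``self.delimiter``)::

    delimiter = None
    # without the added `self.delimiter and` short-circuit, evaluating
    # len(delimiter) raised:
    #     TypeError: object of type 'NoneType' has no len()
    # instead of reporting the malformed line
    assert not (delimiter and len(delimiter) > 1)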
+ result = self.read_csv(StringIO(data), header=0, + sep=None, + error_bad_lines=False, + warn_bad_lines=True, + engine='python', + tupleize_cols=True) + tm.assert_frame_equal(result, expected) + def test_skipfooter_bad_row(self): # see gh-13879 # see gh-15910 From c3ad501ed31e2e71ab91a201ed72779fdd597698 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 10 Sep 2017 07:19:52 -0700 Subject: [PATCH 067/188] Remove incorrect kwds from DateOffset tests (#17486) --- pandas/tests/tseries/test_offsets.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/tests/tseries/test_offsets.py b/pandas/tests/tseries/test_offsets.py index e03b3e0a85e5e6..7e6e85f322fe0f 100644 --- a/pandas/tests/tseries/test_offsets.py +++ b/pandas/tests/tseries/test_offsets.py @@ -111,7 +111,10 @@ def offset_types(self): def _get_offset(self, klass, value=1, normalize=False): # create instance from offset class - if klass is FY5253 or klass is FY5253Quarter: + if klass is FY5253: + klass = klass(n=value, startingMonth=1, weekday=1, + variation='last', normalize=normalize) + elif klass is FY5253Quarter: klass = klass(n=value, startingMonth=1, weekday=1, qtr_with_extra_week=1, variation='last', normalize=normalize) @@ -2629,7 +2632,7 @@ def test_offset(self): def test_day_of_month(self): dt = datetime(2007, 1, 1) - offset = MonthEnd(day=20) + offset = MonthEnd() result = dt + offset assert result == Timestamp(2007, 1, 31) @@ -3678,7 +3681,7 @@ def test_onOffset(self): 1, startingMonth=8, weekday=WeekDay.THU, qtr_with_extra_week=4) offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12, - variation="nearest", qtr_with_extra_week=4) + variation="nearest") tests = [ # From Wikipedia From e6aed2ebb7374ed2a6a7c284750d47728aec285e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 10 Sep 2017 07:43:37 -0700 Subject: [PATCH 068/188] Remove pyx dependencies from setup (#17478) --- setup.py | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/setup.py b/setup.py index 3269fe7972cf0f..d64a78db7500ac 100755 --- a/setup.py +++ b/setup.py @@ -347,14 +347,6 @@ class CheckSDist(sdist_class): def initialize_options(self): sdist_class.initialize_options(self) - ''' - self._pyxfiles = [] - for root, dirs, files in os.walk('pandas'): - for f in files: - if f.endswith('.pyx'): - self._pyxfiles.append(pjoin(root, f)) - ''' - def run(self): if 'cython' in cmdclass: self.run_command('cython') @@ -479,11 +471,10 @@ def pxd(name): '_libs.lib': {'pyxfile': '_libs/lib', 'depends': lib_depends + tseries_depends}, '_libs.hashtable': {'pyxfile': '_libs/hashtable', - 'pxdfiles': ['_libs/hashtable'], 'depends': (['pandas/_libs/src/klib/khash_python.h'] + _pxi_dep['hashtable'])}, '_libs.tslib': {'pyxfile': '_libs/tslib', - 'pxdfiles': ['_libs/src/util', '_libs/lib'], + 'pxdfiles': ['_libs/src/util'], 'depends': tseries_depends, 'sources': ['pandas/_libs/src/datetime/np_datetime.c', 'pandas/_libs/src/datetime/np_datetime_strings.c', @@ -498,21 +489,20 @@ def pxd(name): '_libs.index': {'pyxfile': '_libs/index', 'sources': ['pandas/_libs/src/datetime/np_datetime.c', 'pandas/_libs/src/datetime/np_datetime_strings.c'], - 'pxdfiles': ['_libs/src/util', '_libs/hashtable'], + 'pxdfiles': ['_libs/src/util'], 'depends': _pxi_dep['index']}, '_libs.algos': {'pyxfile': '_libs/algos', - 'pxdfiles': ['_libs/src/util', '_libs/algos', '_libs/hashtable'], + 'pxdfiles': ['_libs/src/util'], 'depends': _pxi_dep['algos']}, '_libs.groupby': {'pyxfile': '_libs/groupby', - 'pxdfiles': 
['_libs/src/util', '_libs/algos'],
-                      'depends': _pxi_dep['groupby']},
+                      'pxdfiles': ['_libs/src/util'],
+                      'depends': _pxi_dep['groupby']},
     '_libs.join': {'pyxfile': '_libs/join',
-                   'pxdfiles': ['_libs/src/util', '_libs/hashtable'],
+                   'pxdfiles': ['_libs/src/util'],
                    'depends': _pxi_dep['join']},
     '_libs.reshape': {'pyxfile': '_libs/reshape',
                       'depends': _pxi_dep['reshape']},
     '_libs.interval': {'pyxfile': '_libs/interval',
-                       'pxdfiles': ['_libs/hashtable'],
                        'depends': _pxi_dep['interval']},
     '_libs.window': {'pyxfile': '_libs/window',
                      'pxdfiles': ['_libs/src/skiplist', '_libs/src/util'],
@@ -525,12 +515,9 @@ def pxd(name):
                      'sources': ['pandas/_libs/src/parser/tokenizer.c',
                                  'pandas/_libs/src/parser/io.c']},
     '_libs.sparse': {'pyxfile': '_libs/sparse',
-                     'depends': (['pandas/_libs/sparse.pyx'] +
-                                 _pxi_dep['sparse'])},
-    '_libs.testing': {'pyxfile': '_libs/testing',
-                      'depends': ['pandas/_libs/testing.pyx']},
-    '_libs.hashing': {'pyxfile': '_libs/hashing',
-                      'depends': ['pandas/_libs/hashing.pyx']},
+                     'depends': _pxi_dep['sparse']},
+    '_libs.testing': {'pyxfile': '_libs/testing'},
+    '_libs.hashing': {'pyxfile': '_libs/hashing'},
     'io.sas._sas': {'pyxfile': 'io/sas/sas'},
 }

From 42ed4f143f8b0b386c90df9fa8a55d0f2e5a857c Mon Sep 17 00:00:00 2001
From: Licht Takeuchi
Date: Mon, 11 Sep 2017 09:01:41 +0900
Subject: [PATCH 069/188] ENH: Add Styler.where (#17474)

---
 doc/source/api.rst                    |  1 +
 doc/source/whatsnew/v0.21.0.txt       |  1 +
 pandas/io/formats/style.py            | 42 +++++++++++++++++++
 pandas/tests/io/formats/test_style.py | 58 +++++++++++++++++++++++++++
 4 files changed, 102 insertions(+)

diff --git a/doc/source/api.rst b/doc/source/api.rst
index c32a541d196057..27a4ab9cc6cbc4 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -2062,6 +2062,7 @@ Style Application
 
    Styler.apply
    Styler.applymap
+   Styler.where
    Styler.format
    Styler.set_precision
    Styler.set_table_styles

diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index bfe7d974a60972..eccd71f45ec276 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -112,6 +112,7 @@ Other Enhancements
 - `read_*` methods can now infer compression from non-string paths, such as ``pathlib.Path`` objects (:issue:`17206`).
 - :func:`pd.read_sas()` now recognizes much more of the most frequently used date (datetime) formats in SAS7BDAT files (:issue:`15871`).
 - :func:`DataFrame.items` and :func:`Series.items` is now present in both Python 2 and 3 and is lazy in all cases (:issue:`13918`, :issue:`17213`)
+- :func:`Styler.where` has been implemented. It is a convenience for :func:`Styler.applymap` and enables simple DataFrame styling on the Jupyter notebook (:issue:`17474`).

diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py
index 87d672197be300..d7677e3642c26e 100644
--- a/pandas/io/formats/style.py
+++ b/pandas/io/formats/style.py
@@ -618,11 +618,53 @@ def applymap(self, func, subset=None, **kwargs):
         -------
         self : Styler
 
+        See Also
+        --------
+        Styler.where
+
         """
         self._todo.append((lambda instance: getattr(instance, '_applymap'),
                            (func, subset), kwargs))
         return self
 
+    def where(self, cond, value, other=None, subset=None, **kwargs):
+        """
+        Apply a function elementwise, updating the HTML
+        representation with a style selected according to the
+        return value of ``cond``.
+
+        ..
versionadded:: 0.21.0 + + Parameters + ---------- + cond : callable + ``cond`` should take a scalar and return a boolean + value : str + applied when ``cond`` returns true + other : str + applied when ``cond`` returns false + subset : IndexSlice + a valid indexer to limit ``data`` to *before* applying the + function. Consider using a pandas.IndexSlice + kwargs : dict + pass along to ``cond`` + + Returns + ------- + self : Styler + + See Also + -------- + Styler.applymap + + """ + + if other is None: + other = '' + + return self.applymap(lambda val: value if cond(val) else other, + subset=subset, **kwargs) + def set_precision(self, precision): """ Set the precision used to render. diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index 59d9f938734abf..811381e4cbd2ad 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -265,6 +265,64 @@ def f(x): col in self.df.loc[slice_].columns) assert result == expected + def test_where_with_one_style(self): + # GH 17474 + def f(x): + return x > 0.5 + + style1 = 'foo: bar' + + result = self.df.style.where(f, style1)._compute().ctx + expected = dict(((r, c), + [style1 if f(self.df.loc[row, col]) else '']) + for r, row in enumerate(self.df.index) + for c, col in enumerate(self.df.columns)) + assert result == expected + + def test_where_subset(self): + # GH 17474 + def f(x): + return x > 0.5 + + style1 = 'foo: bar' + style2 = 'baz: foo' + + slices = [pd.IndexSlice[:], pd.IndexSlice[:, ['A']], + pd.IndexSlice[[1], :], pd.IndexSlice[[1], ['A']], + pd.IndexSlice[:2, ['A', 'B']]] + + for slice_ in slices: + result = self.df.style.where(f, style1, style2, + subset=slice_)._compute().ctx + expected = dict(((r, c), + [style1 if f(self.df.loc[row, col]) else style2]) + for r, row in enumerate(self.df.index) + for c, col in enumerate(self.df.columns) + if row in self.df.loc[slice_].index and + col in self.df.loc[slice_].columns) + assert result == expected + + def test_where_subset_compare_with_applymap(self): + # GH 17474 + def f(x): + return x > 0.5 + + style1 = 'foo: bar' + style2 = 'baz: foo' + + def g(x): + return style1 if f(x) else style2 + + slices = [pd.IndexSlice[:], pd.IndexSlice[:, ['A']], + pd.IndexSlice[[1], :], pd.IndexSlice[[1], ['A']], + pd.IndexSlice[:2, ['A', 'B']]] + + for slice_ in slices: + result = self.df.style.where(f, style1, style2, + subset=slice_)._compute().ctx + expected = self.df.style.applymap(g, subset=slice_)._compute().ctx + assert result == expected + def test_empty(self): df = pd.DataFrame({'A': [1, 0]}) s = df.style From f3b6d1f91643d245d6b43b41e7c9fd1349fb8de5 Mon Sep 17 00:00:00 2001 From: rvernica Date: Mon, 11 Sep 2017 04:03:18 -0700 Subject: [PATCH 070/188] Add file-like object to docs (#17492) --- pandas/io/feather_format.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 87a4931421d7d4..b2bf4ab7ff7f1d 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -41,8 +41,7 @@ def to_feather(df, path): Parameters ---------- df : DataFrame - path : string - File path + path : string file path, or file-like object """ path = _stringify_path(path) @@ -92,8 +91,7 @@ def read_feather(path, nthreads=1): Parameters ---------- - path : string - File path + path : string file path, or file-like object nthreads : int, default 1 Number of CPU threads to use when reading to pandas.DataFrame From 46856c3936540a47df719d10a7699eb35673e4a4 Mon Sep 17 
00:00:00 2001 From: jbrockmendel Date: Mon, 11 Sep 2017 04:22:56 -0700 Subject: [PATCH 071/188] Implement _is_utc in timezones (#17419) --- pandas/_libs/index.pyx | 7 +------ pandas/_libs/period.pyx | 2 +- pandas/_libs/tslib.pxd | 1 - pandas/_libs/tslib.pyx | 4 ++-- pandas/_libs/tslibs/__init__.py | 2 ++ pandas/_libs/tslibs/timezones.pxd | 4 ++++ pandas/_libs/tslibs/timezones.pyx | 12 ++++++++++++ setup.py | 2 ++ 8 files changed, 24 insertions(+), 10 deletions(-) create mode 100644 pandas/_libs/tslibs/timezones.pxd create mode 100644 pandas/_libs/tslibs/timezones.pyx diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 42ba0c1cadaec1..bf4d53683c9b71 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -17,6 +17,7 @@ cimport tslib from hashtable cimport HashTable +from tslibs.timezones cimport _is_utc from pandas._libs import tslib, algos, hashtable as _hash from pandas._libs.tslib import Timestamp, Timedelta from datetime import datetime, timedelta @@ -32,9 +33,6 @@ cdef extern from "datetime.h": cdef int64_t iNaT = util.get_nat() -from dateutil.tz import tzutc as _du_utc -import pytz -UTC = pytz.utc PyDateTime_IMPORT @@ -559,9 +557,6 @@ cdef inline _to_i8(object val): return ival return val -cdef inline bint _is_utc(object tz): - return tz is UTC or isinstance(tz, _du_utc) - cdef class MultiIndexObjectEngine(ObjectEngine): """ diff --git a/pandas/_libs/period.pyx b/pandas/_libs/period.pyx index e2a3baa8d6e8ba..08962bca824cac 100644 --- a/pandas/_libs/period.pyx +++ b/pandas/_libs/period.pyx @@ -34,9 +34,9 @@ from lib cimport is_null_datetimelike, is_period from pandas._libs import tslib, lib from pandas._libs.tslib import (Timedelta, Timestamp, iNaT, NaT, _get_utcoffset) +from tslibs.timezones cimport _is_utc from tslib cimport ( maybe_get_tz, - _is_utc, _is_tzlocal, _get_dst_info, _nat_scalar_rules) diff --git a/pandas/_libs/tslib.pxd b/pandas/_libs/tslib.pxd index aa8cbcb2cedc72..1d81c3cc15cd89 100644 --- a/pandas/_libs/tslib.pxd +++ b/pandas/_libs/tslib.pxd @@ -3,7 +3,6 @@ from numpy cimport ndarray, int64_t cdef convert_to_tsobject(object, object, object, bint, bint) cpdef convert_to_timedelta64(object, object) cpdef object maybe_get_tz(object) -cdef bint _is_utc(object) cdef bint _is_tzlocal(object) cdef object _get_dst_info(object) cdef bint _nat_scalar_rules[6] diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 7e009652f7f0c6..b1f794a0030d12 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -107,6 +107,8 @@ cdef int64_t NPY_NAT = util.get_nat() iNaT = NPY_NAT +from tslibs.timezones cimport _is_utc + cdef inline object create_timestamp_from_ts( int64_t value, pandas_datetimestruct dts, object tz, object freq): @@ -1713,8 +1715,6 @@ def _localize_pydatetime(object dt, object tz): def get_timezone(tz): return _get_zone(tz) -cdef inline bint _is_utc(object tz): - return tz is UTC or isinstance(tz, _dateutil_tzutc) cdef inline object _get_zone(object tz): """ diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index e69de29bb2d1d6..f3aa0424f03769 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -0,0 +1,2 @@ +# -*- coding: utf-8 -*- +# cython: profile=False diff --git a/pandas/_libs/tslibs/timezones.pxd b/pandas/_libs/tslibs/timezones.pxd new file mode 100644 index 00000000000000..0708282abe1d0a --- /dev/null +++ b/pandas/_libs/tslibs/timezones.pxd @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- +# cython: profile=False + +cdef bint _is_utc(object tz) diff --git 
a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx new file mode 100644 index 00000000000000..43709e77b70d56 --- /dev/null +++ b/pandas/_libs/tslibs/timezones.pyx @@ -0,0 +1,12 @@ +# -*- coding: utf-8 -*- +# cython: profile=False + +# dateutil compat +from dateutil.tz import tzutc as _dateutil_tzutc + +import pytz +UTC = pytz.utc + + +cdef inline bint _is_utc(object tz): + return tz is UTC or isinstance(tz, _dateutil_tzutc) diff --git a/setup.py b/setup.py index d64a78db7500ac..434ca644739165 100755 --- a/setup.py +++ b/setup.py @@ -341,6 +341,7 @@ class CheckSDist(sdist_class): 'pandas/_libs/window.pyx', 'pandas/_libs/sparse.pyx', 'pandas/_libs/parsers.pyx', + 'pandas/_libs/tslibs/timezones.pyx', 'pandas/_libs/tslibs/frequencies.pyx', 'pandas/io/sas/sas.pyx'] @@ -479,6 +480,7 @@ def pxd(name): 'sources': ['pandas/_libs/src/datetime/np_datetime.c', 'pandas/_libs/src/datetime/np_datetime_strings.c', 'pandas/_libs/src/period_helper.c']}, + '_libs.tslibs.timezones': {'pyxfile': '_libs/tslibs/timezones'}, '_libs.period': {'pyxfile': '_libs/period', 'depends': tseries_depends, 'sources': ['pandas/_libs/src/datetime/np_datetime.c', From 34cc2e812f60687d2a4417ff26fc180f7c042674 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 12 Sep 2017 03:09:50 -0700 Subject: [PATCH 072/188] Follow-up to #17419 (#17497) --- pandas/_libs/period.pyx | 5 +-- pandas/_libs/src/inference.pyx | 7 ++-- pandas/_libs/tslib.pxd | 1 - pandas/_libs/tslib.pyx | 66 ++++-------------------------- pandas/_libs/tslibs/timezones.pxd | 8 ++++ pandas/_libs/tslibs/timezones.pyx | 68 ++++++++++++++++++++++++++++++- 6 files changed, 88 insertions(+), 67 deletions(-) diff --git a/pandas/_libs/period.pyx b/pandas/_libs/period.pyx index 08962bca824cac..2b0734f5cf2e7a 100644 --- a/pandas/_libs/period.pyx +++ b/pandas/_libs/period.pyx @@ -33,11 +33,10 @@ from util cimport is_period_object, is_string_object from lib cimport is_null_datetimelike, is_period from pandas._libs import tslib, lib from pandas._libs.tslib import (Timedelta, Timestamp, iNaT, - NaT, _get_utcoffset) -from tslibs.timezones cimport _is_utc + NaT) +from tslibs.timezones cimport _is_utc, _is_tzlocal, _get_utcoffset from tslib cimport ( maybe_get_tz, - _is_tzlocal, _get_dst_info, _nat_scalar_rules) diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 6b5a8f20f00671..95145ff49b02fd 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -2,7 +2,8 @@ import sys from decimal import Decimal cimport util cimport cython -from tslib import NaT, get_timezone +from tslib import NaT +from tslibs.timezones cimport _get_zone from datetime import datetime, timedelta iNaT = util.get_nat() @@ -900,13 +901,13 @@ cpdef bint is_datetime_with_singletz_array(ndarray[object] values): for i in range(n): base_val = values[i] if base_val is not NaT: - base_tz = get_timezone(getattr(base_val, 'tzinfo', None)) + base_tz = _get_zone(getattr(base_val, 'tzinfo', None)) for j in range(i, n): val = values[j] if val is not NaT: tz = getattr(val, 'tzinfo', None) - if base_tz != tz and base_tz != get_timezone(tz): + if base_tz != tz and base_tz != _get_zone(tz): return False break diff --git a/pandas/_libs/tslib.pxd b/pandas/_libs/tslib.pxd index 1d81c3cc15cd89..c1b25963a62571 100644 --- a/pandas/_libs/tslib.pxd +++ b/pandas/_libs/tslib.pxd @@ -3,7 +3,6 @@ from numpy cimport ndarray, int64_t cdef convert_to_tsobject(object, object, object, bint, bint) cpdef convert_to_timedelta64(object, object) cpdef object 
maybe_get_tz(object) -cdef bint _is_tzlocal(object) cdef object _get_dst_info(object) cdef bint _nat_scalar_rules[6] cdef bint _check_all_nulls(obj) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index b1f794a0030d12..a8ae0fcd733d6c 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -107,7 +107,13 @@ cdef int64_t NPY_NAT = util.get_nat() iNaT = NPY_NAT -from tslibs.timezones cimport _is_utc +from tslibs.timezones cimport ( + _is_utc, _is_tzlocal, + _treat_tz_as_dateutil, _treat_tz_as_pytz, + _get_zone, + _get_utcoffset) +from tslibs.timezones import get_timezone, _get_utcoffset # noqa + cdef inline object create_timestamp_from_ts( int64_t value, pandas_datetimestruct dts, @@ -235,10 +241,6 @@ def ints_to_pytimedelta(ndarray[int64_t] arr, box=False): return result -cdef inline bint _is_tzlocal(object tz): - return isinstance(tz, _dateutil_tzlocal) - - cdef inline bint _is_fixed_offset(object tz): if _treat_tz_as_dateutil(tz): if len(tz._trans_idx) == 0 and len(tz._trans_list) == 0: @@ -1443,11 +1445,6 @@ cdef class _TSObject: def __get__(self): return self.value -cpdef _get_utcoffset(tzinfo, obj): - try: - return tzinfo._utcoffset - except AttributeError: - return tzinfo.utcoffset(obj) # helper to extract datetime and int64 from several different possibilities cdef convert_to_tsobject(object ts, object tz, object unit, @@ -1712,48 +1709,6 @@ def _localize_pydatetime(object dt, object tz): return dt.replace(tzinfo=tz) -def get_timezone(tz): - return _get_zone(tz) - - -cdef inline object _get_zone(object tz): - """ - We need to do several things here: - 1) Distinguish between pytz and dateutil timezones - 2) Not be over-specific (e.g. US/Eastern with/without DST is same *zone* - but a different tz object) - 3) Provide something to serialize when we're storing a datetime object - in pytables. - - We return a string prefaced with dateutil if it's a dateutil tz, else just - the tz name. It needs to be a string so that we can serialize it with - UJSON/pytables. maybe_get_tz (below) is the inverse of this process. - """ - if _is_utc(tz): - return 'UTC' - else: - if _treat_tz_as_dateutil(tz): - if '.tar.gz' in tz._filename: - raise ValueError( - 'Bad tz filename. Dateutil on python 3 on windows has a ' - 'bug which causes tzfile._filename to be the same for all ' - 'timezone files. Please construct dateutil timezones ' - 'implicitly by passing a string like "dateutil/Europe' - '/London" when you construct your pandas objects instead ' - 'of passing a timezone object. See ' - 'https://github.com/pandas-dev/pandas/pull/7362') - return 'dateutil/' + tz._filename - else: - # tz is a pytz timezone or unknown. - try: - zone = tz.zone - if zone is None: - return tz - return zone - except AttributeError: - return tz - - cpdef inline object maybe_get_tz(object tz): """ (Maybe) Construct a timezone object from a string. If tz is a string, use @@ -4285,13 +4240,6 @@ def tz_convert_single(int64_t val, object tz1, object tz2): # Timezone data caches, key is the pytz string or dateutil file name. 
dst_cache = {} -cdef inline bint _treat_tz_as_pytz(object tz): - return hasattr(tz, '_utc_transition_times') and hasattr( - tz, '_transition_info') - -cdef inline bint _treat_tz_as_dateutil(object tz): - return hasattr(tz, '_trans_list') and hasattr(tz, '_trans_idx') - def _p_tz_cache_key(tz): """ Python interface for cache function to facilitate testing.""" diff --git a/pandas/_libs/tslibs/timezones.pxd b/pandas/_libs/tslibs/timezones.pxd index 0708282abe1d0a..897bd8af7e2deb 100644 --- a/pandas/_libs/tslibs/timezones.pxd +++ b/pandas/_libs/tslibs/timezones.pxd @@ -2,3 +2,11 @@ # cython: profile=False cdef bint _is_utc(object tz) +cdef bint _is_tzlocal(object tz) + +cdef bint _treat_tz_as_pytz(object tz) +cdef bint _treat_tz_as_dateutil(object tz) + +cdef object _get_zone(object tz) + +cpdef _get_utcoffset(tzinfo, obj) diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 43709e77b70d56..249eedef4bb098 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -2,7 +2,9 @@ # cython: profile=False # dateutil compat -from dateutil.tz import tzutc as _dateutil_tzutc +from dateutil.tz import ( + tzutc as _dateutil_tzutc, + tzlocal as _dateutil_tzlocal) import pytz UTC = pytz.utc @@ -10,3 +12,67 @@ UTC = pytz.utc cdef inline bint _is_utc(object tz): return tz is UTC or isinstance(tz, _dateutil_tzutc) + + +cdef inline bint _is_tzlocal(object tz): + return isinstance(tz, _dateutil_tzlocal) + + +cdef inline bint _treat_tz_as_pytz(object tz): + return hasattr(tz, '_utc_transition_times') and hasattr( + tz, '_transition_info') + + +cdef inline bint _treat_tz_as_dateutil(object tz): + return hasattr(tz, '_trans_list') and hasattr(tz, '_trans_idx') + + +cdef inline object _get_zone(object tz): + """ + We need to do several things here: + 1) Distinguish between pytz and dateutil timezones + 2) Not be over-specific (e.g. US/Eastern with/without DST is same *zone* + but a different tz object) + 3) Provide something to serialize when we're storing a datetime object + in pytables. + + We return a string prefaced with dateutil if it's a dateutil tz, else just + the tz name. It needs to be a string so that we can serialize it with + UJSON/pytables. maybe_get_tz (below) is the inverse of this process. + """ + if _is_utc(tz): + return 'UTC' + else: + if _treat_tz_as_dateutil(tz): + if '.tar.gz' in tz._filename: + raise ValueError( + 'Bad tz filename. Dateutil on python 3 on windows has a ' + 'bug which causes tzfile._filename to be the same for all ' + 'timezone files. Please construct dateutil timezones ' + 'implicitly by passing a string like "dateutil/Europe' + '/London" when you construct your pandas objects instead ' + 'of passing a timezone object. See ' + 'https://github.com/pandas-dev/pandas/pull/7362') + return 'dateutil/' + tz._filename + else: + # tz is a pytz timezone or unknown. 
+            try:
+                zone = tz.zone
+                if zone is None:
+                    return tz
+                return zone
+            except AttributeError:
+                return tz
+
+
+def get_timezone(tz):
+    return _get_zone(tz)
+
+#----------------------------------------------------------------------
+# UTC Offsets
+
+cpdef _get_utcoffset(tzinfo, obj):
+    try:
+        return tzinfo._utcoffset
+    except AttributeError:
+        return tzinfo.utcoffset(obj)

From 9a8427404efb3df5deda12f76352725d628adf5e Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Tue, 12 Sep 2017 06:26:02 -0400
Subject: [PATCH 073/188] DOC: fix parquet example to not use ns

---
 doc/source/io.rst | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index de3150035c446b..8fbb23769492e4 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -4515,8 +4515,7 @@ See the documentation for `pyarrow `__ and
              'd': np.arange(4.0, 7.0, dtype='float64'),
              'e': [True, False, True],
              'f': pd.date_range('20130101', periods=3),
-             'g': pd.date_range('20130101', periods=3, tz='US/Eastern'),
-             'h': pd.date_range('20130101', periods=3, freq='ns')})
+             'g': pd.date_range('20130101', periods=3, tz='US/Eastern')})
    df
    df.dtypes

From d46b027e793e0f7b03a9372b82ac68cd35c1f35f Mon Sep 17 00:00:00 2001
From: Michael Penkov
Date: Tue, 12 Sep 2017 19:31:32 +0900
Subject: [PATCH 074/188] Prevent UnicodeDecodeError in pivot_table under Py2
 (#17489)

---
 doc/source/whatsnew/v0.20.0.txt    |  1 +
 pandas/core/reshape/pivot.py       |  2 +-
 pandas/tests/reshape/test_pivot.py | 10 ++++++++++
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 9d475390175b28..fe24f8f4991727 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -1705,6 +1705,7 @@ Reshaping
 - Bug in ``pd.concat()`` in which concatenating with an empty dataframe with ``join='inner'`` was being improperly handled (:issue:`15328`)
 - Bug with ``sort=True`` in ``DataFrame.join`` and ``pd.merge`` when joining on indexes (:issue:`15582`)
 - Bug in ``DataFrame.nsmallest`` and ``DataFrame.nlargest`` where identical values resulted in duplicated rows (:issue:`15297`)
+- Bug in :func:`pandas.pivot_table` incorrectly raising ``UnicodeError`` when passing unicode input for the ``margins`` keyword (:issue:`13292`)
 
 Numeric
 ^^^^^^^

diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
index f07123ca184895..d19de6030d4736 100644
--- a/pandas/core/reshape/pivot.py
+++ b/pandas/core/reshape/pivot.py
@@ -145,7 +145,7 @@ def _add_margins(table, data, values, rows, cols, aggfunc,
     if not isinstance(margins_name, compat.string_types):
         raise ValueError('margins_name argument must be a string')
 
-    msg = 'Conflicting name "{name}" in margins'.format(name=margins_name)
+    msg = u'Conflicting name "{name}" in margins'.format(name=margins_name)
     for level in table.index.names:
         if margins_name in table.index.get_level_values(level):
             raise ValueError(msg)

diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index 879ac96680fbb5..bd8a999ce23304 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -1625,3 +1625,13 @@ def test_isleapyear_deprecate(self):
         with tm.assert_produces_warning(FutureWarning,
                                         check_stacklevel=False):
             assert isleapyear(2004)
+
+    def test_pivot_margins_name_unicode(self):
+        # issue #13292
+        greek = u'\u0394\u03bf\u03ba\u03b9\u03bc\u03ae'
+        frame = pd.DataFrame({'foo': [1, 2, 3]})
+        table = pd.pivot_table(frame, index=['foo'], aggfunc=len, margins=True,
margins_name=greek) + index = pd.Index([1, 2, 3, greek], dtype='object', name='foo') + expected = pd.DataFrame(index=index) + tm.assert_frame_equal(table, expected) From e682902327bd883a207b291b0326f277b3dcdd12 Mon Sep 17 00:00:00 2001 From: T N Date: Tue, 12 Sep 2017 19:35:55 +0900 Subject: [PATCH 075/188] DEPR: Add warning for True for dropna of SeriesGroupBy.nth (#17493) --- doc/source/whatsnew/v0.21.0.txt | 2 ++ pandas/core/groupby.py | 21 +++++++++++++++------ pandas/tests/groupby/test_nth.py | 10 ++++++++-- 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index eccd71f45ec276..33232d2b09416c 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -341,6 +341,8 @@ Deprecations - ``pd.options.html.border`` has been deprecated in favor of ``pd.options.display.html.border`` (:issue:`15793`). +- :func:`SeriesGroupBy.nth` has deprecated ``True`` in favor of ``'all'`` for its kwarg ``dropna`` (:issue:`11038`). + .. _whatsnew_0210.prior_deprecations: Removal of prior version deprecations/changes diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 248f3b2095a785..f14ed08a27fae8 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1393,12 +1393,21 @@ def nth(self, n, dropna=None): return out.sort_index() if self.sort else out - if isinstance(self._selected_obj, DataFrame) and \ - dropna not in ['any', 'all']: - # Note: when agg-ing picker doesn't raise this, just returns NaN - raise ValueError("For a DataFrame groupby, dropna must be " - "either None, 'any' or 'all', " - "(was passed %s)." % (dropna),) + if dropna not in ['any', 'all']: + if isinstance(self._selected_obj, Series) and dropna is True: + warnings.warn("the dropna='%s' keyword is deprecated," + "use dropna='all' instead. " + "For a Series groupby, dropna must be " + "either None, 'any' or 'all'." % (dropna), + FutureWarning, + stacklevel=2) + dropna = 'all' + else: + # Note: when agg-ing picker doesn't raise this, + # just returns NaN + raise ValueError("For a DataFrame groupby, dropna must be " + "either None, 'any' or 'all', " + "(was passed %s)." % (dropna),) # old behaviour, but with all and any support for DataFrames. 
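# ---------------------------------------------------------------------
# NOTE: illustrative example of the deprecation above, not part of the
# patch. A minimal sketch assuming pandas 0.21-era behavior; the exact
# warning text may differ.
import warnings
import numpy as np
import pandas as pd
from pandas.util.testing import assert_series_equal

df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
g = df.groupby('A')

with warnings.catch_warnings(record=True) as w:
    warnings.simplefilter('always')
    result = g.B.nth(0, dropna=True)  # still works, but warns ...
assert any(issubclass(x.category, FutureWarning) for x in w)

# ... and the non-deprecated spelling gives the same answer:
assert_series_equal(result, g.B.nth(0, dropna='all'))
# ---------------------------------------------------------------------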
# modified in GH 7559 to have better perf diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 28392537be3c66..ffbede0eb208f3 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -2,7 +2,10 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Index, Series, isna from pandas.compat import lrange -from pandas.util.testing import assert_frame_equal, assert_series_equal +from pandas.util.testing import ( + assert_frame_equal, + assert_produces_warning, + assert_series_equal) from .common import MixIn @@ -171,7 +174,10 @@ def test_nth(self): # doc example df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) g = df.groupby('A') - result = g.B.nth(0, dropna=True) + # PR 17493, related to issue 11038 + # test Series.nth with True for dropna produces DeprecationWarning + with assert_produces_warning(FutureWarning): + result = g.B.nth(0, dropna=True) expected = g.B.first() assert_series_equal(result, expected) From 83436af8ae1ccad49b7ceac7471c060d823d10ab Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 12 Sep 2017 08:54:53 -0400 Subject: [PATCH 076/188] COMPAT: Iteration should always yield a python scalar (#17491) xref #10904 closes #13236 closes #13256 xref #14216 --- doc/source/whatsnew/v0.21.0.txt | 47 ++++++++++++++++ pandas/core/base.py | 25 ++++++++- pandas/core/categorical.py | 6 ++ pandas/core/indexes/base.py | 9 --- pandas/core/indexes/category.py | 4 ++ pandas/core/series.py | 13 ----- pandas/core/sparse/array.py | 12 +++- pandas/tests/frame/test_api.py | 11 ++-- pandas/tests/frame/test_convert_to.py | 13 +++++ pandas/tests/series/test_io.py | 36 +----------- pandas/tests/test_base.py | 79 +++++++++++++++++++++++++-- 11 files changed, 187 insertions(+), 68 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 33232d2b09416c..89da897f6c5292 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -188,6 +188,53 @@ the target. Now, a ``ValueError`` will be raised when such an input is passed in ... ValueError: Cannot operate inplace if there is no assignment +.. _whatsnew_0210.api_breaking.iteration_scalars: + +Iteration of Series/Index will now return python scalars +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, when using certain iteration methods for a ``Series`` with dtype ``int`` or ``float``, you would receive a ``numpy`` scalar, e.g. a ``np.int64``, rather than a python ``int``. Issue (:issue:`10904`) corrected this for ``Series.tolist()`` and ``list(Series)``. This change makes all iteration methods consistent, in particular, for ``__iter__()`` and ``.map()``; note that this only affect int/float dtypes. (:issue:`13236`, :issue:`13258`, :issue:`14216`). + +.. ipython:: python + + s = Series([1, 2, 3]) + s + +Previously: + +.. code-block:: python + + In [2]: type(list(s)[0]) + Out[2]: numpy.int64 + +New Behaviour: + +.. ipython:: python + + type(list(s)[0]) + +Furthermore this will now correctly box the results of iteration for :func:`DataFrame.to_dict` as well. + +.. ipython:: python + + d = {'a':[1], 'b':['b']} + df = DataFrame(d) + +Previously: + +.. code-block:: python + + In [8]: type(df.to_dict()['a'][0]) + Out[8]: numpy.int64 + +New Behaviour: + +.. ipython:: python + + type(df.to_dict()['a'][0]) + +.. 
_whatsnew_0210.api_breaking.dtype_conversions: + Dtype Conversions ^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/base.py b/pandas/core/base.py index d60a8515dc920f..62d89eac4b3548 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -8,7 +8,12 @@ from pandas.core.dtypes.missing import isna from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCIndexClass -from pandas.core.dtypes.common import is_object_dtype, is_list_like, is_scalar +from pandas.core.dtypes.common import ( + is_object_dtype, + is_list_like, + is_scalar, + is_datetimelike) + from pandas.util._validators import validate_bool_kwarg from pandas.core import common as com @@ -18,7 +23,8 @@ from pandas.compat import PYPY from pandas.util._decorators import (Appender, cache_readonly, deprecate_kwarg, Substitution) -from pandas.core.common import AbstractMethodError +from pandas.core.common import AbstractMethodError, _maybe_box_datetimelike + from pandas.core.accessor import DirNamesMixin _shared_docs = dict() @@ -884,6 +890,21 @@ def argmin(self, axis=None): """ return nanops.nanargmin(self.values) + def tolist(self): + """ + return a list of the values; box to scalars + """ + return list(self.__iter__()) + + def __iter__(self): + """ + provide iteration over the values; box to scalars + """ + if is_datetimelike(self): + return (_maybe_box_datetimelike(x) for x in self._values) + else: + return iter(self._values.tolist()) + @cache_readonly def hasnans(self): """ return if I have any nans; enables various perf speedups """ diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 1c2a29333001ca..dbd2a79b7e46d9 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -399,6 +399,12 @@ def itemsize(self): """ return the size of a single category """ return self.categories.itemsize + def tolist(self): + """ + return a list of my values + """ + return np.array(self).tolist() + def reshape(self, new_shape, *args, **kwargs): """ .. 
deprecated:: 0.19.0 diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ef5f68936044a8..008828cf4f309a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -585,12 +585,6 @@ def memory_usage(self, deep=False): return result # ops compat - def tolist(self): - """ - return a list of the Index values - """ - return list(self.values) - @deprecate_kwarg(old_arg_name='n', new_arg_name='repeats') def repeat(self, repeats, *args, **kwargs): """ @@ -1601,9 +1595,6 @@ def is_all_dates(self): return False return is_datetime_array(_ensure_object(self.values)) - def __iter__(self): - return iter(self.values) - def __reduce__(self): d = dict(data=self._data) d.update(self._get_attributes_dict()) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 0681202289311e..c8044b14e4e57e 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -253,6 +253,10 @@ def get_values(self): """ return the underlying data as an ndarray """ return self._data.get_values() + def __iter__(self): + """ iterate like Categorical """ + return self._data.__iter__() + @property def codes(self): return self._data.codes diff --git a/pandas/core/series.py b/pandas/core/series.py index 6905fc1aced742..ac11c5f908fdcf 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -19,7 +19,6 @@ is_integer, is_integer_dtype, is_float_dtype, is_extension_type, is_datetimetz, - is_datetimelike, is_datetime64tz_dtype, is_timedelta64_dtype, is_list_like, @@ -1095,14 +1094,6 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True, with open(buf, 'w') as f: f.write(result) - def __iter__(self): - """ provide iteration over the values of the Series - box values if necessary """ - if is_datetimelike(self): - return (_maybe_box_datetimelike(x) for x in self._values) - else: - return iter(self._values) - def iteritems(self): """ Lazily iterate over (index, value) tuples @@ -1118,10 +1109,6 @@ def keys(self): """Alias for index""" return self.index - def tolist(self): - """ Convert Series to a nested list """ - return list(self.asobject) - def to_dict(self, into=dict): """ Convert Series to {label -> value} dict or dict-like object. 
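NOTE: illustrative example, not part of the patch. The net effect of hoisting
``tolist`` and ``__iter__`` out of ``Series``/``Index`` and into the shared
base class is a single boxing path for both types. A minimal sketch, assuming
pandas 0.21 semantics:

    import pandas as pd

    s = pd.Series([1, 2, 3])
    type(list(s)[0])      # int -- was numpy.int64 before this change
    type(s.tolist()[0])   # int

    # datetime-like values box to pandas scalars instead:
    t = pd.Series(pd.date_range('2017-01-01', periods=2, tz='US/Eastern'))
    type(next(iter(t)))   # Timestamp, with the timezone preserved

    # Index gains the identical behavior through the same base class:
    type(list(pd.Index([1, 2]))[0])   # int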
diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index 2f830a98db6497..f965c91999a03d 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -407,8 +407,18 @@ def to_dense(self, fill=None): return self.values def __iter__(self): + if np.issubdtype(self.dtype, np.floating): + boxer = float + elif np.issubdtype(self.dtype, np.integer): + boxer = int + else: + boxer = lambda x: x + for i in range(len(self)): - yield self._get_val_at(i) + r = self._get_val_at(i) + + # box em + yield boxer(r) def __getitem__(self, key): """ diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index a62fcb506a34bc..b3209da6449d6a 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -9,7 +9,7 @@ import sys from distutils.version import LooseVersion -from pandas.compat import range, lrange +from pandas.compat import range, lrange, long from pandas import compat from numpy.random import randn @@ -205,15 +205,18 @@ def test_itertuples(self): 'ints': lrange(5)}, columns=['floats', 'ints']) for tup in df.itertuples(index=False): - assert isinstance(tup[1], np.integer) + assert isinstance(tup[1], (int, long)) df = self.klass(data={"a": [1, 2, 3], "b": [4, 5, 6]}) dfaa = df[['a', 'a']] assert (list(dfaa.itertuples()) == [(0, 1, 1), (1, 2, 2), (2, 3, 3)]) - assert (repr(list(df.itertuples(name=None))) == - '[(0, 1, 4), (1, 2, 5), (2, 3, 6)]') + + # repr with be int/long on windows + if not compat.is_platform_windows(): + assert (repr(list(df.itertuples(name=None))) == + '[(0, 1, 4), (1, 2, 5), (2, 3, 6)]') tup = next(df.itertuples(name='TestName')) diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index 629c695b702fe2..99e5630ce6a43c 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -5,6 +5,7 @@ import numpy as np from pandas import compat +from pandas.compat import long from pandas import (DataFrame, Series, MultiIndex, Timestamp, date_range) @@ -236,3 +237,15 @@ def test_to_records_datetimeindex_with_tz(self, tz): # both converted to UTC, so they are equal tm.assert_numpy_array_equal(result, expected) + + def test_to_dict_box_scalars(self): + # 14216 + # make sure that we are boxing properly + d = {'a': [1], 'b': ['b']} + + result = DataFrame(d).to_dict() + assert isinstance(list(result['a'])[0], (int, long)) + assert isinstance(list(result['b'])[0], (int, long)) + + result = DataFrame(d).to_dict(orient='records') + assert isinstance(result[0]['a'], (int, long)) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 503185de427f16..5b7fd1ec94a90b 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -10,7 +10,7 @@ from pandas import Series, DataFrame -from pandas.compat import StringIO, u, long +from pandas.compat import StringIO, u from pandas.util.testing import (assert_series_equal, assert_almost_equal, assert_frame_equal, ensure_clean) import pandas.util.testing as tm @@ -178,37 +178,3 @@ def test_to_dict(self, mapping): from_method = Series(ts.to_dict(collections.Counter)) from_constructor = Series(collections.Counter(ts.iteritems())) tm.assert_series_equal(from_method, from_constructor) - - -class TestSeriesToList(TestData): - - def test_tolist(self): - rs = self.ts.tolist() - xp = self.ts.values.tolist() - assert_almost_equal(rs, xp) - - # datetime64 - s = Series(self.ts.index) - rs = s.tolist() - assert self.ts.index[0] == rs[0] - - def test_tolist_np_int(self): - # 
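# ---------------------------------------------------------------------
# NOTE: illustrative example, not part of the patch. The SparseArray
# change above picks a boxing function from the dtype, so iteration no
# longer leaks numpy scalars. A minimal sketch assuming pandas 0.21:
import numpy as np
import pandas as pd

arr = pd.SparseArray(np.array([1.0, np.nan, 2.0]))
[type(x) for x in arr]   # [float, float, float], not numpy.float64
# integer dtypes box to int the same way; any other dtype passes through
# the identity lambda unchanged
# ---------------------------------------------------------------------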
GH10904 - for t in ['int8', 'int16', 'int32', 'int64']: - s = pd.Series([1], dtype=t) - assert isinstance(s.tolist()[0], (int, long)) - - def test_tolist_np_uint(self): - # GH10904 - for t in ['uint8', 'uint16']: - s = pd.Series([1], dtype=t) - assert isinstance(s.tolist()[0], int) - for t in ['uint32', 'uint64']: - s = pd.Series([1], dtype=t) - assert isinstance(s.tolist()[0], long) - - def test_tolist_np_float(self): - # GH10904 - for t in ['float16', 'float32', 'float64']: - s = pd.Series([1], dtype=t) - assert isinstance(s.tolist()[0], float) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 9e92c7cf1a9b81..210d0260b8d95b 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -13,9 +13,10 @@ is_object_dtype, is_datetimetz, needs_i8_conversion) import pandas.util.testing as tm -from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex, PeriodIndex, - Timedelta, IntervalIndex, Interval) -from pandas.compat import StringIO, PYPY +from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex, + PeriodIndex, Timedelta, IntervalIndex, Interval, + CategoricalIndex, Timestamp) +from pandas.compat import StringIO, PYPY, long from pandas.compat.numpy import np_array_datetime64_compat from pandas.core.base import PandasDelegate, NoNewAttributesMixin from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin @@ -433,7 +434,7 @@ def test_value_counts_unique_nunique(self): # datetimetz Series returns array of Timestamp assert result[0] == orig[0] for r in result: - assert isinstance(r, pd.Timestamp) + assert isinstance(r, Timestamp) tm.assert_numpy_array_equal(result, orig._values.asobject.values) else: @@ -1031,3 +1032,73 @@ def f(): pytest.raises(AttributeError, f) assert not hasattr(t, "b") + + +class TestToIterable(object): + # test that we convert an iterable to python types + + dtypes = [ + ('int8', (int, long)), + ('int16', (int, long)), + ('int32', (int, long)), + ('int64', (int, long)), + ('uint8', (int, long)), + ('uint16', (int, long)), + ('uint32', (int, long)), + ('uint64', (int, long)), + ('float16', float), + ('float32', float), + ('float64', float), + ('datetime64[ns]', Timestamp), + ('datetime64[ns, US/Eastern]', Timestamp), + ('timedelta64[ns]', Timedelta)] + + @pytest.mark.parametrize( + 'dtype, rdtype', + dtypes + [ + ('object', object), + ('category', object)]) + @pytest.mark.parametrize( + 'method', + [ + lambda x: x.tolist(), + lambda x: list(x), + lambda x: list(x.__iter__()), + ], ids=['tolist', 'list', 'iter']) + @pytest.mark.parametrize('typ', [Series, Index]) + def test_iterable(self, typ, method, dtype, rdtype): + # gh-10904 + # gh-13258 + # coerce iteration to underlying python / pandas types + s = typ([1], dtype=dtype) + result = method(s)[0] + assert isinstance(result, rdtype) + + @pytest.mark.parametrize( + 'dtype, rdtype', + dtypes + [ + ('object', (int, long)), + ('category', (int, long))]) + @pytest.mark.parametrize('typ', [Series, Index]) + def test_iterable_map(self, typ, dtype, rdtype): + # gh-13236 + # coerce iteration to underlying python / pandas types + s = typ([1], dtype=dtype) + result = s.map(type)[0] + if not isinstance(rdtype, tuple): + rdtype = tuple([rdtype]) + assert result in rdtype + + @pytest.mark.parametrize( + 'method', + [ + lambda x: x.tolist(), + lambda x: list(x), + lambda x: list(x.__iter__()), + ], ids=['tolist', 'list', 'iter']) + def test_categorial_datetimelike(self, method): + i = CategoricalIndex([Timestamp('1999-12-31'), + Timestamp('2000-12-31')]) + + result = method(i)[0] 
+ assert isinstance(result, Timestamp) From 633be31adcd43fc8bfe9a9fd9e7621ff3fc8ccbd Mon Sep 17 00:00:00 2001 From: Giftlin <31629119+Giftlin@users.noreply.github.com> Date: Wed, 13 Sep 2017 15:33:30 +0530 Subject: [PATCH 077/188] DOC: grammatical mistake (#17511) --- pandas/plotting/_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/plotting/_tools.py b/pandas/plotting/_tools.py index 389e238ccb96ec..6deddc97915f1f 100644 --- a/pandas/plotting/_tools.py +++ b/pandas/plotting/_tools.py @@ -141,7 +141,7 @@ def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, array of Axis objects are returned as numpy 1-d arrays. - for NxM subplots with N>1 and M>1 are returned as a 2d array. - If False, no squeezing at all is done: the returned axis object is always + If False, no squeezing is done: the returned axis object is always a 2-d array containing Axis instances, even if it ends up being 1x1. subplot_kw : dict From f6d4d7078d49503adf990f0c159eb603ca1f0c1a Mon Sep 17 00:00:00 2001 From: topper-123 Date: Wed, 13 Sep 2017 11:04:32 +0100 Subject: [PATCH 078/188] removed versionadded <0.17 (#17504) --- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/core/frame.py | 6 ------ pandas/core/generic.py | 6 ------ pandas/core/indexes/category.py | 2 -- pandas/core/indexes/datetimes.py | 2 +- pandas/core/reshape/reshape.py | 2 -- pandas/core/sparse/series.py | 4 ---- pandas/core/strings.py | 5 ----- 8 files changed, 2 insertions(+), 27 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 89da897f6c5292..6ffa903c741500 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -526,4 +526,4 @@ Other ^^^^^ - Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`) - Several ``NaT`` method docstrings (e.g. :func:`NaT.ctime`) were incorrect (:issue:`17327`) -- The documentation has had references to versions < v0.16 removed and cleaned up (:issue:`17442`, :issue:`17442` & :issue:`#17404`) +- The documentation has had references to versions < v0.17 removed and cleaned up (:issue:`17442`, :issue:`17442`, :issue:`17404` & :issue:`17504`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5991ec825c8417..dd5d490ea66a8f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1479,8 +1479,6 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, Character recognized as decimal separator. E.g. use ',' for European data - .. versionadded:: 0.16.0 - """ formatter = fmt.CSVFormatter(self, path_or_buf, line_terminator=line_terminator, sep=sep, @@ -2165,8 +2163,6 @@ def _getitem_frame(self, key): def query(self, expr, inplace=False, **kwargs): """Query the columns of a frame with a boolean expression. - .. versionadded:: 0.13 - Parameters ---------- expr : string @@ -2561,8 +2557,6 @@ def assign(self, **kwargs): Assign new columns to a DataFrame, returning a new object (a copy) with all the original columns in addition to the new ones. - .. versionadded:: 0.16.0 - Parameters ---------- kwargs : keyword, value pairs diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8d16b079ba2c8d..a71bf7be1bc753 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2348,8 +2348,6 @@ def drop(self, labels, axis=0, level=None, inplace=False, errors='raise'): errors : {'ignore', 'raise'}, default 'raise' If 'ignore', suppress error and existing labels are dropped. - .. 
versionadded:: 0.16.1 - Returns ------- dropped : type of caller @@ -3070,8 +3068,6 @@ def sample(self, n=None, frac=None, replace=False, weights=None, """ Returns a random sample of items from an axis of object. - .. versionadded:: 0.16.1 - Parameters ---------- n : int, optional @@ -3228,8 +3224,6 @@ def sample(self, n=None, frac=None, replace=False, weights=None, _shared_docs['pipe'] = (""" Apply func(self, \*args, \*\*kwargs) - .. versionadded:: 0.16.2 - Parameters ---------- func : function diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index c8044b14e4e57e..baa3ebce6abbcc 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -33,8 +33,6 @@ class CategoricalIndex(Index, base.PandasDelegate): Immutable Index implementing an ordered, sliceable set. CategoricalIndex represents a sparsely populated Index with an underlying Categorical. - .. versionadded:: 0.16.1 - Parameters ---------- data : array-like or Categorical, (1-dimensional) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 5a04c550f4502b..4cfb7547e7d0ac 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1577,7 +1577,7 @@ def _set_freq(self, value): days_in_month = _field_accessor( 'days_in_month', 'dim', - "The number of days in the month\n\n.. versionadded:: 0.16.0") + "The number of days in the month") daysinmonth = days_in_month is_month_start = _field_accessor( 'is_month_start', diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index b4abba8026b35b..7260bc9a8b7a14 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1110,8 +1110,6 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, Whether the dummy columns should be sparse or not. Returns SparseDataFrame if `data` is a Series or if all columns are included. Otherwise returns a DataFrame with some SparseBlocks. - - .. versionadded:: 0.16.1 drop_first : bool, default False Whether to get k-1 dummies out of k categorical levels by removing the first level. diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 99aec2dd115697..2aecb9d7c4ffbd 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -732,8 +732,6 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False): (labels) or numbers of the levels. {row_levels, column_levels} must be a partition of the MultiIndex level names (or numbers). - .. versionadded:: 0.16.0 - Parameters ---------- row_levels : tuple/list @@ -784,8 +782,6 @@ def from_coo(cls, A, dense_index=False): """ Create a SparseSeries from a scipy.sparse.coo_matrix. - .. versionadded:: 0.16.0 - Parameters ---------- A : scipy.sparse.coo_matrix diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 48bc2ee05dd680..021f88d1aec002 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -602,8 +602,6 @@ def str_extract(arr, pat, flags=0, expand=None): For each subject string in the Series, extract groups from the first match of regular expression pat. - .. versionadded:: 0.13.0 - Parameters ---------- pat : string @@ -1016,7 +1014,6 @@ def str_split(arr, pat=None, n=None): * If True, return DataFrame/MultiIndex expanding dimensionality. * If False, return Series/Index. - .. 
versionadded:: 0.16.1 return_type : deprecated, use `expand` Returns @@ -1047,8 +1044,6 @@ def str_rsplit(arr, pat=None, n=None): string, starting at the end of the string and working to the front. Equivalent to :meth:`str.rsplit`. - .. versionadded:: 0.16.2 - Parameters ---------- pat : string, default None From f11bbf2f505d81900cc83ce387a6a1b1d2a2f866 Mon Sep 17 00:00:00 2001 From: Giftlin <31629119+Giftlin@users.noreply.github.com> Date: Wed, 13 Sep 2017 17:54:57 +0530 Subject: [PATCH 079/188] DOC: grammatical mistakes (#17512) --- pandas/io/stata.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 253ed03c25db94..92f180506a8b71 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -57,7 +57,7 @@ identifier of column that should be used as index of the DataFrame convert_missing : boolean, defaults to False Flag indicating whether to convert missing values to their Stata - representations. If False, missing values are replaced with nans. + representations. If False, missing values are replaced with nan. If True, columns containing missing values are returned with object data types and missing values are represented by StataMissingValue objects. @@ -248,8 +248,9 @@ def _stata_elapsed_date_to_datetime_vec(dates, fmt): def convert_year_month_safe(year, month): """ Convert year and month to datetimes, using pandas vectorized versions - when the date range falls within the range supported by pandas. Other - wise it falls back to a slower but more robust method using datetime. + when the date range falls within the range supported by pandas. + Otherwise it falls back to a slower but more robust method + using datetime. """ if year.max() < MAX_YEAR and year.min() > MIN_YEAR: return to_datetime(100 * year + month, format='%Y%m') From eef810ef2c64be00943696b33e8bab0b4dd66e9e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 13 Sep 2017 19:18:56 -0400 Subject: [PATCH 080/188] COMPAT: followup to #17491 (#17503) --- doc/source/whatsnew/v0.21.0.txt | 14 ++--- pandas/core/base.py | 27 ++++++--- pandas/core/categorical.py | 10 +++- pandas/core/indexes/category.py | 5 +- pandas/tests/indexes/test_category.py | 13 +++-- pandas/tests/series/test_api.py | 37 ------------- pandas/tests/test_base.py | 79 +++++++++++++++++++++++++-- 7 files changed, 119 insertions(+), 66 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 6ffa903c741500..9da1f321ef5740 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -190,19 +190,19 @@ the target. Now, a ``ValueError`` will be raised when such an input is passed in .. _whatsnew_0210.api_breaking.iteration_scalars: -Iteration of Series/Index will now return python scalars +Iteration of Series/Index will now return Python scalars ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Previously, when using certain iteration methods for a ``Series`` with dtype ``int`` or ``float``, you would receive a ``numpy`` scalar, e.g. a ``np.int64``, rather than a python ``int``. Issue (:issue:`10904`) corrected this for ``Series.tolist()`` and ``list(Series)``. This change makes all iteration methods consistent, in particular, for ``__iter__()`` and ``.map()``; note that this only affect int/float dtypes. (:issue:`13236`, :issue:`13258`, :issue:`14216`). +Previously, when using certain iteration methods for a ``Series`` with dtype ``int`` or ``float``, you would receive a ``numpy`` scalar, e.g. 
a ``np.int64``, rather than a Python ``int``. Issue (:issue:`10904`) corrected this for ``Series.tolist()`` and ``list(Series)``. This change makes all iteration methods consistent, in particular, for ``__iter__()`` and ``.map()``; note that this only affects int/float dtypes. (:issue:`13236`, :issue:`13258`, :issue:`14216`). .. ipython:: python - s = Series([1, 2, 3]) + s = pd.Series([1, 2, 3]) s Previously: -.. code-block:: python +.. code-block:: ipython In [2]: type(list(s)[0]) Out[2]: numpy.int64 @@ -215,14 +215,14 @@ New Behaviour: Furthermore this will now correctly box the results of iteration for :func:`DataFrame.to_dict` as well. -.. ipython:: python +.. ipython:: ipython d = {'a':[1], 'b':['b']} - df = DataFrame(d) + df = pd,DataFrame(d) Previously: -.. code-block:: python +.. code-block:: ipython In [8]: type(df.to_dict()['a'][0]) Out[8]: numpy.int64 diff --git a/pandas/core/base.py b/pandas/core/base.py index 62d89eac4b3548..f0e8d8a16661bb 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -892,18 +892,31 @@ def argmin(self, axis=None): def tolist(self): """ - return a list of the values; box to scalars + Return a list of the values. + + These are each a scalar type, which is a Python scalar + (for str, int, float) or a pandas scalar + (for Timestamp/Timedelta/Interval/Period) + + See Also + -------- + numpy.tolist """ - return list(self.__iter__()) + + if is_datetimelike(self): + return [_maybe_box_datetimelike(x) for x in self._values] + else: + return self._values.tolist() def __iter__(self): """ - provide iteration over the values; box to scalars + Return an iterator of the values. + + These are each a scalar type, which is a Python scalar + (for str, int, float) or a pandas scalar + (for Timestamp/Timedelta/Interval/Period) """ - if is_datetimelike(self): - return (_maybe_box_datetimelike(x) for x in self._values) - else: - return iter(self._values.tolist()) + return iter(self.tolist()) @cache_readonly def hasnans(self): diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index dbd2a79b7e46d9..97df72900428c4 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -26,7 +26,7 @@ is_integer_dtype, is_bool, is_list_like, is_sequence, is_scalar) -from pandas.core.common import is_null_slice +from pandas.core.common import is_null_slice, _maybe_box_datetimelike from pandas.core.algorithms import factorize, take_1d, unique1d from pandas.core.base import (PandasObject, PandasDelegate, @@ -401,8 +401,14 @@ def itemsize(self): def tolist(self): """ - return a list of my values + Return a list of the values. 
+ + These are each a scalar type, which is a Python scalar + (for str, int, float) or a pandas scalar + (for Timestamp/Timedelta/Interval/Period) """ + if is_datetimelike(self.categories): + return [_maybe_box_datetimelike(x) for x in self] return np.array(self).tolist() def reshape(self, new_shape, *args, **kwargs): diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index baa3ebce6abbcc..71cd4790ac3648 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -251,9 +251,8 @@ def get_values(self): """ return the underlying data as an ndarray """ return self._data.get_values() - def __iter__(self): - """ iterate like Categorical """ - return self._data.__iter__() + def tolist(self): + return self._data.tolist() @property def codes(self): diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 05d31af57b36c5..aac68ebd6abede 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -576,12 +576,13 @@ def test_isin(self): ci.isin(['c', 'a', 'b', np.nan]), np.array([True] * 6)) # mismatched categorical -> coerced to ndarray so doesn't matter - tm.assert_numpy_array_equal( - ci.isin(ci.set_categories(list('abcdefghi'))), np.array([True] * - 6)) - tm.assert_numpy_array_equal( - ci.isin(ci.set_categories(list('defghi'))), - np.array([False] * 5 + [True])) + result = ci.isin(ci.set_categories(list('abcdefghi'))) + expected = np.array([True] * 6) + tm.assert_numpy_array_equal(result, expected) + + result = ci.isin(ci.set_categories(list('defghi'))) + expected = np.array([False] * 5 + [True]) + tm.assert_numpy_array_equal(result, expected) def test_identical(self): diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index b7fbe803f8d3b9..d0805e2bb54d25 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -245,43 +245,6 @@ def test_iter(self): for i, val in enumerate(self.ts): assert val == self.ts[i] - def test_iter_box(self): - vals = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')] - s = pd.Series(vals) - assert s.dtype == 'datetime64[ns]' - for res, exp in zip(s, vals): - assert isinstance(res, pd.Timestamp) - assert res.tz is None - assert res == exp - - vals = [pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.Timestamp('2011-01-02', tz='US/Eastern')] - s = pd.Series(vals) - - assert s.dtype == 'datetime64[ns, US/Eastern]' - for res, exp in zip(s, vals): - assert isinstance(res, pd.Timestamp) - assert res.tz == exp.tz - assert res == exp - - # timedelta - vals = [pd.Timedelta('1 days'), pd.Timedelta('2 days')] - s = pd.Series(vals) - assert s.dtype == 'timedelta64[ns]' - for res, exp in zip(s, vals): - assert isinstance(res, pd.Timedelta) - assert res == exp - - # period (object dtype, not boxed) - vals = [pd.Period('2011-01-01', freq='M'), - pd.Period('2011-01-02', freq='M')] - s = pd.Series(vals) - assert s.dtype == 'object' - for res, exp in zip(s, vals): - assert isinstance(res, pd.Period) - assert res.freq == 'M' - assert res == exp - def test_keys(self): # HACK: By doing this in two stages, we avoid 2to3 wrapping the call # to .keys() in a list() diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 210d0260b8d95b..38d78b12b31aa5 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1054,10 +1054,7 @@ class TestToIterable(object): ('timedelta64[ns]', Timedelta)] @pytest.mark.parametrize( - 'dtype, rdtype', - dtypes + [ - ('object', object), - 
('category', object)]) + 'dtype, rdtype', dtypes) @pytest.mark.parametrize( 'method', [ @@ -1074,6 +1071,43 @@ def test_iterable(self, typ, method, dtype, rdtype): result = method(s)[0] assert isinstance(result, rdtype) + @pytest.mark.parametrize( + 'dtype, rdtype, obj', + [ + ('object', object, 'a'), + ('object', (int, long), 1), + ('category', object, 'a'), + ('category', (int, long), 1)]) + @pytest.mark.parametrize( + 'method', + [ + lambda x: x.tolist(), + lambda x: list(x), + lambda x: list(x.__iter__()), + ], ids=['tolist', 'list', 'iter']) + @pytest.mark.parametrize('typ', [Series, Index]) + def test_iterable_object_and_category(self, typ, method, + dtype, rdtype, obj): + # gh-10904 + # gh-13258 + # coerce iteration to underlying python / pandas types + s = typ([obj], dtype=dtype) + result = method(s)[0] + assert isinstance(result, rdtype) + + @pytest.mark.parametrize( + 'dtype, rdtype', dtypes) + def test_iterable_items(self, dtype, rdtype): + # gh-13258 + # test items / iteritems yields the correct boxed scalars + # this only applies to series + s = Series([1], dtype=dtype) + _, result = list(s.items())[0] + assert isinstance(result, rdtype) + + _, result = list(s.iteritems())[0] + assert isinstance(result, rdtype) + @pytest.mark.parametrize( 'dtype, rdtype', dtypes + [ @@ -1102,3 +1136,40 @@ def test_categorial_datetimelike(self, method): result = method(i)[0] assert isinstance(result, Timestamp) + + def test_iter_box(self): + vals = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')] + s = pd.Series(vals) + assert s.dtype == 'datetime64[ns]' + for res, exp in zip(s, vals): + assert isinstance(res, pd.Timestamp) + assert res.tz is None + assert res == exp + + vals = [pd.Timestamp('2011-01-01', tz='US/Eastern'), + pd.Timestamp('2011-01-02', tz='US/Eastern')] + s = pd.Series(vals) + + assert s.dtype == 'datetime64[ns, US/Eastern]' + for res, exp in zip(s, vals): + assert isinstance(res, pd.Timestamp) + assert res.tz == exp.tz + assert res == exp + + # timedelta + vals = [pd.Timedelta('1 days'), pd.Timedelta('2 days')] + s = pd.Series(vals) + assert s.dtype == 'timedelta64[ns]' + for res, exp in zip(s, vals): + assert isinstance(res, pd.Timedelta) + assert res == exp + + # period (object dtype, not boxed) + vals = [pd.Period('2011-01-01', freq='M'), + pd.Period('2011-01-02', freq='M')] + s = pd.Series(vals) + assert s.dtype == 'object' + for res, exp in zip(s, vals): + assert isinstance(res, pd.Period) + assert res.freq == 'M' + assert res == exp From fa557f7391589f351b1260f46b3b3db22492f50b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 13 Sep 2017 16:20:53 -0700 Subject: [PATCH 081/188] De-privatize timezone funcs (#17502) --- pandas/_libs/index.pyx | 6 +- pandas/_libs/period.pyx | 14 ++--- pandas/_libs/src/inference.pyx | 6 +- pandas/_libs/tslib.pyx | 96 +++++++++++++++---------------- pandas/_libs/tslibs/timezones.pxd | 12 ++-- pandas/_libs/tslibs/timezones.pyx | 20 +++---- 6 files changed, 75 insertions(+), 79 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index bf4d53683c9b71..884117799ec5be 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -17,7 +17,7 @@ cimport tslib from hashtable cimport HashTable -from tslibs.timezones cimport _is_utc +from tslibs.timezones cimport is_utc, get_utcoffset from pandas._libs import tslib, algos, hashtable as _hash from pandas._libs.tslib import Timestamp, Timedelta from datetime import datetime, timedelta @@ -551,8 +551,8 @@ cdef inline _to_i8(object val): tzinfo = getattr(val, 
'tzinfo', None) # Save the original date value so we can get the utcoffset from it. ival = _pydatetime_to_dts(val, &dts) - if tzinfo is not None and not _is_utc(tzinfo): - offset = tslib._get_utcoffset(tzinfo, val) + if tzinfo is not None and not is_utc(tzinfo): + offset = get_utcoffset(tzinfo, val) ival -= tslib._delta_to_nanoseconds(offset) return ival return val diff --git a/pandas/_libs/period.pyx b/pandas/_libs/period.pyx index 2b0734f5cf2e7a..9e473a7f362b44 100644 --- a/pandas/_libs/period.pyx +++ b/pandas/_libs/period.pyx @@ -34,7 +34,7 @@ from lib cimport is_null_datetimelike, is_period from pandas._libs import tslib, lib from pandas._libs.tslib import (Timedelta, Timestamp, iNaT, NaT) -from tslibs.timezones cimport _is_utc, _is_tzlocal, _get_utcoffset +from tslibs.timezones cimport is_utc, is_tzlocal, get_utcoffset from tslib cimport ( maybe_get_tz, _get_dst_info, @@ -533,7 +533,7 @@ cdef _reso_local(ndarray[int64_t] stamps, object tz): ndarray[int64_t] trans, deltas, pos pandas_datetimestruct dts - if _is_utc(tz): + if is_utc(tz): for i in range(n): if stamps[i] == NPY_NAT: continue @@ -541,7 +541,7 @@ cdef _reso_local(ndarray[int64_t] stamps, object tz): curr_reso = _reso_stamp(&dts) if curr_reso < reso: reso = curr_reso - elif _is_tzlocal(tz): + elif is_tzlocal(tz): for i in range(n): if stamps[i] == NPY_NAT: continue @@ -549,7 +549,7 @@ cdef _reso_local(ndarray[int64_t] stamps, object tz): &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz) - delta = int(_get_utcoffset(tz, dt).total_seconds()) * 1000000000 + delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 pandas_datetime_to_datetimestruct(stamps[i] + delta, PANDAS_FR_ns, &dts) curr_reso = _reso_stamp(&dts) @@ -597,7 +597,7 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, ndarray[int64_t] trans, deltas, pos pandas_datetimestruct dts - if _is_utc(tz): + if is_utc(tz): for i in range(n): if stamps[i] == NPY_NAT: result[i] = NPY_NAT @@ -607,7 +607,7 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, dts.hour, dts.min, dts.sec, dts.us, dts.ps, freq) - elif _is_tzlocal(tz): + elif is_tzlocal(tz): for i in range(n): if stamps[i] == NPY_NAT: result[i] = NPY_NAT @@ -616,7 +616,7 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz) - delta = int(_get_utcoffset(tz, dt).total_seconds()) * 1000000000 + delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 pandas_datetime_to_datetimestruct(stamps[i] + delta, PANDAS_FR_ns, &dts) result[i] = get_period_ordinal(dts.year, dts.month, dts.day, diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 95145ff49b02fd..2bb362eab40975 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -3,7 +3,7 @@ from decimal import Decimal cimport util cimport cython from tslib import NaT -from tslibs.timezones cimport _get_zone +from tslibs.timezones cimport get_timezone from datetime import datetime, timedelta iNaT = util.get_nat() @@ -901,13 +901,13 @@ cpdef bint is_datetime_with_singletz_array(ndarray[object] values): for i in range(n): base_val = values[i] if base_val is not NaT: - base_tz = _get_zone(getattr(base_val, 'tzinfo', None)) + base_tz = get_timezone(getattr(base_val, 'tzinfo', None)) for j in range(i, n): val = values[j] if val is not NaT: tz = getattr(val, 'tzinfo', None) - if base_tz != tz and base_tz != 
_get_zone(tz): + if base_tz != tz and base_tz != get_timezone(tz): return False break diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index a8ae0fcd733d6c..629325c28ea9c6 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -108,11 +108,11 @@ iNaT = NPY_NAT from tslibs.timezones cimport ( - _is_utc, _is_tzlocal, - _treat_tz_as_dateutil, _treat_tz_as_pytz, - _get_zone, - _get_utcoffset) -from tslibs.timezones import get_timezone, _get_utcoffset # noqa + is_utc, is_tzlocal, + treat_tz_as_dateutil, treat_tz_as_pytz, + get_timezone, + get_utcoffset) +from tslibs.timezones import get_timezone, get_utcoffset # noqa cdef inline object create_timestamp_from_ts( @@ -160,7 +160,7 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box=False): func_create = create_datetime_from_ts if tz is not None: - if _is_utc(tz): + if is_utc(tz): for i in range(n): value = arr[i] if value == NPY_NAT: @@ -169,7 +169,7 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box=False): pandas_datetime_to_datetimestruct( value, PANDAS_FR_ns, &dts) result[i] = func_create(value, dts, tz, freq) - elif _is_tzlocal(tz) or _is_fixed_offset(tz): + elif is_tzlocal(tz) or _is_fixed_offset(tz): for i in range(n): value = arr[i] if value == NPY_NAT: @@ -194,7 +194,7 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box=False): # Adjust datetime64 timestamp, recompute datetimestruct pos = trans.searchsorted(value, side='right') - 1 - if _treat_tz_as_pytz(tz): + if treat_tz_as_pytz(tz): # find right representation of dst etc in pytz timezone new_tz = tz._tzinfos[tz._transition_info[pos]] else: @@ -242,12 +242,12 @@ def ints_to_pytimedelta(ndarray[int64_t] arr, box=False): cdef inline bint _is_fixed_offset(object tz): - if _treat_tz_as_dateutil(tz): + if treat_tz_as_dateutil(tz): if len(tz._trans_idx) == 0 and len(tz._trans_list) == 0: return 1 else: return 0 - elif _treat_tz_as_pytz(tz): + elif treat_tz_as_pytz(tz): if (len(tz._transition_info) == 0 and len(tz._utc_transition_times) == 0): return 1 @@ -1107,12 +1107,12 @@ cdef class _Timestamp(datetime): try: stamp += self.strftime('%z') if self.tzinfo: - zone = _get_zone(self.tzinfo) + zone = get_timezone(self.tzinfo) except ValueError: year2000 = self.replace(year=2000) stamp += year2000.strftime('%z') if self.tzinfo: - zone = _get_zone(self.tzinfo) + zone = get_timezone(self.tzinfo) try: stamp += zone.strftime(' %%Z') @@ -1272,7 +1272,7 @@ cdef class _Timestamp(datetime): cdef: int64_t val val = self.value - if self.tz is not None and not _is_utc(self.tz): + if self.tz is not None and not is_utc(self.tz): val = tz_convert_single(self.value, 'UTC', self.tz) return val @@ -1510,14 +1510,14 @@ cdef convert_to_tsobject(object ts, object tz, object unit, except: pass obj.value = _pydatetime_to_dts(ts, &obj.dts) - ts_offset = _get_utcoffset(ts.tzinfo, ts) + ts_offset = get_utcoffset(ts.tzinfo, ts) obj.value -= _delta_to_nanoseconds(ts_offset) - tz_offset = _get_utcoffset(tz, ts) + tz_offset = get_utcoffset(tz, ts) obj.value += _delta_to_nanoseconds(tz_offset) pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts) obj.tzinfo = tz - elif not _is_utc(tz): + elif not is_utc(tz): ts = _localize_pydatetime(ts, tz) obj.value = _pydatetime_to_dts(ts, &obj.dts) obj.tzinfo = ts.tzinfo @@ -1529,8 +1529,8 @@ cdef convert_to_tsobject(object ts, object tz, object unit, obj.value = _pydatetime_to_dts(ts, &obj.dts) obj.tzinfo = ts.tzinfo - if obj.tzinfo is not None and not _is_utc(obj.tzinfo): - offset = 
_get_utcoffset(obj.tzinfo, ts) + if obj.tzinfo is not None and not is_utc(obj.tzinfo): + offset = get_utcoffset(obj.tzinfo, ts) obj.value -= _delta_to_nanoseconds(offset) if is_timestamp(ts): @@ -1641,13 +1641,13 @@ cdef inline void _localize_tso(_TSObject obj, object tz): """ Take a TSObject in UTC and localizes to timezone tz. """ - if _is_utc(tz): + if is_utc(tz): obj.tzinfo = tz - elif _is_tzlocal(tz): + elif is_tzlocal(tz): pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts) dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day, obj.dts.hour, obj.dts.min, obj.dts.sec, obj.dts.us, tz) - delta = int(_get_utcoffset(tz, dt).total_seconds()) * 1000000000 + delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 if obj.value != NPY_NAT: pandas_datetime_to_datetimestruct(obj.value + delta, PANDAS_FR_ns, &obj.dts) @@ -1671,7 +1671,7 @@ cdef inline void _localize_tso(_TSObject obj, object tz): pandas_datetime_to_datetimestruct( obj.value, PANDAS_FR_ns, &obj.dts) obj.tzinfo = tz - elif _treat_tz_as_pytz(tz): + elif treat_tz_as_pytz(tz): inf = tz._transition_info[pos] if obj.value != NPY_NAT: pandas_datetime_to_datetimestruct(obj.value + deltas[pos], @@ -1680,7 +1680,7 @@ cdef inline void _localize_tso(_TSObject obj, object tz): pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts) obj.tzinfo = tz._tzinfos[inf] - elif _treat_tz_as_dateutil(tz): + elif treat_tz_as_dateutil(tz): if obj.value != NPY_NAT: pandas_datetime_to_datetimestruct(obj.value + deltas[pos], PANDAS_FR_ns, &obj.dts) @@ -1770,10 +1770,10 @@ def datetime_to_datetime64(ndarray[object] values): elif PyDateTime_Check(val): if val.tzinfo is not None: if inferred_tz is not None: - if _get_zone(val.tzinfo) != inferred_tz: + if get_timezone(val.tzinfo) != inferred_tz: raise ValueError('Array must be all same time zone') else: - inferred_tz = _get_zone(val.tzinfo) + inferred_tz = get_timezone(val.tzinfo) _ts = convert_to_tsobject(val, None, None, 0, 0) iresult[i] = _ts.value @@ -4088,9 +4088,9 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): return np.array([], dtype=np.int64) # Convert to UTC - if _get_zone(tz1) != 'UTC': + if get_timezone(tz1) != 'UTC': utc_dates = np.empty(n, dtype=np.int64) - if _is_tzlocal(tz1): + if is_tzlocal(tz1): for i in range(n): v = vals[i] if v == NPY_NAT: @@ -4099,7 +4099,7 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): pandas_datetime_to_datetimestruct(v, PANDAS_FR_ns, &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz1) - delta = (int(_get_utcoffset(tz1, dt).total_seconds()) + delta = (int(get_utcoffset(tz1, dt).total_seconds()) * 1000000000) utc_dates[i] = v - delta else: @@ -4126,11 +4126,11 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): else: utc_dates = vals - if _get_zone(tz2) == 'UTC': + if get_timezone(tz2) == 'UTC': return utc_dates result = np.zeros(n, dtype=np.int64) - if _is_tzlocal(tz2): + if is_tzlocal(tz2): for i in range(n): v = utc_dates[i] if v == NPY_NAT: @@ -4139,7 +4139,7 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): pandas_datetime_to_datetimestruct(v, PANDAS_FR_ns, &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz2) - delta = (int(_get_utcoffset(tz2, dt).total_seconds()) + delta = (int(get_utcoffset(tz2, dt).total_seconds()) * 1000000000) result[i] = v + delta return result @@ -4202,13 +4202,13 @@ def tz_convert_single(int64_t val, object tz1, object tz2): return val # Convert to UTC - if 
_is_tzlocal(tz1): + if is_tzlocal(tz1): pandas_datetime_to_datetimestruct(val, PANDAS_FR_ns, &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz1) - delta = int(_get_utcoffset(tz1, dt).total_seconds()) * 1000000000 + delta = int(get_utcoffset(tz1, dt).total_seconds()) * 1000000000 utc_date = val - delta - elif _get_zone(tz1) != 'UTC': + elif get_timezone(tz1) != 'UTC': trans, deltas, typ = _get_dst_info(tz1) pos = trans.searchsorted(val, side='right') - 1 if pos < 0: @@ -4218,13 +4218,13 @@ def tz_convert_single(int64_t val, object tz1, object tz2): else: utc_date = val - if _get_zone(tz2) == 'UTC': + if get_timezone(tz2) == 'UTC': return utc_date - if _is_tzlocal(tz2): + if is_tzlocal(tz2): pandas_datetime_to_datetimestruct(val, PANDAS_FR_ns, &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz2) - delta = int(_get_utcoffset(tz2, dt).total_seconds()) * 1000000000 + delta = int(get_utcoffset(tz2, dt).total_seconds()) * 1000000000 return utc_date + delta # Convert UTC to other timezone @@ -4289,13 +4289,13 @@ cdef object _get_dst_info(object tz): """ cache_key = _tz_cache_key(tz) if cache_key is None: - num = int(_get_utcoffset(tz, None).total_seconds()) * 1000000000 + num = int(get_utcoffset(tz, None).total_seconds()) * 1000000000 return (np.array([NPY_NAT + 1], dtype=np.int64), np.array([num], dtype=np.int64), None) if cache_key not in dst_cache: - if _treat_tz_as_pytz(tz): + if treat_tz_as_pytz(tz): trans = np.array(tz._utc_transition_times, dtype='M8[ns]') trans = trans.view('i8') try: @@ -4306,7 +4306,7 @@ cdef object _get_dst_info(object tz): deltas = _unbox_utcoffsets(tz._transition_info) typ = 'pytz' - elif _treat_tz_as_dateutil(tz): + elif treat_tz_as_dateutil(tz): if len(tz._trans_list): # get utc trans times trans_list = _get_utc_trans_times_from_dateutil_tz(tz) @@ -4336,7 +4336,7 @@ cdef object _get_dst_info(object tz): else: # static tzinfo trans = np.array([NPY_NAT + 1], dtype=np.int64) - num = int(_get_utcoffset(tz, None).total_seconds()) * 1000000000 + num = int(get_utcoffset(tz, None).total_seconds()) * 1000000000 deltas = np.array([num], dtype=np.int64) typ = 'static' @@ -4405,13 +4405,13 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, result = np.empty(n, dtype=np.int64) - if _is_tzlocal(tz): + if is_tzlocal(tz): for i in range(n): v = vals[i] pandas_datetime_to_datetimestruct(v, PANDAS_FR_ns, &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz) - delta = int(_get_utcoffset(tz, dt).total_seconds()) * 1000000000 + delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 result[i] = v - delta return result @@ -5116,7 +5116,7 @@ cdef _normalize_local(ndarray[int64_t] stamps, object tz): ndarray[int64_t] trans, deltas, pos pandas_datetimestruct dts - if _is_utc(tz): + if is_utc(tz): with nogil: for i in range(n): if stamps[i] == NPY_NAT: @@ -5125,7 +5125,7 @@ cdef _normalize_local(ndarray[int64_t] stamps, object tz): pandas_datetime_to_datetimestruct( stamps[i], PANDAS_FR_ns, &dts) result[i] = _normalized_stamp(&dts) - elif _is_tzlocal(tz): + elif is_tzlocal(tz): for i in range(n): if stamps[i] == NPY_NAT: result[i] = NPY_NAT @@ -5133,7 +5133,7 @@ cdef _normalize_local(ndarray[int64_t] stamps, object tz): pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz) - delta = int(_get_utcoffset(tz, dt).total_seconds()) * 1000000000 + delta = 
int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 pandas_datetime_to_datetimestruct(stamps[i] + delta, PANDAS_FR_ns, &dts) result[i] = _normalized_stamp(&dts) @@ -5180,12 +5180,12 @@ def dates_normalized(ndarray[int64_t] stamps, tz=None): Py_ssize_t i, n = len(stamps) pandas_datetimestruct dts - if tz is None or _is_utc(tz): + if tz is None or is_utc(tz): for i in range(n): pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts) if (dts.hour + dts.min + dts.sec + dts.us) > 0: return False - elif _is_tzlocal(tz): + elif is_tzlocal(tz): for i in range(n): pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, diff --git a/pandas/_libs/tslibs/timezones.pxd b/pandas/_libs/tslibs/timezones.pxd index 897bd8af7e2deb..ead5566440ca08 100644 --- a/pandas/_libs/tslibs/timezones.pxd +++ b/pandas/_libs/tslibs/timezones.pxd @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- # cython: profile=False -cdef bint _is_utc(object tz) -cdef bint _is_tzlocal(object tz) +cdef bint is_utc(object tz) +cdef bint is_tzlocal(object tz) -cdef bint _treat_tz_as_pytz(object tz) -cdef bint _treat_tz_as_dateutil(object tz) +cdef bint treat_tz_as_pytz(object tz) +cdef bint treat_tz_as_dateutil(object tz) -cdef object _get_zone(object tz) +cpdef object get_timezone(object tz) -cpdef _get_utcoffset(tzinfo, obj) +cpdef get_utcoffset(tzinfo, obj) diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 249eedef4bb098..3db369a09ba2d0 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -10,24 +10,24 @@ import pytz UTC = pytz.utc -cdef inline bint _is_utc(object tz): +cdef inline bint is_utc(object tz): return tz is UTC or isinstance(tz, _dateutil_tzutc) -cdef inline bint _is_tzlocal(object tz): +cdef inline bint is_tzlocal(object tz): return isinstance(tz, _dateutil_tzlocal) -cdef inline bint _treat_tz_as_pytz(object tz): +cdef inline bint treat_tz_as_pytz(object tz): return hasattr(tz, '_utc_transition_times') and hasattr( tz, '_transition_info') -cdef inline bint _treat_tz_as_dateutil(object tz): +cdef inline bint treat_tz_as_dateutil(object tz): return hasattr(tz, '_trans_list') and hasattr(tz, '_trans_idx') -cdef inline object _get_zone(object tz): +cpdef inline object get_timezone(object tz): """ We need to do several things here: 1) Distinguish between pytz and dateutil timezones @@ -40,10 +40,10 @@ cdef inline object _get_zone(object tz): the tz name. It needs to be a string so that we can serialize it with UJSON/pytables. maybe_get_tz (below) is the inverse of this process. """ - if _is_utc(tz): + if is_utc(tz): return 'UTC' else: - if _treat_tz_as_dateutil(tz): + if treat_tz_as_dateutil(tz): if '.tar.gz' in tz._filename: raise ValueError( 'Bad tz filename. 
Dateutil on python 3 on windows has a ' @@ -64,14 +64,10 @@ cdef inline object _get_zone(object tz): except AttributeError: return tz - -def get_timezone(tz): - return _get_zone(tz) - #---------------------------------------------------------------------- # UTC Offsets -cpdef _get_utcoffset(tzinfo, obj): +cpdef get_utcoffset(tzinfo, obj): try: return tzinfo._utcoffset except AttributeError: From 2cf2566de98201454b10b749ac628d538f9695a9 Mon Sep 17 00:00:00 2001 From: jschendel Date: Thu, 14 Sep 2017 04:11:30 -0600 Subject: [PATCH 082/188] Make *_range functions consistent (#17482) --- doc/source/api.rst | 9 + doc/source/timeseries.rst | 9 + doc/source/whatsnew/v0.21.0.txt | 55 +++- pandas/core/indexes/datetimes.py | 58 ++-- pandas/core/indexes/interval.py | 170 ++++++++--- pandas/core/indexes/period.py | 62 +++- pandas/core/indexes/timedeltas.py | 54 +++- .../indexes/datetimes/test_construction.py | 5 +- .../indexes/datetimes/test_date_range.py | 51 +++- .../tests/indexes/period/test_construction.py | 5 +- .../tests/indexes/period/test_period_range.py | 94 ++++++ pandas/tests/indexes/test_interval.py | 279 ++++++++++++++++-- .../indexes/timedeltas/test_construction.py | 5 +- .../timedeltas/test_timedelta_range.py | 21 +- 14 files changed, 747 insertions(+), 130 deletions(-) create mode 100644 pandas/tests/indexes/period/test_period_range.py diff --git a/doc/source/api.rst b/doc/source/api.rst index 27a4ab9cc6cbc4..1541bbccefe214 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -218,10 +218,19 @@ Top-level dealing with datetimelike to_timedelta date_range bdate_range + cdate_range period_range timedelta_range infer_freq +Top-level dealing with intervals +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: generated/ + + interval_range + Top-level evaluation ~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index c86c58c3183f6f..5422d5c53043d3 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -1705,6 +1705,15 @@ has multiplied span. pd.PeriodIndex(start='2014-01', freq='3M', periods=4) +If ``start`` or ``end`` are ``Period`` objects, they will be used as anchor +endpoints for a ``PeriodIndex`` with frequency matching that of the +``PeriodIndex`` constructor. + +.. ipython:: python + + pd.PeriodIndex(start=pd.Period('2017Q1', freq='Q'), + end=pd.Period('2017Q2', freq='Q'), freq='M') + Just like ``DatetimeIndex``, a ``PeriodIndex`` can also be used to index pandas objects: diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 9da1f321ef5740..939199d3f6fa6d 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -218,7 +218,7 @@ Furthermore this will now correctly box the results of iteration for :func:`Data .. ipython:: ipython d = {'a':[1], 'b':['b']} - df = pd,DataFrame(d) + df = pd.DataFrame(d) Previously: @@ -358,6 +358,59 @@ Previously, :func:`to_datetime` did not localize datetime ``Series`` data when ` Additionally, DataFrames with datetime columns that were parsed by :func:`read_sql_table` and :func:`read_sql_query` will also be localized to UTC only if the original SQL columns were timezone aware datetime columns. +.. 
_whatsnew_0210.api.consistency_of_range_functions:
+
+Consistency of Range Functions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In previous versions, there were some inconsistencies between the various range
+functions: :func:`date_range`, :func:`bdate_range`, :func:`cdate_range`,
+:func:`period_range`, :func:`timedelta_range`, and :func:`interval_range`.
+(:issue:`17471`).
+
+One of the inconsistent behaviors occurred when the ``start``, ``end`` and
+``periods`` parameters were all specified, potentially leading to ambiguous
+ranges. When all three parameters were passed, ``interval_range`` ignored the
+``periods`` parameter, ``period_range`` ignored the ``end`` parameter, and the
+other range functions raised. To promote consistency among the range functions,
+and avoid potentially ambiguous ranges, ``interval_range`` and ``period_range``
+will now raise when all three parameters are passed.
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+   In [2]: pd.interval_range(start=0, end=4, periods=6)
+   Out[2]:
+   IntervalIndex([(0, 1], (1, 2], (2, 3]]
+                 closed='right',
+                 dtype='interval[int64]')
+
+   In [3]: pd.period_range(start='2017Q1', end='2017Q4', periods=6, freq='Q')
+   Out[3]: PeriodIndex(['2017Q1', '2017Q2', '2017Q3', '2017Q4', '2018Q1', '2018Q2'], dtype='period[Q-DEC]', freq='Q-DEC')
+
+New Behavior:
+
+.. code-block:: ipython
+
+   In [2]: pd.interval_range(start=0, end=4, periods=6)
+   ---------------------------------------------------------------------------
+   ValueError: Of the three parameters: start, end, and periods, exactly two must be specified
+
+   In [3]: pd.period_range(start='2017Q1', end='2017Q4', periods=6, freq='Q')
+   ---------------------------------------------------------------------------
+   ValueError: Of the three parameters: start, end, and periods, exactly two must be specified
+
+Additionally, the endpoint parameter ``end`` was not included in the intervals
+produced by ``interval_range``. However, all other range functions include
+``end`` in their output. To promote consistency among the range functions,
+``interval_range`` will now include ``end`` as the right endpoint of the final
+interval, except if ``freq`` is specified in a way which skips ``end``.
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+   In [4]: pd.interval_range(start=0, end=4)
+   Out[4]:
+   IntervalIndex([(0, 1], (1, 2], (2, 3]]
+                 closed='right',
+                 dtype='interval[int64]')
+
+
+New Behavior:
+
+.. ipython:: python
+
+   pd.interval_range(start=0, end=4)
+
.. 
_whatsnew_0210.api: Other API Changes diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 4cfb7547e7d0ac..1c8d0b334b91c5 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -292,8 +292,8 @@ def __new__(cls, data=None, if is_float(periods): periods = int(periods) elif not is_integer(periods): - raise ValueError('Periods must be a number, got %s' % - str(periods)) + msg = 'periods must be a number, got {periods}' + raise TypeError(msg.format(periods=periods)) if data is None and freq is None: raise ValueError("Must provide freq argument if no data is " @@ -412,7 +412,8 @@ def __new__(cls, data=None, def _generate(cls, start, end, periods, name, offset, tz=None, normalize=False, ambiguous='raise', closed=None): if com._count_not_none(start, end, periods) != 2: - raise ValueError('Must specify two of start, end, or periods') + raise ValueError('Of the three parameters: start, end, and ' + 'periods, exactly two must be specified') _normalized = True @@ -2004,7 +2005,7 @@ def _generate_regular_range(start, end, periods, offset): def date_range(start=None, end=None, periods=None, freq='D', tz=None, normalize=False, name=None, closed=None, **kwargs): """ - Return a fixed frequency datetime index, with day (calendar) as the default + Return a fixed frequency DatetimeIndex, with day (calendar) as the default frequency Parameters @@ -2013,24 +2014,25 @@ def date_range(start=None, end=None, periods=None, freq='D', tz=None, Left bound for generating dates end : string or datetime-like, default None Right bound for generating dates - periods : integer or None, default None - If None, must specify start and end + periods : integer, default None + Number of periods to generate freq : string or DateOffset, default 'D' (calendar daily) Frequency strings can have multiples, e.g. '5H' - tz : string or None + tz : string, default None Time zone name for returning localized DatetimeIndex, for example Asia/Hong_Kong normalize : bool, default False Normalize start/end dates to midnight before generating date range - name : str, default None - Name of the resulting index - closed : string or None, default None + name : string, default None + Name of the resulting DatetimeIndex + closed : string, default None Make the interval closed with respect to the given frequency to the 'left', 'right', or both sides (None) Notes ----- - 2 of start, end, or periods must be specified + Of the three parameters: ``start``, ``end``, and ``periods``, exactly two + must be specified. To learn more about the frequency strings, please see `this link `__. @@ -2047,7 +2049,7 @@ def date_range(start=None, end=None, periods=None, freq='D', tz=None, def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, normalize=True, name=None, closed=None, **kwargs): """ - Return a fixed frequency datetime index, with business day as the default + Return a fixed frequency DatetimeIndex, with business day as the default frequency Parameters @@ -2056,8 +2058,8 @@ def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, Left bound for generating dates end : string or datetime-like, default None Right bound for generating dates - periods : integer or None, default None - If None, must specify start and end + periods : integer, default None + Number of periods to generate freq : string or DateOffset, default 'B' (business daily) Frequency strings can have multiples, e.g. 
'5H' tz : string or None @@ -2065,15 +2067,16 @@ def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, Asia/Beijing normalize : bool, default False Normalize start/end dates to midnight before generating date range - name : str, default None - Name for the resulting index - closed : string or None, default None + name : string, default None + Name of the resulting DatetimeIndex + closed : string, default None Make the interval closed with respect to the given frequency to the 'left', 'right', or both sides (None) Notes ----- - 2 of start, end, or periods must be specified + Of the three parameters: ``start``, ``end``, and ``periods``, exactly two + must be specified. To learn more about the frequency strings, please see `this link `__. @@ -2091,7 +2094,7 @@ def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, def cdate_range(start=None, end=None, periods=None, freq='C', tz=None, normalize=True, name=None, closed=None, **kwargs): """ - **EXPERIMENTAL** Return a fixed frequency datetime index, with + **EXPERIMENTAL** Return a fixed frequency DatetimeIndex, with CustomBusinessDay as the default frequency .. warning:: EXPERIMENTAL @@ -2105,29 +2108,30 @@ def cdate_range(start=None, end=None, periods=None, freq='C', tz=None, Left bound for generating dates end : string or datetime-like, default None Right bound for generating dates - periods : integer or None, default None - If None, must specify start and end + periods : integer, default None + Number of periods to generate freq : string or DateOffset, default 'C' (CustomBusinessDay) Frequency strings can have multiples, e.g. '5H' - tz : string or None + tz : string, default None Time zone name for returning localized DatetimeIndex, for example Asia/Beijing normalize : bool, default False Normalize start/end dates to midnight before generating date range - name : str, default None - Name for the resulting index - weekmask : str, Default 'Mon Tue Wed Thu Fri' + name : string, default None + Name of the resulting DatetimeIndex + weekmask : string, Default 'Mon Tue Wed Thu Fri' weekmask of valid business days, passed to ``numpy.busdaycalendar`` holidays : list list/array of dates to exclude from the set of valid business days, passed to ``numpy.busdaycalendar`` - closed : string or None, default None + closed : string, default None Make the interval closed with respect to the given frequency to the 'left', 'right', or both sides (None) Notes ----- - 2 of start, end, or periods must be specified + Of the three parameters: ``start``, ``end``, and ``periods``, exactly two + must be specified. To learn more about the frequency strings, please see `this link `__. 
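
The docstrings above for ``date_range``, ``bdate_range``, and ``cdate_range`` all state the same rule, enforced by the new shared error message. A minimal sketch of the resulting behavior (illustrative only, not part of the patch; assumes a pandas build that includes this change):

```python
import pandas as pd

# Exactly two of start/end/periods must be specified; freq has a default.
print(pd.date_range(start='2017-01-01', periods=3))   # start + periods: OK
print(pd.bdate_range(end='2017-01-06', periods=3))    # end + periods: OK

# Passing all three is ambiguous, so every range function now raises.
try:
    pd.date_range(start='2017-01-01', end='2017-01-10', periods=5)
except ValueError as err:
    # "Of the three parameters: start, end, and periods,
    #  exactly two must be specified"
    print(err)
```
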
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index e0ed6c7ea35c0c..6e80f6c900386d 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -15,6 +15,8 @@ is_float_dtype, is_interval_dtype, is_scalar, + is_float, + is_number, is_integer) from pandas.core.indexes.base import ( Index, _ensure_index, @@ -25,11 +27,15 @@ Interval, IntervalMixin, IntervalTree, intervals_to_interval_bounds) +from pandas.core.indexes.datetimes import date_range +from pandas.core.indexes.timedeltas import timedelta_range from pandas.core.indexes.multi import MultiIndex from pandas.compat.numpy import function as nv from pandas.core import common as com from pandas.util._decorators import cache_readonly, Appender from pandas.core.config import get_option +from pandas.tseries.frequencies import to_offset +from pandas.tseries.offsets import DateOffset import pandas.core.indexes.base as ibase _index_doc_kwargs = dict(ibase._index_doc_kwargs) @@ -1028,54 +1034,152 @@ def func(self, other): IntervalIndex._add_logical_methods_disabled() -def interval_range(start=None, end=None, freq=None, periods=None, - name=None, closed='right', **kwargs): +def _is_valid_endpoint(endpoint): + """helper for interval_range to check if start/end are valid types""" + return any([is_number(endpoint), + isinstance(endpoint, Timestamp), + isinstance(endpoint, Timedelta), + endpoint is None]) + + +def _is_type_compatible(a, b): + """helper for interval_range to check type compat of start/end/freq""" + is_ts_compat = lambda x: isinstance(x, (Timestamp, DateOffset)) + is_td_compat = lambda x: isinstance(x, (Timedelta, DateOffset)) + return ((is_number(a) and is_number(b)) or + (is_ts_compat(a) and is_ts_compat(b)) or + (is_td_compat(a) and is_td_compat(b)) or + com._any_none(a, b)) + + +def interval_range(start=None, end=None, periods=None, freq=None, + name=None, closed='right'): """ Return a fixed frequency IntervalIndex Parameters ---------- - start : string or datetime-like, default None - Left bound for generating data - end : string or datetime-like, default None - Right bound for generating data - freq : interger, string or DateOffset, default 1 - periods : interger, default None - name : str, default None - Name of the resulting index + start : numeric or datetime-like, default None + Left bound for generating intervals + end : numeric or datetime-like, default None + Right bound for generating intervals + periods : integer, default None + Number of periods to generate + freq : numeric, string, or DateOffset, default None + The length of each interval. Must be consistent with the type of start + and end, e.g. 2 for numeric, or '5H' for datetime-like. Default is 1 + for numeric and 'D' (calendar daily) for datetime-like. + name : string, default None + Name of the resulting IntervalIndex closed : string, default 'right' options are: 'left', 'right', 'both', 'neither' Notes ----- - 2 of start, end, or periods must be specified + Of the three parameters: ``start``, ``end``, and ``periods``, exactly two + must be specified. Returns ------- rng : IntervalIndex + + Examples + -------- + + Numeric ``start`` and ``end`` is supported. + + >>> pd.interval_range(start=0, end=5) + IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]] + closed='right', dtype='interval[int64]') + + Additionally, datetime-like input is also supported. 
+
+    >>> pd.interval_range(start='2017-01-01', end='2017-01-04')
+    IntervalIndex([(2017-01-01, 2017-01-02], (2017-01-02, 2017-01-03],
+                   (2017-01-03, 2017-01-04]]
+                  closed='right', dtype='interval[datetime64[ns]]')
+
+    The ``freq`` parameter specifies the frequency between the left and right
+    endpoints of the individual intervals within the ``IntervalIndex``. For
+    numeric ``start`` and ``end``, the frequency must also be numeric.
+
+    >>> pd.interval_range(start=0, periods=4, freq=1.5)
+    IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]]
+                  closed='right', dtype='interval[float64]')
+
+    Similarly, for datetime-like ``start`` and ``end``, the frequency must be
+    convertible to a DateOffset.
+
+    >>> pd.interval_range(start='2017-01-01', periods=3, freq='MS')
+    IntervalIndex([(2017-01-01, 2017-02-01], (2017-02-01, 2017-03-01],
+                   (2017-03-01, 2017-04-01]]
+                  closed='right', dtype='interval[datetime64[ns]]')
+
+    The ``closed`` parameter specifies which endpoints of the individual
+    intervals within the ``IntervalIndex`` are closed.
+
+    >>> pd.interval_range(end=5, periods=4, closed='both')
+    IntervalIndex([[1, 2], [2, 3], [3, 4], [4, 5]]
+                  closed='both', dtype='interval[int64]')
     """
+    if com._count_not_none(start, end, periods) != 2:
+        raise ValueError('Of the three parameters: start, end, and periods, '
+                         'exactly two must be specified')
+
+    start = com._maybe_box_datetimelike(start)
+    end = com._maybe_box_datetimelike(end)
+    endpoint = next(com._not_none(start, end))
+
+    if not _is_valid_endpoint(start):
+        msg = 'start must be numeric or datetime-like, got {start}'
+        raise ValueError(msg.format(start=start))
+
+    if not _is_valid_endpoint(end):
+        msg = 'end must be numeric or datetime-like, got {end}'
+        raise ValueError(msg.format(end=end))
+
+    if is_float(periods):
+        periods = int(periods)
+    elif not is_integer(periods) and periods is not None:
+        msg = 'periods must be a number, got {periods}'
+        raise TypeError(msg.format(periods=periods))
+
+    freq = freq or (1 if is_number(endpoint) else 'D')
+    if not is_number(freq):
+        try:
+            freq = to_offset(freq)
+        except ValueError:
+            raise ValueError('freq must be numeric or convertible to '
+                             'DateOffset, got {freq}'.format(freq=freq))

-    if freq is None:
-        freq = 1
+    # verify type compatibility
+    if not all([_is_type_compatible(start, end),
+                _is_type_compatible(start, freq),
+                _is_type_compatible(end, freq)]):
+        raise TypeError("start, end, freq need to be type compatible")

-    if start is None:
-        if periods is None or end is None:
-            raise ValueError("must specify 2 of start, end, periods")
-        start = end - periods * freq
-    if end is None:
-        if periods is None or start is None:
-            raise ValueError("must specify 2 of start, end, periods")
+    if is_number(endpoint):
+        if periods is None:
+            periods = int((end - start) // freq)
+
+        if start is None:
+            start = end - periods * freq
+
+        # force end to be consistent with freq (lower if freq skips over end)
         end = start + periods * freq

-    if periods is None:
-        if start is None or end is None:
-            raise ValueError("must specify 2 of start, end, periods")
-        pass
-
-    # must all be same units or None
-    arr = np.array([start, end, freq])
-    if is_object_dtype(arr):
-        raise ValueError("start, end, freq need to be the same type")
-
-    return IntervalIndex.from_breaks(np.arange(start, end, freq),
-                                     name=name,
-                                     closed=closed)
+
+        # end + freq for inclusive endpoint
+        breaks = np.arange(start, end + freq, freq)
+    elif isinstance(endpoint, Timestamp):
+        # add one to account for interval endpoints (n breaks = n-1 
intervals)
+        if periods is not None:
+            periods += 1
+        breaks = date_range(start=start, end=end, periods=periods, freq=freq)
+    else:
+        # add one to account for interval endpoints (n breaks = n-1 intervals)
+        if periods is not None:
+            periods += 1
+        breaks = timedelta_range(start=start, end=end, periods=periods,
+                                 freq=freq)
+
+    return IntervalIndex.from_breaks(breaks, name=name, closed=closed)
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
index 0915462d4d4212..fb47d1db48610b 100644
--- a/pandas/core/indexes/period.py
+++ b/pandas/core/indexes/period.py
@@ -199,8 +199,8 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None,
             if is_float(periods):
                 periods = int(periods)
             elif not is_integer(periods):
-                raise ValueError('Periods must be a number, got %s' %
-                                 str(periods))
+                msg = 'periods must be a number, got {periods}'
+                raise TypeError(msg.format(periods=periods))

         if name is None and hasattr(data, 'name'):
             name = data.name
@@ -1051,8 +1051,9 @@ def tz_localize(self, tz, infer_dst=False):


 def _get_ordinal_range(start, end, periods, freq, mult=1):
-    if com._count_not_none(start, end, periods) < 2:
-        raise ValueError('Must specify 2 of start, end, periods')
+    if com._count_not_none(start, end, periods) != 2:
+        raise ValueError('Of the three parameters: start, end, and periods, '
+                         'exactly two must be specified')

     if freq is not None:
         _, mult = _gfc(freq)
@@ -1066,9 +1067,9 @@ def _get_ordinal_range(start, end, periods, freq, mult=1):
     is_end_per = isinstance(end, Period)

     if is_start_per and is_end_per and start.freq != end.freq:
-        raise ValueError('Start and end must have same freq')
+        raise ValueError('start and end must have same freq')
     if (start is tslib.NaT or end is tslib.NaT):
-        raise ValueError('Start and end must not be NaT')
+        raise ValueError('start and end must not be NaT')

     if freq is None:
         if is_start_per:
@@ -1157,24 +1158,55 @@ def pnow(freq=None):

 def period_range(start=None, end=None, periods=None, freq='D', name=None):
     """
-    Return a fixed frequency datetime index, with day (calendar) as the default
+    Return a fixed frequency PeriodIndex, with day (calendar) as the default
     frequency
-
     Parameters
     ----------
-    start : starting value, period-like, optional
-    end : ending value, period-like, optional
-    periods : int, default None
-        Number of periods in the index
-    freq : str/DateOffset, default 'D'
+    start : string or period-like, default None
+        Left bound for generating periods
+    end : string or period-like, default None
+        Right bound for generating periods
+    periods : integer, default None
+        Number of periods to generate
+    freq : string or DateOffset, default 'D' (calendar daily)
         Frequency alias
-    name : str, default None
-        Name for the resulting PeriodIndex
+    name : string, default None
+        Name of the resulting PeriodIndex
+
+    Notes
+    -----
+    Of the three parameters: ``start``, ``end``, and ``periods``, exactly two
+    must be specified.
+
+    To learn more about the frequency strings, please see `this link
+    `__.

     Returns
     -------
     prng : PeriodIndex
+
+    Examples
+    --------
+
+    >>> pd.period_range(start='2017-01-01', end='2018-01-01', freq='M')
+    PeriodIndex(['2017-01', '2017-02', '2017-03', '2017-04', '2017-05',
+                 '2017-06', '2017-07', '2017-08', '2017-09', '2017-10',
+                 '2017-11', '2017-12', '2018-01'],
+                dtype='period[M]', freq='M')
+
+    If ``start`` or ``end`` are ``Period`` objects, they will be used as anchor
+    endpoints for a ``PeriodIndex`` with frequency matching that of the
+    ``period_range`` constructor. 
+ + >>> pd.period_range(start=pd.Period('2017Q1', freq='Q'), + ... end=pd.Period('2017Q2', freq='Q'), freq='M') + PeriodIndex(['2017-03', '2017-04', '2017-05', '2017-06'], + dtype='period[M]', freq='M') """ + if com._count_not_none(start, end, periods) != 2: + raise ValueError('Of the three parameters: start, end, and periods, ' + 'exactly two must be specified') + return PeriodIndex(start=start, end=end, periods=periods, freq=freq, name=name) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 2823951c0f3487..d7b7d56d74a3a9 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -180,8 +180,8 @@ def __new__(cls, data=None, unit=None, if is_float(periods): periods = int(periods) elif not is_integer(periods): - raise ValueError('Periods must be a number, got %s' % - str(periods)) + msg = 'periods must be a number, got {periods}' + raise TypeError(msg.format(periods=periods)) if data is None and freq is None: raise ValueError("Must provide freq argument if no data is " @@ -234,7 +234,8 @@ def __new__(cls, data=None, unit=None, @classmethod def _generate(cls, start, end, periods, name, offset, closed=None): if com._count_not_none(start, end, periods) != 2: - raise ValueError('Must specify two of start, end, or periods') + raise ValueError('Of the three parameters: start, end, and ' + 'periods, exactly two must be specified') if start is not None: start = Timedelta(start) @@ -960,22 +961,22 @@ def _generate_regular_range(start, end, periods, offset): def timedelta_range(start=None, end=None, periods=None, freq='D', name=None, closed=None): """ - Return a fixed frequency timedelta index, with day as the default + Return a fixed frequency TimedeltaIndex, with day as the default frequency Parameters ---------- start : string or timedelta-like, default None - Left bound for generating dates - end : string or datetime-like, default None - Right bound for generating dates - periods : integer or None, default None - If None, must specify start and end + Left bound for generating timedeltas + end : string or timedelta-like, default None + Right bound for generating timedeltas + periods : integer, default None + Number of periods to generate freq : string or DateOffset, default 'D' (calendar daily) Frequency strings can have multiples, e.g. '5H' - name : str, default None - Name of the resulting index - closed : string or None, default None + name : string, default None + Name of the resulting TimedeltaIndex + closed : string, default None Make the interval closed with respect to the given frequency to the 'left', 'right', or both sides (None) @@ -985,11 +986,34 @@ def timedelta_range(start=None, end=None, periods=None, freq='D', Notes ----- - 2 of start, end, or periods must be specified. + Of the three parameters: ``start``, ``end``, and ``periods``, exactly two + must be specified. To learn more about the frequency strings, please see `this link `__. + + Examples + -------- + + >>> pd.timedelta_range(start='1 day', periods=4) + TimedeltaIndex(['1 days', '2 days', '3 days', '4 days'], + dtype='timedelta64[ns]', freq='D') + + The ``closed`` parameter specifies which endpoint is included. The default + behavior is to include both endpoints. + + >>> pd.timedelta_range(start='1 day', periods=4, closed='right') + TimedeltaIndex(['2 days', '3 days', '4 days'], + dtype='timedelta64[ns]', freq='D') + + The ``freq`` parameter specifies the frequency of the TimedeltaIndex. 
+ Only fixed frequencies can be passed, non-fixed frequencies such as + 'M' (month end) will raise. + + >>> pd.timedelta_range(start='1 day', end='2 days', freq='6H') + TimedeltaIndex(['1 days 00:00:00', '1 days 06:00:00', '1 days 12:00:00', + '1 days 18:00:00', '2 days 00:00:00'], + dtype='timedelta64[ns]', freq='6H') """ return TimedeltaIndex(start=start, end=end, periods=periods, - freq=freq, name=name, - closed=closed) + freq=freq, name=name, closed=closed) diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py index cf896b06130a24..a4706dd8a3767b 100644 --- a/pandas/tests/indexes/datetimes/test_construction.py +++ b/pandas/tests/indexes/datetimes/test_construction.py @@ -307,8 +307,9 @@ def test_constructor_coverage(self): exp = date_range('1/1/2000', periods=10) tm.assert_index_equal(rng, exp) - pytest.raises(ValueError, DatetimeIndex, start='1/1/2000', - periods='foo', freq='D') + msg = 'periods must be a number, got foo' + with tm.assert_raises_regex(TypeError, msg): + DatetimeIndex(start='1/1/2000', periods='foo', freq='D') pytest.raises(ValueError, DatetimeIndex, start='1/1/2000', end='1/10/2000') diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index da4ca83c10dda2..8d86bebdd4d5e4 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -107,8 +107,10 @@ def test_date_range_ambiguous_arguments(self): start = datetime(2011, 1, 1, 5, 3, 40) end = datetime(2011, 1, 1, 8, 9, 40) - pytest.raises(ValueError, date_range, start, end, freq='s', - periods=10) + msg = ('Of the three parameters: start, end, and periods, ' + 'exactly two must be specified') + with tm.assert_raises_regex(ValueError, msg): + date_range(start, end, periods=10, freq='s') def test_date_range_businesshour(self): idx = DatetimeIndex(['2014-07-04 09:00', '2014-07-04 10:00', @@ -146,14 +148,29 @@ def test_date_range_businesshour(self): def test_range_misspecified(self): # GH #1095 + msg = ('Of the three parameters: start, end, and periods, ' + 'exactly two must be specified') + + with tm.assert_raises_regex(ValueError, msg): + date_range(start='1/1/2000') + + with tm.assert_raises_regex(ValueError, msg): + date_range(end='1/1/2000') + + with tm.assert_raises_regex(ValueError, msg): + date_range(periods=10) + + with tm.assert_raises_regex(ValueError, msg): + date_range(start='1/1/2000', freq='H') - pytest.raises(ValueError, date_range, '1/1/2000') - pytest.raises(ValueError, date_range, end='1/1/2000') - pytest.raises(ValueError, date_range, periods=10) + with tm.assert_raises_regex(ValueError, msg): + date_range(end='1/1/2000', freq='H') - pytest.raises(ValueError, date_range, '1/1/2000', freq='H') - pytest.raises(ValueError, date_range, end='1/1/2000', freq='H') - pytest.raises(ValueError, date_range, periods=10, freq='H') + with tm.assert_raises_regex(ValueError, msg): + date_range(periods=10, freq='H') + + with tm.assert_raises_regex(ValueError, msg): + date_range() def test_compat_replace(self): # https://github.com/statsmodels/statsmodels/issues/3349 @@ -231,8 +248,13 @@ def test_constructor(self): bdate_range(START, END, freq=BDay()) bdate_range(START, periods=20, freq=BDay()) bdate_range(end=START, periods=20, freq=BDay()) - pytest.raises(ValueError, date_range, '2011-1-1', '2012-1-1', 'B') - pytest.raises(ValueError, bdate_range, '2011-1-1', '2012-1-1', 'B') + + msg = 'periods must be a number, got B' + with 
tm.assert_raises_regex(TypeError, msg): + date_range('2011-1-1', '2012-1-1', 'B') + + with tm.assert_raises_regex(TypeError, msg): + bdate_range('2011-1-1', '2012-1-1', 'B') def test_naive_aware_conflicts(self): naive = bdate_range(START, END, freq=BDay(), tz=None) @@ -510,8 +532,13 @@ def test_constructor(self): cdate_range(START, END, freq=CDay()) cdate_range(START, periods=20, freq=CDay()) cdate_range(end=START, periods=20, freq=CDay()) - pytest.raises(ValueError, date_range, '2011-1-1', '2012-1-1', 'C') - pytest.raises(ValueError, cdate_range, '2011-1-1', '2012-1-1', 'C') + + msg = 'periods must be a number, got C' + with tm.assert_raises_regex(TypeError, msg): + date_range('2011-1-1', '2012-1-1', 'C') + + with tm.assert_raises_regex(TypeError, msg): + cdate_range('2011-1-1', '2012-1-1', 'C') def test_cached_range(self): DatetimeIndex._cached_range(START, END, offset=CDay()) diff --git a/pandas/tests/indexes/period/test_construction.py b/pandas/tests/indexes/period/test_construction.py index e5b889e1003070..639a9272c38082 100644 --- a/pandas/tests/indexes/period/test_construction.py +++ b/pandas/tests/indexes/period/test_construction.py @@ -436,11 +436,12 @@ def test_constructor_error(self): start = Period('02-Apr-2005', 'B') end_intv = Period('2006-12-31', ('w', 1)) - msg = 'Start and end must have same freq' + msg = 'start and end must have same freq' with tm.assert_raises_regex(ValueError, msg): PeriodIndex(start=start, end=end_intv) - msg = 'Must specify 2 of start, end, periods' + msg = ('Of the three parameters: start, end, and periods, ' + 'exactly two must be specified') with tm.assert_raises_regex(ValueError, msg): PeriodIndex(start=start) diff --git a/pandas/tests/indexes/period/test_period_range.py b/pandas/tests/indexes/period/test_period_range.py new file mode 100644 index 00000000000000..640f24f67f72f2 --- /dev/null +++ b/pandas/tests/indexes/period/test_period_range.py @@ -0,0 +1,94 @@ +import pytest +import pandas.util.testing as tm +from pandas import date_range, NaT, period_range, Period, PeriodIndex + + +class TestPeriodRange(object): + + @pytest.mark.parametrize('freq', ['D', 'W', 'M', 'Q', 'A']) + def test_construction_from_string(self, freq): + # non-empty + expected = date_range(start='2017-01-01', periods=5, + freq=freq, name='foo').to_period() + start, end = str(expected[0]), str(expected[-1]) + + result = period_range(start=start, end=end, freq=freq, name='foo') + tm.assert_index_equal(result, expected) + + result = period_range(start=start, periods=5, freq=freq, name='foo') + tm.assert_index_equal(result, expected) + + result = period_range(end=end, periods=5, freq=freq, name='foo') + tm.assert_index_equal(result, expected) + + # empty + expected = PeriodIndex([], freq=freq, name='foo') + + result = period_range(start=start, periods=0, freq=freq, name='foo') + tm.assert_index_equal(result, expected) + + result = period_range(end=end, periods=0, freq=freq, name='foo') + tm.assert_index_equal(result, expected) + + result = period_range(start=end, end=start, freq=freq, name='foo') + tm.assert_index_equal(result, expected) + + def test_construction_from_period(self): + # upsampling + start, end = Period('2017Q1', freq='Q'), Period('2018Q1', freq='Q') + expected = date_range(start='2017-03-31', end='2018-03-31', freq='M', + name='foo').to_period() + result = period_range(start=start, end=end, freq='M', name='foo') + tm.assert_index_equal(result, expected) + + # downsampling + start, end = Period('2017-1', freq='M'), Period('2019-12', freq='M') + expected = 
date_range(start='2017-01-31', end='2019-12-31', freq='Q', + name='foo').to_period() + result = period_range(start=start, end=end, freq='Q', name='foo') + tm.assert_index_equal(result, expected) + + # empty + expected = PeriodIndex([], freq='W', name='foo') + + result = period_range(start=start, periods=0, freq='W', name='foo') + tm.assert_index_equal(result, expected) + + result = period_range(end=end, periods=0, freq='W', name='foo') + tm.assert_index_equal(result, expected) + + result = period_range(start=end, end=start, freq='W', name='foo') + tm.assert_index_equal(result, expected) + + def test_errors(self): + # not enough params + msg = ('Of the three parameters: start, end, and periods, ' + 'exactly two must be specified') + with tm.assert_raises_regex(ValueError, msg): + period_range(start='2017Q1') + + with tm.assert_raises_regex(ValueError, msg): + period_range(end='2017Q1') + + with tm.assert_raises_regex(ValueError, msg): + period_range(periods=5) + + with tm.assert_raises_regex(ValueError, msg): + period_range() + + # too many params + with tm.assert_raises_regex(ValueError, msg): + period_range(start='2017Q1', end='2018Q1', periods=8, freq='Q') + + # start/end NaT + msg = 'start and end must not be NaT' + with tm.assert_raises_regex(ValueError, msg): + period_range(start=NaT, end='2018Q1') + + with tm.assert_raises_regex(ValueError, msg): + period_range(start='2017Q1', end=NaT) + + # invalid periods param + msg = 'periods must be a number, got foo' + with tm.assert_raises_regex(TypeError, msg): + period_range(start='2017Q1', periods='foo') diff --git a/pandas/tests/indexes/test_interval.py b/pandas/tests/indexes/test_interval.py index 18eefc3fbdca6e..13c3b35e4d85d9 100644 --- a/pandas/tests/indexes/test_interval.py +++ b/pandas/tests/indexes/test_interval.py @@ -2,10 +2,11 @@ import pytest import numpy as np - +from datetime import timedelta from pandas import (Interval, IntervalIndex, Index, isna, interval_range, Timestamp, Timedelta, - compat) + compat, date_range, timedelta_range, DateOffset) +from pandas.tseries.offsets import Day from pandas._libs.interval import IntervalTree from pandas.tests.indexes.common import Base import pandas.util.testing as tm @@ -721,40 +722,278 @@ def test_is_non_overlapping_monotonic(self): class TestIntervalRange(object): - def test_construction(self): - result = interval_range(0, 5, name='foo', closed='both') + @pytest.mark.parametrize('closed', ['left', 'right', 'neither', 'both']) + def test_construction_from_numeric(self, closed): + # combinations of start/end/periods without freq expected = IntervalIndex.from_breaks( - np.arange(0, 5), name='foo', closed='both') + np.arange(0, 6), name='foo', closed=closed) + + result = interval_range(start=0, end=5, name='foo', closed=closed) tm.assert_index_equal(result, expected) - def test_errors(self): + result = interval_range(start=0, periods=5, name='foo', closed=closed) + tm.assert_index_equal(result, expected) + + result = interval_range(end=5, periods=5, name='foo', closed=closed) + tm.assert_index_equal(result, expected) + + # combinations of start/end/periods with freq + expected = IntervalIndex.from_tuples([(0, 2), (2, 4), (4, 6)], + name='foo', closed=closed) + + result = interval_range(start=0, end=6, freq=2, name='foo', + closed=closed) + tm.assert_index_equal(result, expected) + + result = interval_range(start=0, periods=3, freq=2, name='foo', + closed=closed) + tm.assert_index_equal(result, expected) + + result = interval_range(end=6, periods=3, freq=2, name='foo', + closed=closed) + 
tm.assert_index_equal(result, expected) + + # output truncates early if freq causes end to be skipped. + expected = IntervalIndex.from_tuples([(0.0, 1.5), (1.5, 3.0)], + name='foo', closed=closed) + result = interval_range(start=0, end=4, freq=1.5, name='foo', + closed=closed) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize('closed', ['left', 'right', 'neither', 'both']) + def test_construction_from_timestamp(self, closed): + # combinations of start/end/periods without freq + start, end = Timestamp('2017-01-01'), Timestamp('2017-01-06') + breaks = date_range(start=start, end=end) + expected = IntervalIndex.from_breaks(breaks, name='foo', closed=closed) + + result = interval_range(start=start, end=end, name='foo', + closed=closed) + tm.assert_index_equal(result, expected) + + result = interval_range(start=start, periods=5, name='foo', + closed=closed) + tm.assert_index_equal(result, expected) + + result = interval_range(end=end, periods=5, name='foo', + closed=closed) + tm.assert_index_equal(result, expected) + + # combinations of start/end/periods with fixed freq + freq = '2D' + start, end = Timestamp('2017-01-01'), Timestamp('2017-01-07') + breaks = date_range(start=start, end=end, freq=freq) + expected = IntervalIndex.from_breaks(breaks, name='foo', closed=closed) + + result = interval_range(start=start, end=end, freq=freq, name='foo', + closed=closed) + tm.assert_index_equal(result, expected) + + result = interval_range(start=start, periods=3, freq=freq, name='foo', + closed=closed) + tm.assert_index_equal(result, expected) + + result = interval_range(end=end, periods=3, freq=freq, name='foo', + closed=closed) + tm.assert_index_equal(result, expected) + + # output truncates early if freq causes end to be skipped. + end = Timestamp('2017-01-08') + result = interval_range(start=start, end=end, freq=freq, name='foo', + closed=closed) + tm.assert_index_equal(result, expected) + + # combinations of start/end/periods with non-fixed freq + freq = 'M' + start, end = Timestamp('2017-01-01'), Timestamp('2017-12-31') + breaks = date_range(start=start, end=end, freq=freq) + expected = IntervalIndex.from_breaks(breaks, name='foo', closed=closed) + + result = interval_range(start=start, end=end, freq=freq, name='foo', + closed=closed) + tm.assert_index_equal(result, expected) + + result = interval_range(start=start, periods=11, freq=freq, name='foo', + closed=closed) + tm.assert_index_equal(result, expected) + + result = interval_range(end=end, periods=11, freq=freq, name='foo', + closed=closed) + tm.assert_index_equal(result, expected) + + # output truncates early if freq causes end to be skipped. 
+ end = Timestamp('2018-01-15') + result = interval_range(start=start, end=end, freq=freq, name='foo', + closed=closed) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize('closed', ['left', 'right', 'neither', 'both']) + def test_construction_from_timedelta(self, closed): + # combinations of start/end/periods without freq + start, end = Timedelta('1 day'), Timedelta('6 days') + breaks = timedelta_range(start=start, end=end) + expected = IntervalIndex.from_breaks(breaks, name='foo', closed=closed) + + result = interval_range(start=start, end=end, name='foo', + closed=closed) + tm.assert_index_equal(result, expected) + + result = interval_range(start=start, periods=5, name='foo', + closed=closed) + tm.assert_index_equal(result, expected) + + result = interval_range(end=end, periods=5, name='foo', + closed=closed) + tm.assert_index_equal(result, expected) + + # combinations of start/end/periods with fixed freq + freq = '2D' + start, end = Timedelta('1 day'), Timedelta('7 days') + breaks = timedelta_range(start=start, end=end, freq=freq) + expected = IntervalIndex.from_breaks(breaks, name='foo', closed=closed) + + result = interval_range(start=start, end=end, freq=freq, name='foo', + closed=closed) + tm.assert_index_equal(result, expected) + + result = interval_range(start=start, periods=3, freq=freq, name='foo', + closed=closed) + tm.assert_index_equal(result, expected) + + result = interval_range(end=end, periods=3, freq=freq, name='foo', + closed=closed) + tm.assert_index_equal(result, expected) + + # output truncates early if freq causes end to be skipped. + end = Timedelta('7 days 1 hour') + result = interval_range(start=start, end=end, freq=freq, name='foo', + closed=closed) + tm.assert_index_equal(result, expected) + + def test_constructor_coverage(self): + # float value for periods + expected = pd.interval_range(start=0, periods=10) + result = pd.interval_range(start=0, periods=10.5) + tm.assert_index_equal(result, expected) + + # equivalent timestamp-like start/end + start, end = Timestamp('2017-01-01'), Timestamp('2017-01-15') + expected = pd.interval_range(start=start, end=end) + + result = pd.interval_range(start=start.to_pydatetime(), + end=end.to_pydatetime()) + tm.assert_index_equal(result, expected) + + result = pd.interval_range(start=start.tz_localize('UTC'), + end=end.tz_localize('UTC')) + tm.assert_index_equal(result, expected) + + result = pd.interval_range(start=start.asm8, end=end.asm8) + tm.assert_index_equal(result, expected) + + # equivalent freq with timestamp + equiv_freq = ['D', Day(), Timedelta(days=1), timedelta(days=1), + DateOffset(days=1)] + for freq in equiv_freq: + result = pd.interval_range(start=start, end=end, freq=freq) + tm.assert_index_equal(result, expected) + + # equivalent timedelta-like start/end + start, end = Timedelta(days=1), Timedelta(days=10) + expected = pd.interval_range(start=start, end=end) + + result = pd.interval_range(start=start.to_pytimedelta(), + end=end.to_pytimedelta()) + tm.assert_index_equal(result, expected) + + result = pd.interval_range(start=start.asm8, end=end.asm8) + tm.assert_index_equal(result, expected) + + # equivalent freq with timedelta + equiv_freq = ['D', Day(), Timedelta(days=1), timedelta(days=1)] + for freq in equiv_freq: + result = pd.interval_range(start=start, end=end, freq=freq) + tm.assert_index_equal(result, expected) + def test_errors(self): # not enough params - def f(): - interval_range(0) + msg = ('Of the three parameters: start, end, and periods, ' + 'exactly two must be 
specified') - pytest.raises(ValueError, f) + with tm.assert_raises_regex(ValueError, msg): + interval_range(start=0) - def f(): - interval_range(periods=2) + with tm.assert_raises_regex(ValueError, msg): + interval_range(end=5) - pytest.raises(ValueError, f) + with tm.assert_raises_regex(ValueError, msg): + interval_range(periods=2) - def f(): + with tm.assert_raises_regex(ValueError, msg): interval_range() - pytest.raises(ValueError, f) + # too many params + with tm.assert_raises_regex(ValueError, msg): + interval_range(start=0, end=5, periods=6) # mixed units - def f(): - interval_range(0, Timestamp('20130101'), freq=2) + msg = 'start, end, freq need to be type compatible' + with tm.assert_raises_regex(TypeError, msg): + interval_range(start=0, end=Timestamp('20130101'), freq=2) - pytest.raises(ValueError, f) + with tm.assert_raises_regex(TypeError, msg): + interval_range(start=0, end=Timedelta('1 day'), freq=2) - def f(): - interval_range(0, 10, freq=Timedelta('1day')) + with tm.assert_raises_regex(TypeError, msg): + interval_range(start=0, end=10, freq='D') - pytest.raises(ValueError, f) + with tm.assert_raises_regex(TypeError, msg): + interval_range(start=Timestamp('20130101'), end=10, freq='D') + + with tm.assert_raises_regex(TypeError, msg): + interval_range(start=Timestamp('20130101'), + end=Timedelta('1 day'), freq='D') + + with tm.assert_raises_regex(TypeError, msg): + interval_range(start=Timestamp('20130101'), + end=Timestamp('20130110'), freq=2) + + with tm.assert_raises_regex(TypeError, msg): + interval_range(start=Timedelta('1 day'), end=10, freq='D') + + with tm.assert_raises_regex(TypeError, msg): + interval_range(start=Timedelta('1 day'), + end=Timestamp('20130110'), freq='D') + + with tm.assert_raises_regex(TypeError, msg): + interval_range(start=Timedelta('1 day'), + end=Timedelta('10 days'), freq=2) + + # invalid periods + msg = 'periods must be a number, got foo' + with tm.assert_raises_regex(TypeError, msg): + interval_range(start=0, periods='foo') + + # invalid start + msg = 'start must be numeric or datetime-like, got foo' + with tm.assert_raises_regex(ValueError, msg): + interval_range(start='foo', periods=10) + + # invalid end + msg = 'end must be numeric or datetime-like, got \(0, 1\]' + with tm.assert_raises_regex(ValueError, msg): + interval_range(end=Interval(0, 1), periods=10) + + # invalid freq for datetime-like + msg = 'freq must be numeric or convertible to DateOffset, got foo' + with tm.assert_raises_regex(ValueError, msg): + interval_range(start=0, end=10, freq='foo') + + with tm.assert_raises_regex(ValueError, msg): + interval_range(start=Timestamp('20130101'), periods=10, freq='foo') + + with tm.assert_raises_regex(ValueError, msg): + interval_range(end=Timedelta('1 day'), periods=10, freq='foo') class TestIntervalTree(object): diff --git a/pandas/tests/indexes/timedeltas/test_construction.py b/pandas/tests/indexes/timedeltas/test_construction.py index dd25e2cca2e553..70aadd9f571740 100644 --- a/pandas/tests/indexes/timedeltas/test_construction.py +++ b/pandas/tests/indexes/timedeltas/test_construction.py @@ -50,8 +50,9 @@ def test_constructor_coverage(self): exp = timedelta_range('1 days', periods=10) tm.assert_index_equal(rng, exp) - pytest.raises(ValueError, TimedeltaIndex, start='1 days', - periods='foo', freq='D') + msg = 'periods must be a number, got foo' + with tm.assert_raises_regex(TypeError, msg): + TimedeltaIndex(start='1 days', periods='foo', freq='D') pytest.raises(ValueError, TimedeltaIndex, start='1 days', end='10 days') diff --git 
a/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/pandas/tests/indexes/timedeltas/test_timedelta_range.py index 4732a0ce110dea..7624e1f79af152 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta_range.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta_range.py @@ -1,5 +1,4 @@ import numpy as np - import pandas as pd import pandas.util.testing as tm from pandas.tseries.offsets import Day, Second @@ -49,3 +48,23 @@ def test_timedelta_range(self): expected = df.loc[pd.Timedelta('0s'):, :] result = df.loc['0s':, :] assert_frame_equal(expected, result) + + def test_errors(self): + # not enough params + msg = ('Of the three parameters: start, end, and periods, ' + 'exactly two must be specified') + with tm.assert_raises_regex(ValueError, msg): + timedelta_range(start='0 days') + + with tm.assert_raises_regex(ValueError, msg): + timedelta_range(end='5 days') + + with tm.assert_raises_regex(ValueError, msg): + timedelta_range(periods=2) + + with tm.assert_raises_regex(ValueError, msg): + timedelta_range() + + # too many params + with tm.assert_raises_regex(ValueError, msg): + timedelta_range(start='0 days', end='5 days', periods=10) From 97abd2c9c11aeee0e3d2c58a74d85fa75062ca1f Mon Sep 17 00:00:00 2001 From: Kirk Hansen Date: Thu, 14 Sep 2017 05:14:43 -0500 Subject: [PATCH 083/188] TST: Made s3 related tests mock boto (#17388) --- appveyor.yml | 6 ++ ci/install_circle.sh | 1 + ci/install_travis.sh | 2 +- ci/requirements-2.7_WIN.pip | 0 ci/requirements-3.6_NUMPY_DEV.pip | 0 ci/requirements-3.6_WIN.pip | 0 ci/requirements_dev.txt | 1 + pandas/tests/io/parser/data/tips.csv.bz2 | Bin 0 -> 1316 bytes pandas/tests/io/parser/data/tips.csv.gz | Bin 0 -> 1740 bytes pandas/tests/io/parser/test_network.py | 100 ++++++++++++++--------- pandas/tests/io/test_excel.py | 58 ++++++------- tox.ini | 1 + 12 files changed, 102 insertions(+), 67 deletions(-) create mode 100644 ci/requirements-2.7_WIN.pip create mode 100644 ci/requirements-3.6_NUMPY_DEV.pip create mode 100644 ci/requirements-3.6_WIN.pip create mode 100644 pandas/tests/io/parser/data/tips.csv.bz2 create mode 100644 pandas/tests/io/parser/data/tips.csv.gz diff --git a/appveyor.yml b/appveyor.yml index 65e62f887554e5..a1f8886f6d068f 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -74,12 +74,18 @@ install: # create our env - cmd: conda create -n pandas python=%PYTHON_VERSION% cython pytest>=3.1.0 pytest-xdist - cmd: activate pandas + - cmd: pip install moto - SET REQ=ci\requirements-%PYTHON_VERSION%_WIN.run - cmd: echo "installing requirements from %REQ%" - cmd: conda install -n pandas --file=%REQ% - cmd: conda list -n pandas - cmd: echo "installing requirements from %REQ% - done" + # add some pip only reqs to the env + - SET REQ=ci\requirements-%PYTHON_VERSION%_WIN.pip + - cmd: echo "installing requirements from %REQ%" + - cmd: pip install -Ur %REQ% + # build em using the local source checkout in the correct windows env - cmd: '%CMD_IN_ENV% python setup.py build_ext --inplace' diff --git a/ci/install_circle.sh b/ci/install_circle.sh index 29ca69970104b0..fd79f907625e9d 100755 --- a/ci/install_circle.sh +++ b/ci/install_circle.sh @@ -67,6 +67,7 @@ time conda create -n pandas -q --file=${REQ_BUILD} || exit 1 time conda install -n pandas pytest>=3.1.0 || exit 1 source activate pandas +time pip install moto || exit 1 # build but don't install echo "[build em]" diff --git a/ci/install_travis.sh b/ci/install_travis.sh index d26689f2e6b4bd..b85263daa1eaca 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -104,7 +104,7 @@ if [ 
-e ${REQ} ]; then
 fi

 time conda install -n pandas pytest>=3.1.0
-time pip install pytest-xdist
+time pip install pytest-xdist moto

 if [ "$LINT" ]; then
     conda install flake8
diff --git a/ci/requirements-2.7_WIN.pip b/ci/requirements-2.7_WIN.pip
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/ci/requirements-3.6_NUMPY_DEV.pip b/ci/requirements-3.6_NUMPY_DEV.pip
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/ci/requirements-3.6_WIN.pip b/ci/requirements-3.6_WIN.pip
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt
index c7190c506ba18f..dbc4f6cbd65098 100644
--- a/ci/requirements_dev.txt
+++ b/ci/requirements_dev.txt
@@ -5,3 +5,4 @@ cython
 pytest>=3.1.0
 pytest-cov
 flake8
+moto
diff --git a/pandas/tests/io/parser/data/tips.csv.bz2 b/pandas/tests/io/parser/data/tips.csv.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..1452896b05e9d41f58ffd816a0459d86796718a6
GIT binary patch
literal 1316
[base85-encoded binary data omitted]

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/parser/data/tips.csv.gz b/pandas/tests/io/parser/data/tips.csv.gz
new file mode 100644
GIT binary patch
literal 1740
[base85-encoded binary data omitted]

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py
index 3344243f8137af..27cc708889fa23 100644
--- a/pandas/tests/io/parser/test_network.py
+++ b/pandas/tests/io/parser/test_network.py
@@ -4,13 +4,20 @@
 Tests parsers ability to read and parse non-local files
 and hence require a network connection to be read.
""" - import os + import pytest +import moto import pandas.util.testing as tm from pandas import DataFrame from pandas.io.parsers import read_csv, read_table +from pandas.compat import BytesIO + + +@pytest.fixture(scope='module') +def tips_file(): + return os.path.join(tm.get_data_path(), 'tips.csv') @pytest.fixture(scope='module') @@ -19,6 +26,40 @@ def salaries_table(): return read_table(path) +@pytest.fixture(scope='module') +def s3_resource(tips_file): + pytest.importorskip('s3fs') + moto.mock_s3().start() + + test_s3_files = [ + ('tips.csv', tips_file), + ('tips.csv.gz', tips_file + '.gz'), + ('tips.csv.bz2', tips_file + '.bz2'), + ] + + def add_tips_files(bucket_name): + for s3_key, file_name in test_s3_files: + with open(file_name, 'rb') as f: + conn.Bucket(bucket_name).put_object( + Key=s3_key, + Body=f) + + boto3 = pytest.importorskip('boto3') + # see gh-16135 + bucket = 'pandas-test' + + conn = boto3.resource("s3", region_name="us-east-1") + conn.create_bucket(Bucket=bucket) + add_tips_files(bucket) + + conn.create_bucket(Bucket='cant_get_it', ACL='private') + add_tips_files('cant_get_it') + + yield conn + + moto.mock_s3().stop() + + @pytest.mark.network @pytest.mark.parametrize( "compression,extension", @@ -51,15 +92,11 @@ def check_compressed_urls(salaries_table, compression, extension, mode, class TestS3(object): - - def setup_method(self, method): - try: - import s3fs # noqa - except ImportError: - pytest.skip("s3fs not installed") - @tm.network def test_parse_public_s3_bucket(self): + pytest.importorskip('s3fs') + # more of an integration test due to the not-public contents portion + # can probably mock this though. for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, compression=comp) @@ -74,8 +111,8 @@ def test_parse_public_s3_bucket(self): assert not df.empty tm.assert_frame_equal(read_csv(tm.get_data_path('tips.csv')), df) - @tm.network - def test_parse_public_s3n_bucket(self): + def test_parse_public_s3n_bucket(self, s3_resource): + # Read from AWS s3 as "s3n" URL df = read_csv('s3n://pandas-test/tips.csv', nrows=10) assert isinstance(df, DataFrame) @@ -83,8 +120,7 @@ def test_parse_public_s3n_bucket(self): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')).iloc[:10], df) - @tm.network - def test_parse_public_s3a_bucket(self): + def test_parse_public_s3a_bucket(self, s3_resource): # Read from AWS s3 as "s3a" URL df = read_csv('s3a://pandas-test/tips.csv', nrows=10) assert isinstance(df, DataFrame) @@ -92,8 +128,7 @@ def test_parse_public_s3a_bucket(self): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')).iloc[:10], df) - @tm.network - def test_parse_public_s3_bucket_nrows(self): + def test_parse_public_s3_bucket_nrows(self, s3_resource): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, nrows=10, compression=comp) @@ -102,8 +137,7 @@ def test_parse_public_s3_bucket_nrows(self): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')).iloc[:10], df) - @tm.network - def test_parse_public_s3_bucket_chunked(self): + def test_parse_public_s3_bucket_chunked(self, s3_resource): # Read with a chunksize chunksize = 5 local_tips = read_csv(tm.get_data_path('tips.csv')) @@ -121,8 +155,7 @@ def test_parse_public_s3_bucket_chunked(self): chunksize * i_chunk: chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - @tm.network - def test_parse_public_s3_bucket_chunked_python(self): + def 
test_parse_public_s3_bucket_chunked_python(self, s3_resource): # Read with a chunksize using the Python parser chunksize = 5 local_tips = read_csv(tm.get_data_path('tips.csv')) @@ -140,8 +173,7 @@ def test_parse_public_s3_bucket_chunked_python(self): chunksize * i_chunk: chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - @tm.network - def test_parse_public_s3_bucket_python(self): + def test_parse_public_s3_bucket_python(self, s3_resource): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', compression=comp) @@ -150,8 +182,7 @@ def test_parse_public_s3_bucket_python(self): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')), df) - @tm.network - def test_infer_s3_compression(self): + def test_infer_s3_compression(self, s3_resource): for ext in ['', '.gz', '.bz2']: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', compression='infer') @@ -160,8 +191,7 @@ def test_infer_s3_compression(self): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')), df) - @tm.network - def test_parse_public_s3_bucket_nrows_python(self): + def test_parse_public_s3_bucket_nrows_python(self, s3_resource): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', nrows=10, compression=comp) @@ -170,8 +200,7 @@ def test_parse_public_s3_bucket_nrows_python(self): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')).iloc[:10], df) - @tm.network - def test_s3_fails(self): + def test_s3_fails(self, s3_resource): with pytest.raises(IOError): read_csv('s3://nyqpug/asdf.csv') @@ -180,21 +209,18 @@ def test_s3_fails(self): with pytest.raises(IOError): read_csv('s3://cant_get_it/') - @tm.network - def boto3_client_s3(self): + def test_read_csv_handles_boto_s3_object(self, + s3_resource, + tips_file): # see gh-16135 - # boto3 is a dependency of s3fs - import boto3 - client = boto3.client("s3") - - key = "/tips.csv" - bucket = "pandas-test" - s3_object = client.get_object(Bucket=bucket, Key=key) + s3_object = s3_resource.meta.client.get_object( + Bucket='pandas-test', + Key='tips.csv') - result = read_csv(s3_object["Body"]) + result = read_csv(BytesIO(s3_object["Body"].read()), encoding='utf8') assert isinstance(result, DataFrame) assert not result.empty - expected = read_csv(tm.get_data_path('tips.csv')) + expected = read_csv(tips_file) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 92147b46097b80..6a399f41975e5b 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -1,33 +1,32 @@ # pylint: disable=E1101 - -from pandas.compat import u, range, map, openpyxl_compat, BytesIO, iteritems -from datetime import datetime, date, time -import sys +import functools +import operator import os +import sys +import warnings +from datetime import datetime, date, time from distutils.version import LooseVersion from functools import partial - -import warnings from warnings import catch_warnings -import operator -import functools -import pytest -from numpy import nan import numpy as np +import pytest +from numpy import nan +import moto import pandas as pd +import pandas.util.testing as tm from pandas import DataFrame, Index, MultiIndex -from pandas.io.formats.excel import ExcelFormatter -from pandas.io.parsers import read_csv +from pandas.compat import u, range, map, openpyxl_compat, BytesIO, iteritems +from pandas.core.config import 
set_option, get_option +from pandas.io.common import URLError from pandas.io.excel import ( ExcelFile, ExcelWriter, read_excel, _XlwtWriter, _Openpyxl1Writer, _Openpyxl20Writer, _Openpyxl22Writer, register_writer, _XlsxWriter ) -from pandas.io.common import URLError +from pandas.io.formats.excel import ExcelFormatter +from pandas.io.parsers import read_csv from pandas.util.testing import ensure_clean, makeCustomDataframe as mkdf -from pandas.core.config import set_option, get_option -import pandas.util.testing as tm def _skip_if_no_xlrd(): @@ -67,13 +66,6 @@ def _skip_if_no_excelsuite(): _skip_if_no_openpyxl() -def _skip_if_no_s3fs(): - try: - import s3fs # noqa - except ImportError: - pytest.skip('s3fs not installed, skipping') - - _seriesd = tm.getSeriesData() _tsd = tm.getTimeSeriesData() _frame = DataFrame(_seriesd)[:10] @@ -605,14 +597,22 @@ def test_read_from_http_url(self): local_table = self.get_exceldf('test1') tm.assert_frame_equal(url_table, local_table) - @tm.network(check_before_test=True) def test_read_from_s3_url(self): - _skip_if_no_s3fs() - - url = ('s3://pandas-test/test1' + self.ext) - url_table = read_excel(url) - local_table = self.get_exceldf('test1') - tm.assert_frame_equal(url_table, local_table) + boto3 = pytest.importorskip('boto3') + pytest.importorskip('s3fs') + + with moto.mock_s3(): + conn = boto3.resource("s3", region_name="us-east-1") + conn.create_bucket(Bucket="pandas-test") + file_name = os.path.join(self.dirpath, 'test1' + self.ext) + with open(file_name, 'rb') as f: + conn.Bucket("pandas-test").put_object(Key="test1" + self.ext, + Body=f) + + url = ('s3://pandas-test/test1' + self.ext) + url_table = read_excel(url) + local_table = self.get_exceldf('test1') + tm.assert_frame_equal(url_table, local_table) @pytest.mark.slow def test_read_from_file_url(self): diff --git a/tox.ini b/tox.ini index 45ad7fc451e764..f055251581a93c 100644 --- a/tox.ini +++ b/tox.ini @@ -19,6 +19,7 @@ deps = xlrd six sqlalchemy + moto # cd to anything but the default {toxinidir} which # contains the pandas subdirectory and confuses From 0097cb712a7361a69eb4f5ebb9bc13c2b8733f19 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 14 Sep 2017 11:09:30 -0500 Subject: [PATCH 084/188] PERF: Avoid values in Categorical.set_categories (#17515) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mater: ```python In [1]: import pandas as pd; import numpy as np In [2]: arr = ['s%04d' % i for i in np.random.randint(0, 500000 // 10, size=500000)]; s = pd.Series(arr).astype('category') In [3]: %timeit s.cat.set_categories(s.cat.categories) 68.5 ms ± 846 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) ``` HEAD: ```python In [1]: import pandas as pd; import numpy as np In [2]: arr = ['s%04d' % i for i in np.random.randint(0, 500000 // 10, size=500000)] s = pd.Series(arr).astype('category') In [3]: %timeit s.cat.set_categories(s.cat.categories) 7.43 ms ± 110 µs per loop (mean ± std. dev. 
of 7 runs, 100 loops each) ``` Closes https://github.com/pandas-dev/pandas/issues/17508 --- asv_bench/benchmarks/categoricals.py | 3 ++ doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/categorical.py | 37 ++++++++++++++++- pandas/core/dtypes/concat.py | 11 ++--- pandas/tests/test_categorical.py | 62 ++++++++++++++++++++++++++++ 5 files changed, 104 insertions(+), 10 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 6432ccfb19efec..d90c994b3d194a 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -67,6 +67,9 @@ def time_value_counts_dropna(self): def time_rendering(self): str(self.sel) + def time_set_categories(self): + self.ts.cat.set_categories(self.ts.cat.categories[::2]) + class Categoricals3(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 939199d3f6fa6d..6495ad3e7f6adb 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -467,6 +467,7 @@ Performance Improvements - Improved performance of instantiating :class:`SparseDataFrame` (:issue:`16773`) - :attr:`Series.dt` no longer performs frequency inference, yielding a large speedup when accessing the attribute (:issue:`17210`) +- Improved performance of :meth:`Categorical.set_categories` by not materializing the values (:issue:`17508`) - :attr:`Timestamp.microsecond` no longer re-computes on attribute access (:issue:`17331`) .. _whatsnew_0210.bug_fixes: diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 97df72900428c4..e67ce2936819f5 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -777,8 +777,9 @@ def set_categories(self, new_categories, ordered=None, rename=False, # remove all _codes which are larger and set to -1/NaN self._codes[self._codes >= len(new_categories)] = -1 else: - values = cat.__array__() - cat._codes = _get_codes_for_values(values, new_categories) + codes = _recode_for_categories(self.codes, self.categories, + new_categories) + cat._codes = codes cat._categories = new_categories if ordered is None: @@ -2113,6 +2114,38 @@ def _get_codes_for_values(values, categories): return coerce_indexer_dtype(t.lookup(vals), cats) +def _recode_for_categories(codes, old_categories, new_categories): + """ + Convert a set of codes for to a new set of categories + + Parameters + ---------- + codes : array + old_categories, new_categories : Index + + Returns + ------- + new_codes : array + + Examples + -------- + >>> old_cat = pd.Index(['b', 'a', 'c']) + >>> new_cat = pd.Index(['a', 'b']) + >>> codes = np.array([0, 1, 1, 2]) + >>> _recode_for_categories(codes, old_cat, new_cat) + array([ 1, 0, 0, -1]) + """ + from pandas.core.algorithms import take_1d + + if len(old_categories) == 0: + # All null anyway, so just retain the nulls + return codes + indexer = coerce_indexer_dtype(new_categories.get_indexer(old_categories), + new_categories) + new_codes = take_1d(indexer, codes.copy(), fill_value=-1) + return new_codes + + def _convert_to_list_like(list_like): if hasattr(list_like, "dtype"): return list_like diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 0ce45eea119ed2..f6f956832eebe8 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -314,6 +314,7 @@ def union_categoricals(to_union, sort_categories=False, ignore_order=False): Categories (3, object): [b, c, a] """ from pandas import Index, Categorical, CategoricalIndex, Series + from 
pandas.core.categorical import _recode_for_categories if len(to_union) == 0: raise ValueError('No Categoricals to union') @@ -359,14 +360,8 @@ def _maybe_unwrap(x): new_codes = [] for c in to_union: - if len(c.categories) > 0: - indexer = categories.get_indexer(c.categories) - - from pandas.core.algorithms import take_1d - new_codes.append(take_1d(indexer, c.codes, fill_value=-1)) - else: - # must be all NaN - new_codes.append(c.codes) + new_codes.append(_recode_for_categories(c.codes, c.categories, + categories)) new_codes = np.concatenate(new_codes) else: # ordered - to show a proper error message diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 7bbe220378993b..8a5f6bf110be32 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -26,6 +26,7 @@ Interval, IntervalIndex) from pandas.compat import range, lrange, u, PY3, PYPY from pandas.core.config import option_context +from pandas.core.categorical import _recode_for_categories class TestCategorical(object): @@ -963,6 +964,67 @@ def test_rename_categories(self): with pytest.raises(ValueError): cat.rename_categories([1, 2]) + @pytest.mark.parametrize('codes, old, new, expected', [ + ([0, 1], ['a', 'b'], ['a', 'b'], [0, 1]), + ([0, 1], ['b', 'a'], ['b', 'a'], [0, 1]), + ([0, 1], ['a', 'b'], ['b', 'a'], [1, 0]), + ([0, 1], ['b', 'a'], ['a', 'b'], [1, 0]), + ([0, 1, 0, 1], ['a', 'b'], ['a', 'b', 'c'], [0, 1, 0, 1]), + ([0, 1, 2, 2], ['a', 'b', 'c'], ['a', 'b'], [0, 1, -1, -1]), + ([0, 1, -1], ['a', 'b', 'c'], ['a', 'b', 'c'], [0, 1, -1]), + ([0, 1, -1], ['a', 'b', 'c'], ['b'], [-1, 0, -1]), + ([0, 1, -1], ['a', 'b', 'c'], ['d'], [-1, -1, -1]), + ([0, 1, -1], ['a', 'b', 'c'], [], [-1, -1, -1]), + ([-1, -1], [], ['a', 'b'], [-1, -1]), + ([1, 0], ['b', 'a'], ['a', 'b'], [0, 1]), + ]) + def test_recode_to_categories(self, codes, old, new, expected): + codes = np.asanyarray(codes, dtype=np.int8) + expected = np.asanyarray(expected, dtype=np.int8) + old = Index(old) + new = Index(new) + result = _recode_for_categories(codes, old, new) + tm.assert_numpy_array_equal(result, expected) + + def test_recode_to_categories_large(self): + N = 1000 + codes = np.arange(N) + old = Index(codes) + expected = np.arange(N - 1, -1, -1, dtype=np.int16) + new = Index(expected) + result = _recode_for_categories(codes, old, new) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize('values, categories, new_categories', [ + # No NaNs, same cats, same order + (['a', 'b', 'a'], ['a', 'b'], ['a', 'b'],), + # No NaNs, same cats, different order + (['a', 'b', 'a'], ['a', 'b'], ['b', 'a'],), + # Same, unsorted + (['b', 'a', 'a'], ['a', 'b'], ['a', 'b'],), + # No NaNs, same cats, different order + (['b', 'a', 'a'], ['a', 'b'], ['b', 'a'],), + # NaNs + (['a', 'b', 'c'], ['a', 'b'], ['a', 'b']), + (['a', 'b', 'c'], ['a', 'b'], ['b', 'a']), + (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']), + (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']), + # Introduce NaNs + (['a', 'b', 'c'], ['a', 'b'], ['a']), + (['a', 'b', 'c'], ['a', 'b'], ['b']), + (['b', 'a', 'c'], ['a', 'b'], ['a']), + (['b', 'a', 'c'], ['a', 'b'], ['a']), + # No overlap + (['a', 'b', 'c'], ['a', 'b'], ['d', 'e']), + ]) + @pytest.mark.parametrize('ordered', [True, False]) + def test_set_categories_many(self, values, categories, new_categories, + ordered): + c = Categorical(values, categories) + expected = Categorical(values, new_categories, ordered) + result = c.set_categories(new_categories, ordered=ordered) + 
tm.assert_categorical_equal(result, expected) + def test_reorder_categories(self): cat = Categorical(["a", "b", "c", "a"], ordered=True) old = cat.copy() From 06a6e63c317e5291eb78081e2a21bc163ddaab6e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 14 Sep 2017 15:48:59 -0700 Subject: [PATCH 085/188] remove period_helper from non-period reqs (#17531) --- setup.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 434ca644739165..664478cc35845f 100755 --- a/setup.py +++ b/setup.py @@ -461,7 +461,6 @@ def pxd(name): tseries_depends = ['pandas/_libs/src/datetime/np_datetime.h', 'pandas/_libs/src/datetime/np_datetime_strings.h', - 'pandas/_libs/src/period_helper.h', 'pandas/_libs/src/datetime.pxd'] @@ -478,11 +477,11 @@ def pxd(name): 'pxdfiles': ['_libs/src/util'], 'depends': tseries_depends, 'sources': ['pandas/_libs/src/datetime/np_datetime.c', - 'pandas/_libs/src/datetime/np_datetime_strings.c', - 'pandas/_libs/src/period_helper.c']}, + 'pandas/_libs/src/datetime/np_datetime_strings.c']}, '_libs.tslibs.timezones': {'pyxfile': '_libs/tslibs/timezones'}, '_libs.period': {'pyxfile': '_libs/period', - 'depends': tseries_depends, + 'depends': (tseries_depends + + ['pandas/_libs/src/period_helper.h']), 'sources': ['pandas/_libs/src/datetime/np_datetime.c', 'pandas/_libs/src/datetime/np_datetime_strings.c', 'pandas/_libs/src/period_helper.c']}, From ad70ed4ba921360169820dabd16e4475c527479f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 14 Sep 2017 15:52:53 -0700 Subject: [PATCH 086/188] Fix bug where offset.copy() != offset (#17452) --- pandas/tests/tseries/test_offsets.py | 5 + pandas/tseries/offsets.py | 180 ++++++++++++++++----------- 2 files changed, 115 insertions(+), 70 deletions(-) diff --git a/pandas/tests/tseries/test_offsets.py b/pandas/tests/tseries/test_offsets.py index 7e6e85f322fe0f..cd2c29ffe3ac6b 100644 --- a/pandas/tests/tseries/test_offsets.py +++ b/pandas/tests/tseries/test_offsets.py @@ -1955,6 +1955,11 @@ def _check_roundtrip(obj): _check_roundtrip(self._object(2)) _check_roundtrip(self._object() * 2) + def test_copy(self): + # GH 17452 + off = self._object(weekmask='Mon Wed Fri') + assert off == off.copy() + class TestCustomBusinessMonthEnd(CustomBusinessMonthBase, Base): _object = CBMonthEnd diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 7ccecaa84e6d6d..d82a3a209af6bf 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -11,6 +11,7 @@ from dateutil.relativedelta import relativedelta, weekday from dateutil.easter import easter from pandas._libs import tslib, Timestamp, OutOfBoundsDatetime, Timedelta +from pandas.util._decorators import cache_readonly import functools import operator @@ -573,9 +574,9 @@ def __setstate__(self, state): """Reconstruct an instance from a pickled state""" self.__dict__ = state if 'weekmask' in state and 'holidays' in state: - calendar, holidays = self.get_calendar(weekmask=self.weekmask, - holidays=self.holidays, - calendar=None) + calendar, holidays = _get_calendar(weekmask=self.weekmask, + holidays=self.holidays, + calendar=None) self.kwds['calendar'] = self.calendar = calendar self.kwds['holidays'] = self.holidays = holidays self.kwds['weekmask'] = state['weekmask'] @@ -978,9 +979,9 @@ def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', self.normalize = normalize self.kwds = kwds self.offset = kwds.get('offset', timedelta(0)) - calendar, holidays = self.get_calendar(weekmask=weekmask, - holidays=holidays, - 
calendar=calendar) + calendar, holidays = _get_calendar(weekmask=weekmask, + holidays=holidays, + calendar=calendar) # CustomBusinessDay instances are identified by the # following two attributes. See DateOffset._params() # holidays, weekmask @@ -989,36 +990,6 @@ def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', self.kwds['holidays'] = self.holidays = holidays self.kwds['calendar'] = self.calendar = calendar - def get_calendar(self, weekmask, holidays, calendar): - """Generate busdaycalendar""" - if isinstance(calendar, np.busdaycalendar): - if not holidays: - holidays = tuple(calendar.holidays) - elif not isinstance(holidays, tuple): - holidays = tuple(holidays) - else: - # trust that calendar.holidays and holidays are - # consistent - pass - return calendar, holidays - - if holidays is None: - holidays = [] - try: - holidays = holidays + calendar.holidays().tolist() - except AttributeError: - pass - holidays = [self._to_dt64(dt, dtype='datetime64[D]') for dt in - holidays] - holidays = tuple(sorted(holidays)) - - kwargs = {'weekmask': weekmask} - if holidays: - kwargs['holidays'] = holidays - - busdaycalendar = np.busdaycalendar(**kwargs) - return busdaycalendar, holidays - @apply_wraps def apply(self, other): if self.n <= 0: @@ -1050,25 +1021,10 @@ def apply(self, other): def apply_index(self, i): raise NotImplementedError - @staticmethod - def _to_dt64(dt, dtype='datetime64'): - # Currently - # > np.datetime64(dt.datetime(2013,5,1),dtype='datetime64[D]') - # numpy.datetime64('2013-05-01T02:00:00.000000+0200') - # Thus astype is needed to cast datetime to datetime64[D] - if getattr(dt, 'tzinfo', None) is not None: - i8 = tslib.pydt_to_i8(dt) - dt = tslib.tz_convert_single(i8, 'UTC', dt.tzinfo) - dt = Timestamp(dt) - dt = np.datetime64(dt) - if dt.dtype.name != dtype: - dt = dt.astype(dtype) - return dt - def onOffset(self, dt): if self.normalize and not _is_normalized(dt): return False - day64 = self._to_dt64(dt, 'datetime64[D]') + day64 = _to_dt64(dt, 'datetime64[D]') return np.is_busday(day64, busdaycal=self.calendar) @@ -1087,19 +1043,25 @@ def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', self.n = int(n) self.normalize = normalize super(CustomBusinessHour, self).__init__(**kwds) + + calendar, holidays = _get_calendar(weekmask=weekmask, + holidays=holidays, + calendar=calendar) + self.kwds['weekmask'] = self.weekmask = weekmask + self.kwds['holidays'] = self.holidays = holidays + self.kwds['calendar'] = self.calendar = calendar + + @cache_readonly + def next_bday(self): # used for moving to next businessday if self.n >= 0: nb_offset = 1 else: nb_offset = -1 - self.next_bday = CustomBusinessDay(n=nb_offset, - weekmask=weekmask, - holidays=holidays, - calendar=calendar) - - self.kwds['weekmask'] = self.next_bday.weekmask - self.kwds['holidays'] = self.next_bday.holidays - self.kwds['calendar'] = self.next_bday.calendar + return CustomBusinessDay(n=nb_offset, + weekmask=self.weekmask, + holidays=self.holidays, + calendar=self.calendar) class MonthOffset(SingleConstructorOffset): @@ -1471,11 +1433,25 @@ def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', self.normalize = normalize self.kwds = kwds self.offset = kwds.get('offset', timedelta(0)) - self.cbday = CustomBusinessDay(n=self.n, normalize=normalize, - weekmask=weekmask, holidays=holidays, - calendar=calendar, **kwds) - self.m_offset = MonthEnd(n=1, normalize=normalize, **kwds) - self.kwds['calendar'] = self.cbday.calendar # cache numpy calendar + + calendar, 
holidays = _get_calendar(weekmask=weekmask, + holidays=holidays, + calendar=calendar) + self.kwds['weekmask'] = self.weekmask = weekmask + self.kwds['holidays'] = self.holidays = holidays + self.kwds['calendar'] = self.calendar = calendar + + @cache_readonly + def cbday(self): + kwds = self.kwds + return CustomBusinessDay(n=self.n, normalize=self.normalize, **kwds) + + @cache_readonly + def m_offset(self): + kwds = self.kwds + kwds = {key: kwds[key] for key in kwds + if key not in ['calendar', 'weekmask', 'holidays']} + return MonthEnd(n=1, normalize=self.normalize, **kwds) @apply_wraps def apply(self, other): @@ -1531,11 +1507,27 @@ def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', self.normalize = normalize self.kwds = kwds self.offset = kwds.get('offset', timedelta(0)) - self.cbday = CustomBusinessDay(n=self.n, normalize=normalize, - weekmask=weekmask, holidays=holidays, - calendar=calendar, **kwds) - self.m_offset = MonthBegin(n=1, normalize=normalize, **kwds) - self.kwds['calendar'] = self.cbday.calendar # cache numpy calendar + + # _get_calendar does validation and possible transformation + # of calendar and holidays. + calendar, holidays = _get_calendar(weekmask=weekmask, + holidays=holidays, + calendar=calendar) + kwds['calendar'] = self.calendar = calendar + kwds['weekmask'] = self.weekmask = weekmask + kwds['holidays'] = self.holidays = holidays + + @cache_readonly + def cbday(self): + kwds = self.kwds + return CustomBusinessDay(n=self.n, normalize=self.normalize, **kwds) + + @cache_readonly + def m_offset(self): + kwds = self.kwds + kwds = {key: kwds[key] for key in kwds + if key not in ['calendar', 'weekmask', 'holidays']} + return MonthBegin(n=1, normalize=self.normalize, **kwds) @apply_wraps def apply(self, other): @@ -2861,6 +2853,54 @@ class Nano(Tick): CBMonthBegin = CustomBusinessMonthBegin CDay = CustomBusinessDay +# --------------------------------------------------------------------- +# Business Calendar helpers + + +def _get_calendar(weekmask, holidays, calendar): + """Generate busdaycalendar""" + if isinstance(calendar, np.busdaycalendar): + if not holidays: + holidays = tuple(calendar.holidays) + elif not isinstance(holidays, tuple): + holidays = tuple(holidays) + else: + # trust that calendar.holidays and holidays are + # consistent + pass + return calendar, holidays + + if holidays is None: + holidays = [] + try: + holidays = holidays + calendar.holidays().tolist() + except AttributeError: + pass + holidays = [_to_dt64(dt, dtype='datetime64[D]') for dt in holidays] + holidays = tuple(sorted(holidays)) + + kwargs = {'weekmask': weekmask} + if holidays: + kwargs['holidays'] = holidays + + busdaycalendar = np.busdaycalendar(**kwargs) + return busdaycalendar, holidays + + +def _to_dt64(dt, dtype='datetime64'): + # Currently + # > np.datetime64(dt.datetime(2013,5,1),dtype='datetime64[D]') + # numpy.datetime64('2013-05-01T02:00:00.000000+0200') + # Thus astype is needed to cast datetime to datetime64[D] + if getattr(dt, 'tzinfo', None) is not None: + i8 = tslib.pydt_to_i8(dt) + dt = tslib.tz_convert_single(i8, 'UTC', dt.tzinfo) + dt = Timestamp(dt) + dt = np.datetime64(dt) + if dt.dtype.name != dtype: + dt = dt.astype(dtype) + return dt + def _get_firstbday(wkday): """ From 94266d48e5f54287a877cf7a0e94ef740e3eda22 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 14 Sep 2017 18:29:39 -0500 Subject: [PATCH 087/188] PERF: Faster CategoricalIndex from categorical (#17513) --- doc/source/whatsnew/v0.21.0.txt | 1 + 
pandas/core/indexes/category.py | 4 ++++ pandas/tests/indexes/test_category.py | 10 ++++++++++ 3 files changed, 15 insertions(+) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 6495ad3e7f6adb..52e056103cbdc3 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -469,6 +469,7 @@ Performance Improvements - :attr:`Series.dt` no longer performs frequency inference, yielding a large speedup when accessing the attribute (:issue:`17210`) - Improved performance of :meth:`Categorical.set_categories` by not materializing the values (:issue:`17508`) - :attr:`Timestamp.microsecond` no longer re-computes on attribute access (:issue:`17331`) +- Improved performance of the :class:`CategoricalIndex` for data that is already categorical dtype (:issue:`17513`) .. _whatsnew_0210.bug_fixes: diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 71cd4790ac3648..ef1dc4d971f37f 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -130,6 +130,10 @@ def _create_categorical(self, data, categories=None, ordered=None): ------- Categorical """ + if (isinstance(data, (ABCSeries, type(self))) and + is_categorical_dtype(data)): + data = data.values + if not isinstance(data, ABCCategorical): ordered = False if ordered is None else ordered from pandas.core.categorical import Categorical diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index aac68ebd6abede..cf365465763fab 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -125,6 +125,16 @@ def test_construction_with_dtype(self): result = CategoricalIndex(idx, categories=idx, ordered=True) tm.assert_index_equal(result, expected, exact=True) + def test_create_categorical(self): + # https://github.com/pandas-dev/pandas/pull/17513 + # The public CI constructor doesn't hit this code path with + # instances of CategoricalIndex, but we still want to test the code + ci = CategoricalIndex(['a', 'b', 'c']) + # First ci is self, second ci is data. 
+ result = CategoricalIndex._create_categorical(ci, ci) + expected = Categorical(['a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + def test_disallow_set_ops(self): # GH 10039 From 9b21c5456eb4b2cdbc7f74569c4b8660ada951fe Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 14 Sep 2017 18:33:03 -0700 Subject: [PATCH 088/188] Remove unnecessary iNaT checks from _Period properties (#17421) --- asv_bench/benchmarks/period.py | 59 +++++++++++++++ pandas/_libs/period.pyx | 127 ++++++++++++++++++++------------- 2 files changed, 135 insertions(+), 51 deletions(-) diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index 78d66295f28cc6..df3c2bf3e4b464 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -78,6 +78,65 @@ def time_value_counts_pindex(self): self.i.value_counts() +class Properties(object): + def setup(self): + self.per = Period('2017-09-06 08:28', freq='min') + + def time_year(self): + self.per.year + + def time_month(self): + self.per.month + + def time_day(self): + self.per.day + + def time_hour(self): + self.per.hour + + def time_minute(self): + self.per.minute + + def time_second(self): + self.per.second + + def time_is_leap_year(self): + self.per.is_leap_year + + def time_quarter(self): + self.per.quarter + + def time_qyear(self): + self.per.qyear + + def time_week(self): + self.per.week + + def time_daysinmonth(self): + self.per.daysinmonth + + def time_dayofweek(self): + self.per.dayofweek + + def time_dayofyear(self): + self.per.dayofyear + + def time_start_time(self): + self.per.start_time + + def time_end_time(self): + self.per.end_time + + def time_to_timestamp(): + self.per.to_timestamp() + + def time_now(): + self.per.now() + + def time_asfreq(): + self.per.asfreq('A') + + class period_standard_indexing(object): goal_time = 0.2 diff --git a/pandas/_libs/period.pyx b/pandas/_libs/period.pyx index 9e473a7f362b44..babe0f7c6834d9 100644 --- a/pandas/_libs/period.pyx +++ b/pandas/_libs/period.pyx @@ -107,6 +107,8 @@ cdef extern from "period_helper.h": int pday(int64_t ordinal, int freq) except INT32_MIN int pweekday(int64_t ordinal, int freq) except INT32_MIN int pday_of_week(int64_t ordinal, int freq) except INT32_MIN + # TODO: pday_of_week and pweekday are identical. Make one an alias instead + # of importing them separately. 
int pday_of_year(int64_t ordinal, int freq) except INT32_MIN int pweek(int64_t ordinal, int freq) except INT32_MIN int phour(int64_t ordinal, int freq) except INT32_MIN @@ -868,58 +870,81 @@ cdef class _Period(object): dt64 = period_ordinal_to_dt64(val.ordinal, base) return Timestamp(dt64, tz=tz) - cdef _field(self, alias): + @property + def year(self): + base, mult = get_freq_code(self.freq) + return pyear(self.ordinal, base) + + @property + def month(self): + base, mult = get_freq_code(self.freq) + return pmonth(self.ordinal, base) + + @property + def day(self): + base, mult = get_freq_code(self.freq) + return pday(self.ordinal, base) + + @property + def hour(self): + base, mult = get_freq_code(self.freq) + return phour(self.ordinal, base) + + @property + def minute(self): + base, mult = get_freq_code(self.freq) + return pminute(self.ordinal, base) + + @property + def second(self): + base, mult = get_freq_code(self.freq) + return psecond(self.ordinal, base) + + @property + def weekofyear(self): + base, mult = get_freq_code(self.freq) + return pweek(self.ordinal, base) + + @property + def week(self): + return self.weekofyear + + @property + def dayofweek(self): + base, mult = get_freq_code(self.freq) + return pweekday(self.ordinal, base) + + @property + def weekday(self): + return self.dayofweek + + @property + def dayofyear(self): + base, mult = get_freq_code(self.freq) + return pday_of_year(self.ordinal, base) + + @property + def quarter(self): base, mult = get_freq_code(self.freq) - return get_period_field(alias, self.ordinal, base) - - property year: - def __get__(self): - return self._field(0) - property month: - def __get__(self): - return self._field(3) - property day: - def __get__(self): - return self._field(4) - property hour: - def __get__(self): - return self._field(5) - property minute: - def __get__(self): - return self._field(6) - property second: - def __get__(self): - return self._field(7) - property weekofyear: - def __get__(self): - return self._field(8) - property week: - def __get__(self): - return self.weekofyear - property dayofweek: - def __get__(self): - return self._field(10) - property weekday: - def __get__(self): - return self.dayofweek - property dayofyear: - def __get__(self): - return self._field(9) - property quarter: - def __get__(self): - return self._field(2) - property qyear: - def __get__(self): - return self._field(1) - property days_in_month: - def __get__(self): - return self._field(11) - property daysinmonth: - def __get__(self): - return self.days_in_month - property is_leap_year: - def __get__(self): - return bool(is_leapyear(self._field(0))) + return pquarter(self.ordinal, base) + + @property + def qyear(self): + base, mult = get_freq_code(self.freq) + return pqyear(self.ordinal, base) + + @property + def days_in_month(self): + base, mult = get_freq_code(self.freq) + return pdays_in_month(self.ordinal, base) + + @property + def daysinmonth(self): + return self.days_in_month + + @property + def is_leap_year(self): + return bool(is_leapyear(self.year)) @classmethod def now(cls, freq=None): From 72c38883f09c6902863345de432d3c90a29140b3 Mon Sep 17 00:00:00 2001 From: jschendel Date: Fri, 15 Sep 2017 02:18:24 -0600 Subject: [PATCH 089/188] CLN: Fix Spelling Errors (#17535) --- doc/source/advanced.rst | 10 +++++----- doc/source/api.rst | 2 +- doc/source/basics.rst | 2 +- doc/source/computation.rst | 2 +- doc/source/groupby.rst | 4 ++-- doc/source/indexing.rst | 2 +- doc/source/io.rst | 2 +- doc/source/merging.rst | 6 +++--- 
doc/source/missing_data.rst | 2 +- doc/source/options.rst | 4 ++-- doc/source/reshaping.rst | 2 +- doc/source/sparse.rst | 2 +- doc/source/style.ipynb | 2 +- doc/source/timeseries.rst | 18 +++++++++--------- doc/source/visualization.rst | 2 +- pandas/core/algorithms.py | 2 +- pandas/core/indexes/interval.py | 2 +- pandas/core/reshape/concat.py | 2 +- pandas/core/reshape/merge.py | 6 +++--- pandas/core/reshape/tile.py | 2 +- pandas/io/formats/excel.py | 4 ++-- pandas/io/pytables.py | 12 ++++++------ pandas/io/stata.py | 4 ++-- pandas/plotting/_misc.py | 2 +- pandas/plotting/_tools.py | 2 +- pandas/tests/frame/test_convert_to.py | 4 ++-- pandas/tests/groupby/test_transform.py | 2 +- pandas/tests/indexes/datetimes/test_tools.py | 2 +- pandas/tests/io/json/test_json_table_schema.py | 2 +- pandas/tests/io/parser/test_read_fwf.py | 2 +- pandas/tests/io/test_pytables.py | 8 ++++---- pandas/tests/plotting/test_datetimelike.py | 2 +- pandas/tests/series/test_dtypes.py | 2 +- pandas/tests/test_categorical.py | 2 +- pandas/tests/test_sorting.py | 2 +- pandas/tseries/util.py | 2 +- 36 files changed, 65 insertions(+), 65 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 3f145cf9556645..3bda8c7eacb61b 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -625,7 +625,7 @@ Index Types We have discussed ``MultiIndex`` in the previous sections pretty extensively. ``DatetimeIndex`` and ``PeriodIndex`` are shown :ref:`here `. ``TimedeltaIndex`` are :ref:`here `. -In the following sub-sections we will highlite some other index types. +In the following sub-sections we will highlight some other index types. .. _indexing.categoricalindex: @@ -645,7 +645,7 @@ and allows efficient indexing and storage of an index with a large number of dup df.dtypes df.B.cat.categories -Setting the index, will create create a ``CategoricalIndex`` +Setting the index, will create a ``CategoricalIndex`` .. ipython:: python @@ -681,7 +681,7 @@ Groupby operations on the index will preserve the index nature as well Reindexing operations, will return a resulting index based on the type of the passed indexer, meaning that passing a list will return a plain-old-``Index``; indexing with a ``Categorical`` will return a ``CategoricalIndex``, indexed according to the categories -of the PASSED ``Categorical`` dtype. This allows one to arbitrarly index these even with +of the PASSED ``Categorical`` dtype. This allows one to arbitrarily index these even with values NOT in the categories, similarly to how you can reindex ANY pandas index. .. ipython :: python @@ -722,7 +722,7 @@ Int64Index and RangeIndex Prior to 0.18.0, the ``Int64Index`` would provide the default index for all ``NDFrame`` objects. ``RangeIndex`` is a sub-class of ``Int64Index`` added in version 0.18.0, now providing the default index for all ``NDFrame`` objects. -``RangeIndex`` is an optimized version of ``Int64Index`` that can represent a monotonic ordered set. These are analagous to python `range types `__. +``RangeIndex`` is an optimized version of ``Int64Index`` that can represent a monotonic ordered set. These are analogous to python `range types `__. .. _indexing.float64index: @@ -963,7 +963,7 @@ index can be somewhat complicated. For example, the following does not work: s.loc['c':'e'+1] A very common use case is to limit a time series to start and end at two -specific dates. To enable this, we made the design design to make label-based +specific dates. 
To enable this, we made the design to make label-based slicing include both endpoints: .. ipython:: python diff --git a/doc/source/api.rst b/doc/source/api.rst index 1541bbccefe214..4e02f7b11f466c 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1291,7 +1291,7 @@ Index ----- **Many of these methods or variants thereof are available on the objects -that contain an index (Series/Dataframe) and those should most likely be +that contain an index (Series/DataFrame) and those should most likely be used before calling these methods directly.** .. autosummary:: diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 42c28df3a6030f..0990d2bd15ee6f 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -923,7 +923,7 @@ Passing a named function will yield that name for the row: Aggregating with a dict +++++++++++++++++++++++ -Passing a dictionary of column names to a scalar or a list of scalars, to ``DataFame.agg`` +Passing a dictionary of column names to a scalar or a list of scalars, to ``DataFrame.agg`` allows you to customize which functions are applied to which columns. Note that the results are not in any particular order, you can use an ``OrderedDict`` instead to guarantee ordering. diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 23699393958cfe..14cfdbc3648375 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -654,7 +654,7 @@ aggregation with, outputting a DataFrame: r['A'].agg([np.sum, np.mean, np.std]) -On a widowed DataFrame, you can pass a list of functions to apply to each +On a windowed DataFrame, you can pass a list of functions to apply to each column, which produces an aggregated result with a hierarchical index: .. ipython:: python diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index e1231b9a4a2007..e9a7d8dd0a46ea 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -561,7 +561,7 @@ must be either implemented on GroupBy or available via :ref:`dispatching .. note:: - If you pass a dict to ``aggregate``, the ordering of the output colums is + If you pass a dict to ``aggregate``, the ordering of the output columns is non-deterministic. If you want to be sure the output columns will be in a specific order, you can use an ``OrderedDict``. Compare the output of the following two commands: @@ -1211,7 +1211,7 @@ Groupby by Indexer to 'resample' data Resampling produces new hypothetical samples (resamples) from already existing observed data or from a model that generates data. These new samples are similar to the pre-existing samples. -In order to resample to work on indices that are non-datetimelike , the following procedure can be utilized. +In order to resample to work on indices that are non-datetimelike, the following procedure can be utilized. In the following examples, **df.index // 5** returns a binary array which is used to determine what gets selected for the groupby operation. diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 8474116c380825..edbc4e6d7fd225 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -714,7 +714,7 @@ Finally, one can also set a seed for ``sample``'s random number generator using Setting With Enlargement ------------------------ -The ``.loc/[]`` operations can perform enlargement when setting a non-existant key for that axis. +The ``.loc/[]`` operations can perform enlargement when setting a non-existent key for that axis. 
In the ``Series`` case this is effectively an appending operation diff --git a/doc/source/io.rst b/doc/source/io.rst index 8fbb23769492e4..fcf7f6029197bd 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -3077,7 +3077,7 @@ Compressed pickle files .. versionadded:: 0.20.0 -:func:`read_pickle`, :meth:`DataFame.to_pickle` and :meth:`Series.to_pickle` can read +:func:`read_pickle`, :meth:`DataFrame.to_pickle` and :meth:`Series.to_pickle` can read and write compressed pickle files. The compression types of ``gzip``, ``bz2``, ``xz`` are supported for reading and writing. `zip`` file supports read only and must contain only one data file to be read in. diff --git a/doc/source/merging.rst b/doc/source/merging.rst index a5ee1b1a9384cc..72787ea97a7824 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -1329,7 +1329,7 @@ By default we are taking the asof of the quotes. on='time', by='ticker') -We only asof within ``2ms`` betwen the quote time and the trade time. +We only asof within ``2ms`` between the quote time and the trade time. .. ipython:: python @@ -1338,8 +1338,8 @@ We only asof within ``2ms`` betwen the quote time and the trade time. by='ticker', tolerance=pd.Timedelta('2ms')) -We only asof within ``10ms`` betwen the quote time and the trade time and we exclude exact matches on time. -Note that though we exclude the exact matches (of the quotes), prior quotes DO propogate to that point +We only asof within ``10ms`` between the quote time and the trade time and we exclude exact matches on time. +Note that though we exclude the exact matches (of the quotes), prior quotes DO propagate to that point in time. .. ipython:: python diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index 65b411ccd4af26..b33b5c304853ae 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -320,7 +320,7 @@ Interpolation The ``limit_direction`` keyword argument was added. -Both Series and Dataframe objects have an ``interpolate`` method that, by default, +Both Series and DataFrame objects have an ``interpolate`` method that, by default, performs linear interpolation at missing datapoints. .. ipython:: python diff --git a/doc/source/options.rst b/doc/source/options.rst index 1592caf90546c7..f042e4d3f51204 100644 --- a/doc/source/options.rst +++ b/doc/source/options.rst @@ -313,9 +313,9 @@ display.large_repr truncate For DataFrames exceeding max_ro display.latex.repr False Whether to produce a latex DataFrame representation for jupyter frontends that support it. -display.latex.escape True Escapes special caracters in Dataframes, when +display.latex.escape True Escapes special characters in DataFrames, when using the to_latex method. -display.latex.longtable False Specifies if the to_latex method of a Dataframe +display.latex.longtable False Specifies if the to_latex method of a DataFrame uses the longtable format. display.latex.multicolumn True Combines columns when using a MultiIndex display.latex.multicolumn_format 'l' Alignment of multicolumn labels diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index fab83222b313f1..1209c4a8d6be80 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -156,7 +156,7 @@ the level numbers: stacked.unstack('second') Notice that the ``stack`` and ``unstack`` methods implicitly sort the index -levels involved. Hence a call to ``stack`` and then ``unstack``, or viceversa, +levels involved. 
Hence a call to ``stack`` and then ``unstack``, or vice versa, will result in a **sorted** copy of the original DataFrame or Series: .. ipython:: python diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst index cf16cee501a3e5..89efa7b4be3eee 100644 --- a/doc/source/sparse.rst +++ b/doc/source/sparse.rst @@ -132,7 +132,7 @@ dtype, ``fill_value`` default changes: s.to_sparse() You can change the dtype using ``.astype()``, the result is also sparse. Note that -``.astype()`` also affects to the ``fill_value`` to keep its dense represantation. +``.astype()`` also affects to the ``fill_value`` to keep its dense representation. .. ipython:: python diff --git a/doc/source/style.ipynb b/doc/source/style.ipynb index c250787785e14e..1d6ce163cf977b 100644 --- a/doc/source/style.ipynb +++ b/doc/source/style.ipynb @@ -169,7 +169,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Notice the similarity with the standard `df.applymap`, which operates on DataFrames elementwise. We want you to be able to resuse your existing knowledge of how to interact with DataFrames.\n", + "Notice the similarity with the standard `df.applymap`, which operates on DataFrames elementwise. We want you to be able to reuse your existing knowledge of how to interact with DataFrames.\n", "\n", "Notice also that our function returned a string containing the CSS attribute and value, separated by a colon just like in a `""") + # We use the "scoped" attribute here so that the desired + # style properties for the data frame are not then applied + # throughout the entire notebook. + template_first = """\ + """ + template_select = """\ + .dataframe %s { + %s: %s; + }""" + element_props = [('tbody tr th:only-of-type', + 'vertical-align', + 'middle'), + ('tbody tr th', + 'vertical-align', + 'top')] + if isinstance(self.columns, MultiIndex): + element_props.append(('thead tr th', + 'text-align', + 'left')) + if all((self.fmt.has_index_names, + self.fmt.index, + self.fmt.show_index_names)): + element_props.append(('thead tr:last-of-type th', + 'text-align', + 'right')) + else: + element_props.append(('thead th', + 'text-align', + 'right')) + template_mid = '\n\n'.join(map(lambda t: template_select % t, + element_props)) + template = dedent('\n'.join((template_first, + template_mid, + template_last))) if self.notebook: self.write(template) diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 1e174c34221d55..194b5ba3e02765 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -1868,12 +1868,16 @@ def test_to_html_no_index_max_rows(self): def test_to_html_notebook_has_style(self): df = pd.DataFrame({"A": [1, 2, 3]}) result = df.to_html(notebook=True) - assert "thead tr:only-child" in result + assert "tbody tr th:only-of-type" in result + assert "vertical-align: middle;" in result + assert "thead th" in result def test_to_html_notebook_has_no_style(self): df = pd.DataFrame({"A": [1, 2, 3]}) result = df.to_html() - assert "thead tr:only-child" not in result + assert "tbody tr th:only-of-type" not in result + assert "vertical-align: middle;" not in result + assert "thead th" not in result def test_to_html_with_index_names_false(self): # gh-16493 From 8276a420a36c26eaab38856177023cb064963f19 Mon Sep 17 00:00:00 2001 From: jschendel Date: Fri, 22 Sep 2017 01:22:19 -0600 Subject: [PATCH 115/188] DOC: Remove experimental warning from custom offsets (#17584) --- pandas/tseries/offsets.py | 23 +++++------------------ 1 file changed, 5 
insertions(+), 18 deletions(-) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 6a518937b11957..452d30322b4cfa 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -951,14 +951,9 @@ def next_bday(self): class CustomBusinessDay(BusinessDay): """ - **EXPERIMENTAL** DateOffset subclass representing possibly n business days + DateOffset subclass representing possibly n custom business days, excluding holidays - .. warning:: EXPERIMENTAL - - This class is not officially supported and the API is likely to change - in future versions. Use this at your own risk. - Parameters ---------- n : int, default 1 @@ -1405,12 +1400,8 @@ def onOffset(self, dt): class CustomBusinessMonthEnd(BusinessMixin, MonthOffset): """ - **EXPERIMENTAL** DateOffset of one custom business month - - .. warning:: EXPERIMENTAL - - This class is not officially supported and the API is likely to change - in future versions. Use this at your own risk. + DateOffset subclass representing one custom business month, incrementing + between end of month dates Parameters ---------- @@ -1479,12 +1470,8 @@ def apply(self, other): class CustomBusinessMonthBegin(BusinessMixin, MonthOffset): """ - **EXPERIMENTAL** DateOffset of one custom business month - - .. warning:: EXPERIMENTAL - - This class is not officially supported and the API is likely to change - in future versions. Use this at your own risk. + DateOffset subclass representing one custom business month, incrementing + between beginning of month dates Parameters ---------- From 9732af248a6bcc6db05413fd671c08a23724dba4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 22 Sep 2017 06:10:31 -0700 Subject: [PATCH 116/188] Separate properties module (#17590) --- pandas/_libs/lib.pyx | 1 - pandas/_libs/{src => }/properties.pyx | 5 ++++- pandas/core/generic.py | 4 ++-- pandas/util/_decorators.py | 2 +- setup.py | 3 ++- 5 files changed, 9 insertions(+), 6 deletions(-) rename pandas/_libs/{src => }/properties.pyx (95%) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 53ca41e4b24893..01548e17d39abf 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1907,5 +1907,4 @@ cdef class BlockPlacement: include "reduce.pyx" -include "properties.pyx" include "inference.pyx" diff --git a/pandas/_libs/src/properties.pyx b/pandas/_libs/properties.pyx similarity index 95% rename from pandas/_libs/src/properties.pyx rename to pandas/_libs/properties.pyx index 4a3fd4b771a171..22d66356ebdc34 100644 --- a/pandas/_libs/src/properties.pyx +++ b/pandas/_libs/properties.pyx @@ -1,5 +1,8 @@ + +from cython cimport Py_ssize_t + from cpython cimport ( - PyDict_Contains, PyDict_GetItem, PyDict_GetItem, PyDict_SetItem) + PyDict_Contains, PyDict_GetItem, PyDict_SetItem) cdef class cache_readonly(object): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a71bf7be1bc753..e0a9fdb08dcb2c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9,7 +9,7 @@ import numpy as np import pandas as pd -from pandas._libs import tslib, lib +from pandas._libs import tslib, lib, properties from pandas.core.dtypes.common import ( _ensure_int64, _ensure_object, @@ -258,7 +258,7 @@ def _setup_axes(cls, axes, info_axis=None, stat_axis=None, aliases=None, if build_axes: def set_axis(a, i): - setattr(cls, a, lib.AxisProperty(i)) + setattr(cls, a, properties.AxisProperty(i)) cls._internal_names_set.add(a) if axes_are_reversed: diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index bb7ffe45c689b0..31e27817913c5a 100644 
--- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -1,5 +1,5 @@ from pandas.compat import callable, signature -from pandas._libs.lib import cache_readonly # noqa +from pandas._libs.properties import cache_readonly # noqa import types import warnings from textwrap import dedent diff --git a/setup.py b/setup.py index 0e4e22b875e1db..d28c4ba8be5b00 100755 --- a/setup.py +++ b/setup.py @@ -437,7 +437,7 @@ def get_tag(self): cmdclass['build_src'] = DummyBuildSrc cmdclass['build_ext'] = CheckingBuildExt -lib_depends = ['reduce', 'inference', 'properties'] +lib_depends = ['reduce', 'inference'] def srcpath(name=None, suffix='.pyx', subdir='src'): @@ -478,6 +478,7 @@ def pxd(name): ext_data = { '_libs.lib': {'pyxfile': '_libs/lib', 'depends': lib_depends + tseries_depends}, + '_libs.properties': {'pyxfile': '_libs/properties', 'include': []}, '_libs.hashtable': {'pyxfile': '_libs/hashtable', 'pxdfiles': ['_libs/hashtable'], 'depends': (['pandas/_libs/src/klib/khash_python.h'] From 26681db1ce339af641d276bc45fbb48dc329b044 Mon Sep 17 00:00:00 2001 From: jschendel Date: Fri, 22 Sep 2017 07:15:12 -0600 Subject: [PATCH 117/188] PERF: Implement RangeIndex min/max using RangeIndex properties (#17611) --- asv_bench/benchmarks/index_object.py | 20 ++++++++++++++++++++ doc/source/api.rst | 14 ++++++++++++++ doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/indexes/range.py | 18 ++++++++++++++++++ pandas/tests/indexes/test_range.py | 21 ++++++++++++++++++++- 5 files changed, 73 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 3fb53ce9b3c98e..454d9ccdda102f 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -199,3 +199,23 @@ def time_datetime_level_values_full(self): def time_datetime_level_values_sliced(self): self.mi[:10].values + + +class Range(object): + goal_time = 0.2 + + def setup(self): + self.idx_inc = RangeIndex(start=0, stop=10**7, step=3) + self.idx_dec = RangeIndex(start=10**7, stop=-1, step=-3) + + def time_max(self): + self.idx_inc.max() + + def time_max_trivial(self): + self.idx_dec.max() + + def time_min(self): + self.idx_dec.min() + + def time_min_trivial(self): + self.idx_inc.min() diff --git a/doc/source/api.rst b/doc/source/api.rst index 6b3e6bedcb24b3..96c7f68f57aaaa 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1416,6 +1416,20 @@ Selecting Index.slice_indexer Index.slice_locs +.. _api.numericindex: + +Numeric Index +------------- + +.. autosummary:: + :toctree: generated/ + :template: autosummary/class_without_autosummary.rst + + RangeIndex + Int64Index + UInt64Index + Float64Index + .. _api.categoricalindex: CategoricalIndex diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 1cd65bb530f731..bf3a4f28b0a4c2 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -473,6 +473,7 @@ Performance Improvements - Improved performance of :meth:`Categorical.set_categories` by not materializing the values (:issue:`17508`) - :attr:`Timestamp.microsecond` no longer re-computes on attribute access (:issue:`17331`) - Improved performance of the :class:`CategoricalIndex` for data that is already categorical dtype (:issue:`17513`) +- Improved performance of :meth:`RangeIndex.min` and :meth:`RangeIndex.max` by using ``RangeIndex`` properties to perform the computations (:issue:`17607`) .. 
_whatsnew_0210.bug_fixes: diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index b759abaed4e564..16523257c2f77c 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -269,6 +269,24 @@ def copy(self, name=None, deep=False, dtype=None, **kwargs): return RangeIndex(name=name, fastpath=True, **dict(self._get_data_as_items())) + def _minmax(self, meth): + no_steps = len(self) - 1 + if no_steps == -1: + return np.nan + elif ((meth == 'min' and self._step > 0) or + (meth == 'max' and self._step < 0)): + return self._start + + return self._start + self._step * no_steps + + def min(self): + """The minimum value of the RangeIndex""" + return self._minmax('min') + + def max(self): + """The maximum value of the RangeIndex""" + return self._minmax('max') + def argsort(self, *args, **kwargs): """ Returns the indices that would sort the index and its diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index d206c36ee51c95..8dc5a40ced4bfd 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -10,7 +10,7 @@ import numpy as np -from pandas import (notna, Series, Index, Float64Index, +from pandas import (isna, notna, Series, Index, Float64Index, Int64Index, RangeIndex) import pandas.util.testing as tm @@ -994,3 +994,22 @@ def test_append(self): # Append single item rather than list result2 = indices[0].append(indices[1]) tm.assert_index_equal(result2, expected, exact=True) + + @pytest.mark.parametrize('start,stop,step', + [(0, 400, 3), (500, 0, -6), (-10**6, 10**6, 4), + (10**6, -10**6, -4), (0, 10, 20)]) + def test_max_min(self, start, stop, step): + # GH17607 + idx = RangeIndex(start, stop, step) + expected = idx._int64index.max() + result = idx.max() + assert result == expected + + expected = idx._int64index.min() + result = idx.min() + assert result == expected + + # empty + idx = RangeIndex(start, stop, -step) + assert isna(idx.max()) + assert isna(idx.min()) From 49cfdd7a0d72e732d07dbf4d4b96c6801cdb6719 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 22 Sep 2017 06:22:54 -0700 Subject: [PATCH 118/188] Simplify to_pydatetime() (#17592) --- asv_bench/benchmarks/timestamp.py | 6 ++++++ pandas/_libs/tslib.pyx | 13 ++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/asv_bench/benchmarks/timestamp.py b/asv_bench/benchmarks/timestamp.py index e4f3023037580d..e8cb4c9d1c75bd 100644 --- a/asv_bench/benchmarks/timestamp.py +++ b/asv_bench/benchmarks/timestamp.py @@ -81,3 +81,9 @@ def time_replace_across_dst(self): def time_replace_None(self): self.ts_tz.replace(tzinfo=None) + + def time_to_pydatetime(self): + self.ts.to_pydatetime() + + def time_to_pydatetime_tz(self): + self.ts_tz.to_pydatetime() diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 8238552b44e031..6ba37062ac8691 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -1158,18 +1158,13 @@ cdef class _Timestamp(datetime): If warn=True, issue a warning if nanoseconds is nonzero. 
""" - cdef: - pandas_datetimestruct dts - _TSObject ts - if self.nanosecond != 0 and warn: warnings.warn("Discarding nonzero nanoseconds in conversion", UserWarning, stacklevel=2) - ts = convert_to_tsobject(self, self.tzinfo, None, 0, 0) - dts = ts.dts - return datetime(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, - dts.us, ts.tzinfo) + + return datetime(self.year, self.month, self.day, + self.hour, self.minute, self.second, + self.microsecond, self.tzinfo) cpdef to_datetime64(self): """ Returns a numpy.datetime64 object with 'ns' precision """ From 2352fd6f88a0cc96488849d288b93ea8f46d1f7b Mon Sep 17 00:00:00 2001 From: Guilherme Beltramini Date: Fri, 22 Sep 2017 09:30:26 -0400 Subject: [PATCH 119/188] ERR: Raise ImportError when xlrd is not present Related issues: #8515, #14673 Author: Guilherme Beltramini Closes #17613 from gcbeltramini/xlrd-import and squashes the following commits: dee1998 [Guilherme Beltramini] Add PR number and blank line c2759cb [Guilherme Beltramini] Throw ImportError --- doc/source/whatsnew/v0.21.0.txt | 3 ++- pandas/io/excel.py | 15 ++++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index bf3a4f28b0a4c2..885babfdd1d19c 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -116,6 +116,7 @@ Other Enhancements - :func:`Styler.where` has been implemented. It is as a convenience for :func:`Styler.applymap` and enables simple DataFrame styling on the Jupyter notebook (:issue:`17474`). - :func:`MultiIndex.is_monotonic_decreasing` has been implemented. Previously returned ``False`` in all cases. (:issue:`16554`) - :func:`Categorical.rename_categories` now accepts a dict-like argument as `new_categories` and only updates the categories found in that dict. (:issue:`17336`) +- :func:`read_excel` raises ``ImportError`` with a better message if ``xlrd`` is not installed. (:issue:`17613`) .. 
_whatsnew_0210.api_breaking: @@ -523,7 +524,7 @@ I/O - Bug in :func:`read_stata` where the index was not set (:issue:`16342`) - Bug in :func:`read_html` where import check fails when run in multiple threads (:issue:`16928`) - Bug in :func:`read_csv` where automatic delimiter detection caused a ``TypeError`` to be thrown when a bad line was encountered rather than the correct error message (:issue:`13374`) -- Bug in ``DataFrame.to_html()`` with ``notebook=True`` where DataFrames with named indices or non-MultiIndex indices had undesired horizontal or vertical alignment for column or row labels, respectively (:issue:`16792`) +- Bug in ``DataFrame.to_html()`` with ``notebook=True`` where DataFrames with named indices or non-MultiIndex indices had undesired horizontal or vertical alignment for column or row labels, respectively (:issue:`16792`) Plotting ^^^^^^^^ diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 5db4603c37be0a..faafdba435ff21 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -239,12 +239,17 @@ class ExcelFile(object): def __init__(self, io, **kwds): - import xlrd # throw an ImportError if we need to + err_msg = "Install xlrd >= 0.9.0 for Excel support" - ver = tuple(map(int, xlrd.__VERSION__.split(".")[:2])) - if ver < (0, 9): # pragma: no cover - raise ImportError("pandas requires xlrd >= 0.9.0 for excel " - "support, current version " + xlrd.__VERSION__) + try: + import xlrd + except ImportError: + raise ImportError(err_msg) + else: + ver = tuple(map(int, xlrd.__VERSION__.split(".")[:2])) + if ver < (0, 9): # pragma: no cover + raise ImportError(err_msg + + ". Current version " + xlrd.__VERSION__) # could be a str, ExcelFile, Book, etc. self.io = io From a5c9abf9cc88f2245637156a1b8fcc238d5f2100 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 22 Sep 2017 09:39:40 -0400 Subject: [PATCH 120/188] DOC: whatsnew fixes (#17626) closes #17601 --- doc/source/whatsnew/v0.21.0.txt | 111 ++++++++++++++++---------------- 1 file changed, 57 insertions(+), 54 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 885babfdd1d19c..a80fa744780a2a 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -135,7 +135,7 @@ We have updated our minimum supported versions of dependencies (:issue:`15206`, +--------------+-----------------+----------+ | Package | Minimum Version | Required | - +======================+=========+==========+ + +==============+=================+==========+ | Numpy | 1.9.0 | X | +--------------+-----------------+----------+ | Matplotlib | 1.4.3 | | @@ -241,54 +241,53 @@ New Behaviour: Dtype Conversions ^^^^^^^^^^^^^^^^^ -- Previously assignments, ``.where()`` and ``.fillna()`` with a ``bool`` assignment, would coerce to - same the type (e.g. int / float), or raise for datetimelikes. These will now preseve the bools with ``object`` dtypes. (:issue:`16821`). +Previously assignments, ``.where()`` and ``.fillna()`` with a ``bool`` assignment, would coerce to same the type (e.g. int / float), or raise for datetimelikes. These will now preseve the bools with ``object`` dtypes. (:issue:`16821`). - .. ipython:: python +.. ipython:: python - s = Series([1, 2, 3]) + s = Series([1, 2, 3]) - .. code-block:: python +.. code-block:: python - In [5]: s[1] = True + In [5]: s[1] = True - In [6]: s - Out[6]: - 0 1 - 1 1 - 2 3 - dtype: int64 + In [6]: s + Out[6]: + 0 1 + 1 1 + 2 3 + dtype: int64 - New Behavior +New Behavior - .. ipython:: python +.. 

-     s[1] = True
-     s
+   s[1] = True
+   s

-- Previously, as assignment to a datetimelike with a non-datetimelike would coerce the
-  non-datetime-like item being assigned (:issue:`14145`).
+Previously, an assignment to a datetimelike with a non-datetimelike would coerce the
+non-datetime-like item being assigned (:issue:`14145`).

-  .. ipython:: python
+.. ipython:: python

-     s = pd.Series([pd.Timestamp('2011-01-01'), pd.Timestamp('2012-01-01')])
+   s = pd.Series([pd.Timestamp('2011-01-01'), pd.Timestamp('2012-01-01')])

-  .. code-block:: python
+.. code-block:: python

-     In [1]: s[1] = 1
+   In [1]: s[1] = 1

-     In [2]: s
-     Out[2]:
-     0   2011-01-01 00:00:00.000000000
-     1   1970-01-01 00:00:00.000000001
-     dtype: datetime64[ns]
+   In [2]: s
+   Out[2]:
+   0   2011-01-01 00:00:00.000000000
+   1   1970-01-01 00:00:00.000000001
+   dtype: datetime64[ns]

-  These now coerce to ``object`` dtype.
+These now coerce to ``object`` dtype.

-  .. ipython:: python
+.. ipython:: python

-     s[1] = 1
-     s
+   s[1] = 1
+   s

 - Inconsistent behavior in ``.where()`` with datetimelikes which would raise rather than coerce to ``object`` (:issue:`16402`)
 - Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`)
@@ -338,26 +337,26 @@ UTC Localization with Series

 Previously, :func:`to_datetime` did not localize datetime ``Series`` data when ``utc=True`` was passed. Now, :func:`to_datetime` will correctly localize ``Series`` with a ``datetime64[ns, UTC]`` dtype to be consistent with how list-like and ``Index`` data are handled. (:issue:`6415`).

-  Previous Behavior
+Previous Behavior

-  .. ipython:: python
+.. ipython:: python

-     s = Series(['20130101 00:00:00'] * 3)
+   s = Series(['20130101 00:00:00'] * 3)

-  .. code-block:: ipython
+.. code-block:: ipython

-     In [12]: pd.to_datetime(s, utc=True)
-     Out[12]:
-     0   2013-01-01
-     1   2013-01-01
-     2   2013-01-01
-     dtype: datetime64[ns]
+   In [12]: pd.to_datetime(s, utc=True)
+   Out[12]:
+   0   2013-01-01
+   1   2013-01-01
+   2   2013-01-01
+   dtype: datetime64[ns]

-  New Behavior
+New Behavior

-  .. ipython:: python
+.. ipython:: python

-     pd.to_datetime(s, utc=True)
+   pd.to_datetime(s, utc=True)

 Additionally, DataFrames with datetime columns that were parsed by :func:`read_sql_table` and :func:`read_sql_query` will also be localized to UTC only if the original SQL columns were timezone aware datetime columns.
@@ -410,9 +409,9 @@ Previous Behavior:

 New Behavior:

-  .. ipython:: python
+.. ipython:: python

-     pd.interval_range(start=0, end=4)
+   pd.interval_range(start=0, end=4)

 .. _whatsnew_0210.api:

@@ -476,6 +475,14 @@ Performance Improvements
 - Improved performance of the :class:`CategoricalIndex` for data that is already categorical dtype (:issue:`17513`)
 - Improved performance of :meth:`RangeIndex.min` and :meth:`RangeIndex.max` by using ``RangeIndex`` properties to perform the computations (:issue:`17607`)

+.. _whatsnew_0210.docs:
+
+Documentation Changes
+~~~~~~~~~~~~~~~~~~~~~
+
+- Several ``NaT`` method docstrings (e.g. :func:`NaT.ctime`) were incorrect (:issue:`17327`)
+- The documentation has had references to versions < v0.17 removed and cleaned up (:issue:`17442`, :issue:`17404` & :issue:`17504`)
+
 .. _whatsnew_0210.bug_fixes:

 Bug Fixes
@@ -530,7 +537,7 @@ Plotting
 ^^^^^^^^
 - Bug in plotting methods using ``secondary_y`` and ``fontsize`` not setting secondary axis font size (:issue:`12565`)
 - Bug when plotting ``timedelta`` and ``datetime`` dtypes on y-axis (:issue:`16953`)
-- Line plots no longer assume monotonic x data when calculating xlims, they show the entire lines now even for unsorted x data. (:issue:`11310`)(:issue:`11471`)
+- Line plots no longer assume monotonic x data when calculating xlims, they show the entire lines now even for unsorted x data. (:issue:`11310`, :issue:`11471`)
 - With matplotlib 2.0.0 and above, calculation of x limits for line plots is left to matplotlib, so that its new default settings are applied. (:issue:`15495`)
 - Bug in ``Series.plot.bar`` or ``DataFramee.plot.bar`` with ``y`` not respecting user-passed ``color`` (:issue:`16822`)

@@ -575,10 +582,8 @@ Numeric

 Categorical
 ^^^^^^^^^^^
 - Bug in :func:`Series.isin` when called with a categorical (:issue`16639`)
-- Bug in the categorical constructor with empty values and categories causing
-  the ``.categories`` to be an empty ``Float64Index`` rather than an empty
-  ``Index`` with object dtype (:issue:`17248`)
-- Bug in categorical operations with :ref:`Series.cat <categorical.cat>` not preserving the original Series' name (:issue:`17509`)
+- Bug in the categorical constructor with empty values and categories causing the ``.categories`` to be an empty ``Float64Index`` rather than an empty ``Index`` with object dtype (:issue:`17248`)
+- Bug in categorical operations with :ref:`Series.cat <categorical.cat>` not preserving the original Series' name (:issue:`17509`)

 PyPy
 ^^^^
@@ -593,5 +598,3 @@ Other
 ^^^^^
 - Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`)
-- Several ``NaT`` method docstrings (e.g. :func:`NaT.ctime`) were incorrect (:issue:`17327`)
-- The documentation has had references to versions < v0.17 removed and cleaned up (:issue:`17442`, :issue:`17442`, :issue:`17404` & :issue:`17504`)

From d1fe892a754bf48839d9ac4029e258883ee64a2e Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Fri, 22 Sep 2017 09:41:31 -0400
Subject: [PATCH 121/188] Revert "BLD: pin numpy to particular variant that is built for all our deps (#17619)" (#17625)

This reverts commit 6930f27e78b2b61a4df31b667a816fa53e49ffed.
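
(For reference, conda requirement pins take the form ``name=version=build``.
A minimal illustration, using the package from this revert:

    numpy                                  # any version, any build
    numpy=1.13.1                           # version 1.13.1, any build
    numpy=1.13.1=py36_blas_openblas_201    # one specific build variant

Dropping the build string, as the diff below does, leaves the solver free to
pick whichever numpy build is compatible with the rest of the environment.)
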
closes #17620
---
 ci/requirements-3.6.build | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/ci/requirements-3.6.build b/ci/requirements-3.6.build
index 31ffd5acc7fcc7..1c4b46aea3865d 100644
--- a/ci/requirements-3.6.build
+++ b/ci/requirements-3.6.build
@@ -2,7 +2,5 @@ python=3.6*
 python-dateutil
 pytz
 nomkl
+numpy
 cython
-
-# pin numpy that is built for all our deps
-numpy=1.13.1=py36_blas_openblas_201

From e6d8953f8cd5ad9f22894a8948e9b6340ad819f4 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Fri, 22 Sep 2017 12:50:53 -0700
Subject: [PATCH 122/188] Fix make_signature TypeError in py3 (#17609)

---
 pandas/tests/util/test_util.py | 16 +++++++++++++++-
 pandas/util/_decorators.py     |  2 +-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/util/test_util.py b/pandas/tests/util/test_util.py
index abd82cfa89f942..ffc9703abff41d 100644
--- a/pandas/tests/util/test_util.py
+++ b/pandas/tests/util/test_util.py
@@ -9,7 +9,7 @@ import pytest

 from pandas.compat import intern
 from pandas.util._move import move_into_mutable_buffer, BadMove, stolenbuf
-from pandas.util._decorators import deprecate_kwarg
+from pandas.util._decorators import deprecate_kwarg, make_signature
 from pandas.util._validators import (validate_args, validate_kwargs,
                                      validate_args_and_kwargs,
                                      validate_bool_kwarg)
@@ -467,3 +467,17 @@ def test_set_locale(self):

         current_locale = locale.getlocale()
         assert current_locale == self.current_locale
+
+
+def test_make_signature():
+    # See GH 17608
+    # Case where the func does not have default kwargs
+    sig = make_signature(validate_kwargs)
+    assert sig == (['fname', 'kwargs', 'compat_args'],
+                   ['fname', 'kwargs', 'compat_args'])
+
+    # Case where the func does have default kwargs
+    sig = make_signature(deprecate_kwarg)
+    assert sig == (['old_arg_name', 'new_arg_name',
+                    'mapping=None', 'stacklevel=2'],
+                   ['old_arg_name', 'new_arg_name', 'mapping', 'stacklevel'])
diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py
index 31e27817913c5a..3733e4311aa732 100644
--- a/pandas/util/_decorators.py
+++ b/pandas/util/_decorators.py
@@ -242,7 +242,7 @@ def make_signature(func):
         defaults = ('',) * n_wo_defaults
     else:
         n_wo_defaults = len(spec.args) - len(spec.defaults)
-        defaults = ('',) * n_wo_defaults + spec.defaults
+        defaults = ('',) * n_wo_defaults + tuple(spec.defaults)
     args = []
     for i, (var, default) in enumerate(zip(spec.args, defaults)):
         args.append(var if default == '' else var + '=' + repr(default))

From f797c1dc8d838eb9df5ede3be681949dab852148 Mon Sep 17 00:00:00 2001
From: Licht Takeuchi
Date: Sat, 23 Sep 2017 06:39:12 +0900
Subject: [PATCH 123/188] BUG: Fix groupby nunique with NaT (#17624)

---
 doc/source/whatsnew/v0.21.0.txt          |  1 +
 pandas/core/groupby.py                   |  8 +++++++-
 pandas/tests/groupby/test_timegrouper.py | 13 +++++++++++++
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index a80fa744780a2a..5003aa0d97c1c6 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -552,6 +552,7 @@ Groupby/Resample/Rolling
 - Bug in ``Series.resample(...).apply()`` where an empty ``Series`` modified the source index and did not return the name of a ``Series`` (:issue:`14313`)
 - Bug in ``.rolling(...).apply(...)`` with a ``DataFrame`` with a ``DatetimeIndex``, a ``window`` of a timedelta-convertible and ``min_periods >= 1` (:issue:`15305`)
 - Bug in ``DataFrame.groupby`` where index and column keys were not recognized correctly when the number
of keys equaled the number of elements on the groupby axis (:issue:`16859`) +- Bug in ``groupby.nunique()`` with ``TimeGrouper`` which cannot handle ``NaT`` correctly (:issue:`17575`) Sparse ^^^^^^ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index f14ed08a27fae8..a62ae40a85941f 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3177,7 +3177,13 @@ def nunique(self, dropna=True): out = np.add.reduceat(inc, idx).astype('int64', copy=False) if len(ids): - res = out if ids[0] != -1 else out[1:] + # NaN/NaT group exists if the head of ids is -1, + # so remove it from res and exclude its index from idx + if ids[0] == -1: + res = out[1:] + idx = idx[np.flatnonzero(idx)] + else: + res = out else: res = out[1:] ri = self.grouper.result_index diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index df0a93d7833759..f83a3fcd0668d9 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -608,3 +608,16 @@ def test_first_last_max_min_on_time_data(self): assert_frame_equal(grouped_ref.min(), grouped_test.min()) assert_frame_equal(grouped_ref.first(), grouped_test.first()) assert_frame_equal(grouped_ref.last(), grouped_test.last()) + + def test_nunique_with_timegrouper_and_nat(self): + # GH 17575 + test = pd.DataFrame({ + 'time': [Timestamp('2016-06-28 09:35:35'), + pd.NaT, + Timestamp('2016-06-28 16:46:28')], + 'data': ['1', '2', '3']}) + + grouper = pd.TimeGrouper(key='time', freq='h') + result = test.groupby(grouper)['data'].nunique() + expected = test[test.time.notnull()].groupby(grouper)['data'].nunique() + tm.assert_series_equal(result, expected) From da93f51c0802db6f32e63218e96e3aa7206db6c6 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 22 Sep 2017 22:22:35 -0400 Subject: [PATCH 124/188] TST: remove some warnings (#17638) --- pandas/core/reshape/reshape.py | 2 +- pandas/plotting/_core.py | 2 +- pandas/tests/frame/test_operators.py | 6 ++++++ pandas/tests/indexes/test_interval.py | 2 +- pandas/tests/io/test_stata.py | 8 +++++--- 5 files changed, 14 insertions(+), 6 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 7260bc9a8b7a14..bff09be6149f32 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -851,7 +851,7 @@ def lreshape(data, groups, dropna=True, label=None): return DataFrame(mdata, columns=id_cols + pivot_cols) -def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'): +def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'): r""" Wide panel to long format. Less flexible but more user-friendly than melt. diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 7a40018494fc4f..aa919d600ec526 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -2726,7 +2726,7 @@ def barh(self, x=None, y=None, **kwds): return self(kind='barh', x=x, y=y, **kwds) def box(self, by=None, **kwds): - """ + r""" Boxplot .. 
versionadded:: 0.17.0 diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 5052bef24e95a6..309c0f0244d7c8 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -1035,6 +1035,12 @@ def test_boolean_comparison(self): result = df == tup assert_frame_equal(result, expected) + def test_boolean_comparison_error(self): + + # GH 4576 + # boolean comparisons with a tuple/list give unexpected results + df = DataFrame(np.arange(6).reshape((3, 2))) + # not shape compatible pytest.raises(ValueError, lambda: df == (2, 2)) pytest.raises(ValueError, lambda: df == [2, 2]) diff --git a/pandas/tests/indexes/test_interval.py b/pandas/tests/indexes/test_interval.py index dc59495f619b03..b55bab3a210cc4 100644 --- a/pandas/tests/indexes/test_interval.py +++ b/pandas/tests/indexes/test_interval.py @@ -1068,7 +1068,7 @@ def test_errors(self): interval_range(start='foo', periods=10) # invalid end - msg = 'end must be numeric or datetime-like, got \(0, 1\]' + msg = r'end must be numeric or datetime-like, got \(0, 1\]' with tm.assert_raises_regex(ValueError, msg): interval_range(end=Interval(0, 1), periods=10) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 94a0ac31e093e4..d6bdb764f1c8e9 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1053,7 +1053,8 @@ def test_iterator(self): tm.assert_frame_equal(parsed.iloc[0:5, :], chunk) # GH12153 - from_chunks = pd.concat(read_stata(fname, chunksize=4)) + with read_stata(fname, chunksize=4) as itr: + from_chunks = pd.concat(itr) tm.assert_frame_equal(parsed, from_chunks) def test_read_chunks_115(self): @@ -1306,8 +1307,9 @@ def test_value_labels_iterator(self, write_index): df['A'] = df['A'].astype('category') with tm.ensure_clean() as path: df.to_stata(path, write_index=write_index) - dta_iter = pd.read_stata(path, iterator=True) - value_labels = dta_iter.value_labels() + + with pd.read_stata(path, iterator=True) as dta_iter: + value_labels = dta_iter.value_labels() assert value_labels == {'A': {0: 'A', 1: 'B', 2: 'C', 3: 'E'}} def test_set_index(self): From 4004367fb815645bb7f5bbb518eee62cbd476e3a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 22 Sep 2017 19:52:56 -0700 Subject: [PATCH 125/188] BLD: fix inline warnings (#17528) --- pandas/_libs/parsers.pyx | 2 +- pandas/_libs/src/inference.pyx | 2 +- pandas/_libs/src/khash.pxd | 112 ++++++++++++++++----------------- pandas/_libs/src/skiplist.pxd | 10 +-- pandas/_libs/src/util.pxd | 36 +++++------ 5 files changed, 81 insertions(+), 81 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 3e8b5c4bd3febd..5bf9f4ce83cbfa 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -255,7 +255,7 @@ cdef extern from "parser/tokenizer.h": # inline int to_complex(char *item, double *p_real, # double *p_imag, char sci, char decimal) - inline int to_longlong(char *item, long long *p_value) nogil + int to_longlong(char *item, long long *p_value) nogil # inline int to_longlong_thousands(char *item, long long *p_value, # char tsep) int to_boolean(const char *item, uint8_t *val) nogil diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 2bb362eab40975..a2764e87eec556 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -1015,7 +1015,7 @@ cpdef bint is_interval_array(ndarray[object] values): cdef extern from "parse_helper.h": - inline int floatify(object, double *result, int 
*maybe_int) except -1 + int floatify(object, double *result, int *maybe_int) except -1 # constants that will be compared to potentially arbitrarily large # python int diff --git a/pandas/_libs/src/khash.pxd b/pandas/_libs/src/khash.pxd index adb0fe285dbb8d..ba9a3c70097b23 100644 --- a/pandas/_libs/src/khash.pxd +++ b/pandas/_libs/src/khash.pxd @@ -11,13 +11,13 @@ cdef extern from "khash_python.h": PyObject **keys size_t *vals - inline kh_pymap_t* kh_init_pymap() - inline void kh_destroy_pymap(kh_pymap_t*) - inline void kh_clear_pymap(kh_pymap_t*) - inline khint_t kh_get_pymap(kh_pymap_t*, PyObject*) - inline void kh_resize_pymap(kh_pymap_t*, khint_t) - inline khint_t kh_put_pymap(kh_pymap_t*, PyObject*, int*) - inline void kh_del_pymap(kh_pymap_t*, khint_t) + kh_pymap_t* kh_init_pymap() + void kh_destroy_pymap(kh_pymap_t*) + void kh_clear_pymap(kh_pymap_t*) + khint_t kh_get_pymap(kh_pymap_t*, PyObject*) + void kh_resize_pymap(kh_pymap_t*, khint_t) + khint_t kh_put_pymap(kh_pymap_t*, PyObject*, int*) + void kh_del_pymap(kh_pymap_t*, khint_t) bint kh_exist_pymap(kh_pymap_t*, khiter_t) @@ -27,13 +27,13 @@ cdef extern from "khash_python.h": PyObject **keys size_t *vals - inline kh_pyset_t* kh_init_pyset() - inline void kh_destroy_pyset(kh_pyset_t*) - inline void kh_clear_pyset(kh_pyset_t*) - inline khint_t kh_get_pyset(kh_pyset_t*, PyObject*) - inline void kh_resize_pyset(kh_pyset_t*, khint_t) - inline khint_t kh_put_pyset(kh_pyset_t*, PyObject*, int*) - inline void kh_del_pyset(kh_pyset_t*, khint_t) + kh_pyset_t* kh_init_pyset() + void kh_destroy_pyset(kh_pyset_t*) + void kh_clear_pyset(kh_pyset_t*) + khint_t kh_get_pyset(kh_pyset_t*, PyObject*) + void kh_resize_pyset(kh_pyset_t*, khint_t) + khint_t kh_put_pyset(kh_pyset_t*, PyObject*, int*) + void kh_del_pyset(kh_pyset_t*, khint_t) bint kh_exist_pyset(kh_pyset_t*, khiter_t) @@ -45,13 +45,13 @@ cdef extern from "khash_python.h": kh_cstr_t *keys size_t *vals - inline kh_str_t* kh_init_str() nogil - inline void kh_destroy_str(kh_str_t*) nogil - inline void kh_clear_str(kh_str_t*) nogil - inline khint_t kh_get_str(kh_str_t*, kh_cstr_t) nogil - inline void kh_resize_str(kh_str_t*, khint_t) nogil - inline khint_t kh_put_str(kh_str_t*, kh_cstr_t, int*) nogil - inline void kh_del_str(kh_str_t*, khint_t) nogil + kh_str_t* kh_init_str() nogil + void kh_destroy_str(kh_str_t*) nogil + void kh_clear_str(kh_str_t*) nogil + khint_t kh_get_str(kh_str_t*, kh_cstr_t) nogil + void kh_resize_str(kh_str_t*, khint_t) nogil + khint_t kh_put_str(kh_str_t*, kh_cstr_t, int*) nogil + void kh_del_str(kh_str_t*, khint_t) nogil bint kh_exist_str(kh_str_t*, khiter_t) nogil @@ -61,13 +61,13 @@ cdef extern from "khash_python.h": int64_t *keys size_t *vals - inline kh_int64_t* kh_init_int64() nogil - inline void kh_destroy_int64(kh_int64_t*) nogil - inline void kh_clear_int64(kh_int64_t*) nogil - inline khint_t kh_get_int64(kh_int64_t*, int64_t) nogil - inline void kh_resize_int64(kh_int64_t*, khint_t) nogil - inline khint_t kh_put_int64(kh_int64_t*, int64_t, int*) nogil - inline void kh_del_int64(kh_int64_t*, khint_t) nogil + kh_int64_t* kh_init_int64() nogil + void kh_destroy_int64(kh_int64_t*) nogil + void kh_clear_int64(kh_int64_t*) nogil + khint_t kh_get_int64(kh_int64_t*, int64_t) nogil + void kh_resize_int64(kh_int64_t*, khint_t) nogil + khint_t kh_put_int64(kh_int64_t*, int64_t, int*) nogil + void kh_del_int64(kh_int64_t*, khint_t) nogil bint kh_exist_int64(kh_int64_t*, khiter_t) nogil @@ -79,13 +79,13 @@ cdef extern from "khash_python.h": khuint64_t *keys size_t 
*vals - inline kh_uint64_t* kh_init_uint64() nogil - inline void kh_destroy_uint64(kh_uint64_t*) nogil - inline void kh_clear_uint64(kh_uint64_t*) nogil - inline khint_t kh_get_uint64(kh_uint64_t*, int64_t) nogil - inline void kh_resize_uint64(kh_uint64_t*, khint_t) nogil - inline khint_t kh_put_uint64(kh_uint64_t*, int64_t, int*) nogil - inline void kh_del_uint64(kh_uint64_t*, khint_t) nogil + kh_uint64_t* kh_init_uint64() nogil + void kh_destroy_uint64(kh_uint64_t*) nogil + void kh_clear_uint64(kh_uint64_t*) nogil + khint_t kh_get_uint64(kh_uint64_t*, int64_t) nogil + void kh_resize_uint64(kh_uint64_t*, khint_t) nogil + khint_t kh_put_uint64(kh_uint64_t*, int64_t, int*) nogil + void kh_del_uint64(kh_uint64_t*, khint_t) nogil bint kh_exist_uint64(kh_uint64_t*, khiter_t) nogil @@ -95,13 +95,13 @@ cdef extern from "khash_python.h": float64_t *keys size_t *vals - inline kh_float64_t* kh_init_float64() nogil - inline void kh_destroy_float64(kh_float64_t*) nogil - inline void kh_clear_float64(kh_float64_t*) nogil - inline khint_t kh_get_float64(kh_float64_t*, float64_t) nogil - inline void kh_resize_float64(kh_float64_t*, khint_t) nogil - inline khint_t kh_put_float64(kh_float64_t*, float64_t, int*) nogil - inline void kh_del_float64(kh_float64_t*, khint_t) nogil + kh_float64_t* kh_init_float64() nogil + void kh_destroy_float64(kh_float64_t*) nogil + void kh_clear_float64(kh_float64_t*) nogil + khint_t kh_get_float64(kh_float64_t*, float64_t) nogil + void kh_resize_float64(kh_float64_t*, khint_t) nogil + khint_t kh_put_float64(kh_float64_t*, float64_t, int*) nogil + void kh_del_float64(kh_float64_t*, khint_t) nogil bint kh_exist_float64(kh_float64_t*, khiter_t) nogil @@ -111,13 +111,13 @@ cdef extern from "khash_python.h": int32_t *keys size_t *vals - inline kh_int32_t* kh_init_int32() nogil - inline void kh_destroy_int32(kh_int32_t*) nogil - inline void kh_clear_int32(kh_int32_t*) nogil - inline khint_t kh_get_int32(kh_int32_t*, int32_t) nogil - inline void kh_resize_int32(kh_int32_t*, khint_t) nogil - inline khint_t kh_put_int32(kh_int32_t*, int32_t, int*) nogil - inline void kh_del_int32(kh_int32_t*, khint_t) nogil + kh_int32_t* kh_init_int32() nogil + void kh_destroy_int32(kh_int32_t*) nogil + void kh_clear_int32(kh_int32_t*) nogil + khint_t kh_get_int32(kh_int32_t*, int32_t) nogil + void kh_resize_int32(kh_int32_t*, khint_t) nogil + khint_t kh_put_int32(kh_int32_t*, int32_t, int*) nogil + void kh_del_int32(kh_int32_t*, khint_t) nogil bint kh_exist_int32(kh_int32_t*, khiter_t) nogil @@ -129,12 +129,12 @@ cdef extern from "khash_python.h": kh_cstr_t *keys PyObject **vals - inline kh_strbox_t* kh_init_strbox() nogil - inline void kh_destroy_strbox(kh_strbox_t*) nogil - inline void kh_clear_strbox(kh_strbox_t*) nogil - inline khint_t kh_get_strbox(kh_strbox_t*, kh_cstr_t) nogil - inline void kh_resize_strbox(kh_strbox_t*, khint_t) nogil - inline khint_t kh_put_strbox(kh_strbox_t*, kh_cstr_t, int*) nogil - inline void kh_del_strbox(kh_strbox_t*, khint_t) nogil + kh_strbox_t* kh_init_strbox() nogil + void kh_destroy_strbox(kh_strbox_t*) nogil + void kh_clear_strbox(kh_strbox_t*) nogil + khint_t kh_get_strbox(kh_strbox_t*, kh_cstr_t) nogil + void kh_resize_strbox(kh_strbox_t*, khint_t) nogil + khint_t kh_put_strbox(kh_strbox_t*, kh_cstr_t, int*) nogil + void kh_del_strbox(kh_strbox_t*, khint_t) nogil bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil diff --git a/pandas/_libs/src/skiplist.pxd b/pandas/_libs/src/skiplist.pxd index 69e9df5b542aa6..214aa1c7aeaf00 100644 --- 
a/pandas/_libs/src/skiplist.pxd +++ b/pandas/_libs/src/skiplist.pxd @@ -14,9 +14,9 @@ cdef extern from "skiplist.h": int size int maxlevels - inline skiplist_t* skiplist_init(int) nogil - inline void skiplist_destroy(skiplist_t*) nogil - inline double skiplist_get(skiplist_t*, int, int*) nogil - inline int skiplist_insert(skiplist_t*, double) nogil - inline int skiplist_remove(skiplist_t*, double) nogil + skiplist_t* skiplist_init(int) nogil + void skiplist_destroy(skiplist_t*) nogil + double skiplist_get(skiplist_t*, int, int*) nogil + int skiplist_insert(skiplist_t*, double) nogil + int skiplist_remove(skiplist_t*, double) nogil diff --git a/pandas/_libs/src/util.pxd b/pandas/_libs/src/util.pxd index 076bc1cd56003a..f7a68c4ade71b5 100644 --- a/pandas/_libs/src/util.pxd +++ b/pandas/_libs/src/util.pxd @@ -3,26 +3,26 @@ cimport numpy as cnp cimport cpython cdef extern from "numpy_helper.h": - inline void set_array_owndata(ndarray ao) - inline void set_array_not_contiguous(ndarray ao) - - inline int is_integer_object(object) - inline int is_float_object(object) - inline int is_complex_object(object) - inline int is_bool_object(object) - inline int is_string_object(object) - inline int is_datetime64_object(object) - inline int is_timedelta64_object(object) - inline int assign_value_1d(ndarray, Py_ssize_t, object) except -1 - inline cnp.int64_t get_nat() - inline object get_value_1d(ndarray, Py_ssize_t) - inline int floatify(object, double*) except -1 - inline char *get_c_string(object) except NULL - inline object char_to_string(char*) - inline void transfer_object_column(char *dst, char *src, size_t stride, + void set_array_owndata(ndarray ao) + void set_array_not_contiguous(ndarray ao) + + int is_integer_object(object) + int is_float_object(object) + int is_complex_object(object) + int is_bool_object(object) + int is_string_object(object) + int is_datetime64_object(object) + int is_timedelta64_object(object) + int assign_value_1d(ndarray, Py_ssize_t, object) except -1 + cnp.int64_t get_nat() + object get_value_1d(ndarray, Py_ssize_t) + int floatify(object, double*) except -1 + char *get_c_string(object) except NULL + object char_to_string(char*) + void transfer_object_column(char *dst, char *src, size_t stride, size_t length) object sarr_from_data(cnp.dtype, int length, void* data) - inline object unbox_if_zerodim(object arr) + object unbox_if_zerodim(object arr) ctypedef fused numeric: cnp.int8_t From 76d17449f868e25b68bd636906b8f70c683761af Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 23 Sep 2017 10:11:01 -0400 Subject: [PATCH 126/188] BUG: overflow on Timedelta construction & arithmetic now raises (#17640) closes #17637 --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/_libs/tslib.pyx | 6 +++--- pandas/tests/indexes/datetimes/test_tools.py | 7 +++++++ pandas/tests/scalar/test_timedelta.py | 15 +++++++++++++++ 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 5003aa0d97c1c6..43e90f06ed5045 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -498,6 +498,7 @@ Conversion - Bug in :func:`Series.fillna` returns frame when ``inplace=True`` and ``value`` is dict (:issue:`16156`) - Bug in :attr:`Timestamp.weekday_name` returning a UTC-based weekday name when localized to a timezone (:issue:`17354`) - Bug in ``Timestamp.replace`` when replacing ``tzinfo`` around DST changes (:issue:`15683`) +- Bug in ``Timedelta`` construction and arithmetic that would not propagate the 
``Overflow`` exception (:issue:`17367`) Indexing ^^^^^^^^ diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 6ba37062ac8691..077603af96947c 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -3514,7 +3514,7 @@ cpdef convert_to_timedelta64(object ts, object unit): ts = np.timedelta64(_delta_to_nanoseconds(ts), 'ns') if isinstance(ts, timedelta): - ts = np.timedelta64(ts) + ts = np.timedelta64(_delta_to_nanoseconds(ts), 'ns') elif not isinstance(ts, np.timedelta64): raise ValueError("Invalid type for timedelta " "scalar: %s" % type(ts)) @@ -3891,8 +3891,7 @@ for _maybe_method_name in dir(NaTType): #---------------------------------------------------------------------- # Conversion routines - -cpdef int64_t _delta_to_nanoseconds(delta): +cpdef int64_t _delta_to_nanoseconds(delta) except? -1: if isinstance(delta, np.ndarray): return delta.astype('m8[ns]').astype('int64') if hasattr(delta, 'nanos'): @@ -3903,6 +3902,7 @@ cpdef int64_t _delta_to_nanoseconds(delta): return delta.astype("timedelta64[ns]").item() if is_integer_object(delta): return delta + return (delta.days * 24 * 60 * 60 * 1000000 + delta.seconds * 1000000 + delta.microseconds) * 1000 diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index be27334384f6b7..e0ccedb834adf9 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -787,6 +787,13 @@ def test_to_datetime_freq(self): assert xp.freq == rs.freq assert xp.tzinfo == rs.tzinfo + def test_to_datetime_overflow(self): + # gh-17637 + # we are overflowing Timedelta range here + + with pytest.raises(OverflowError): + date_range(start='1/1/1700', freq='B', periods=100000) + def test_string_na_nat_conversion(self): # GH #999, #858 diff --git a/pandas/tests/scalar/test_timedelta.py b/pandas/tests/scalar/test_timedelta.py index bc9a0388df9d91..2cabbfacf64161 100644 --- a/pandas/tests/scalar/test_timedelta.py +++ b/pandas/tests/scalar/test_timedelta.py @@ -166,6 +166,13 @@ def test_overflow_on_construction(self): value = pd.Timedelta('1day').value * 20169940 pytest.raises(OverflowError, pd.Timedelta, value) + # xref gh-17637 + with pytest.raises(OverflowError): + pd.Timedelta(7 * 19999, unit='D') + + with pytest.raises(OverflowError): + pd.Timedelta(timedelta(days=13 * 19999)) + def test_total_seconds_scalar(self): # see gh-10939 rng = Timedelta('1 days, 10:11:12.100123456') @@ -612,6 +619,14 @@ def test_timedelta_arithmetic(self): tm.assert_series_equal(result_operator, expected) tm.assert_series_equal(result_method, expected) + def test_arithmetic_overflow(self): + + with pytest.raises(OverflowError): + pd.Timestamp('1700-01-01') + pd.Timedelta(13 * 19999, unit='D') + + with pytest.raises(OverflowError): + pd.Timestamp('1700-01-01') + timedelta(days=13 * 19999) + def test_apply_to_timedelta(self): timedelta_NaT = pd.to_timedelta('NaT') From e2757a2db0faa7878858b36f602235daa936a674 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 23 Sep 2017 16:13:01 +0200 Subject: [PATCH 127/188] DOC: correct example use of nth dropna keyword (#17641) dropna=True is deprecated, see https://github.com/pandas-dev/pandas/pull/17493 --- doc/source/groupby.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index e9a7d8dd0a46ea..91d806ca5dd4f8 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -1060,7 +1060,7 @@ To select from a DataFrame or Series the nth 
item, use the nth method. This is a

     g.nth(-1)
     g.nth(1)

-If you want to select the nth not-null item, use the ``dropna`` kwarg. For a DataFrame this should be either ``'any'`` or ``'all'`` just like you would pass to dropna, for a Series this just needs to be truthy.
+If you want to select the nth not-null item, use the ``dropna`` kwarg. For a DataFrame this should be either ``'any'`` or ``'all'`` just like you would pass to dropna:

 .. ipython:: python

@@ -1072,7 +1072,7 @@ If you want to select the nth not-null item, use the ``dropna`` kwarg. For a Dat

     g.nth(-1, dropna='any')  # NaNs denote group exhausted when using dropna

     g.last()
-    g.B.nth(0, dropna=True)
+    g.B.nth(0, dropna='all')

 As with other methods, passing ``as_index=False``, will achieve a filtration, which returns the grouped row.

From 85a10671f814301be3e0f3c24c2863488ec27ddd Mon Sep 17 00:00:00 2001
From: skwbc
Date: Sun, 24 Sep 2017 01:14:09 +0900
Subject: [PATCH 128/188] BUG: DataFrame.first_valid_index() fails if there is no valid entry. (#17488)

Closes #17400
---
 doc/source/whatsnew/v0.21.0.txt       |  1 +
 pandas/core/frame.py                  | 20 ++++++++++++--------
 pandas/core/generic.py                | 16 ++++++++++++++++
 pandas/core/series.py                 | 10 ++++------
 pandas/tests/frame/test_timeseries.py |  5 +++++
 5 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index 43e90f06ed5045..32e4294f06d6bb 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -518,6 +518,7 @@ Indexing
 - Bug in ``CategoricalIndex`` reindexing in which specified indices containing duplicates were not being respected (:issue:`17323`)
 - Bug in intersection of ``RangeIndex`` with negative step (:issue:`17296`)
 - Bug in ``IntervalIndex`` where performing a scalar lookup fails for included right endpoints of non-overlapping monotonic decreasing indexes (:issue:`16417`, :issue:`17271`)
+- Bug in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` when there is no valid entry (:issue:`17400`)

 I/O
 ^^^
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index dd5d490ea66a8f..346eeb8d2642cd 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4063,23 +4063,27 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
     # ----------------------------------------------------------------------
     # Misc methods

+    def _get_valid_indices(self):
+        is_valid = self.count(1) > 0
+        return self.index[is_valid]
+
+    @Appender(_shared_docs['valid_index'] % {
+        'position': 'first', 'klass': 'DataFrame'})
     def first_valid_index(self):
-        """
-        Return label for first non-NA/null value
-        """
         if len(self) == 0:
             return None

-        return self.index[self.count(1) > 0][0]
+        valid_indices = self._get_valid_indices()
+        return valid_indices[0] if len(valid_indices) else None

+    @Appender(_shared_docs['valid_index'] % {
+        'position': 'last', 'klass': 'DataFrame'})
     def last_valid_index(self):
-        """
-        Return label for last non-NA/null value
-        """
         if len(self) == 0:
             return None

-        return self.index[self.count(1) > 0][-1]
+        valid_indices = self._get_valid_indices()
+        return valid_indices[-1] if len(valid_indices) else None

     # ----------------------------------------------------------------------
     # Data reshaping
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index e0a9fdb08dcb2c..241204ef555f6e 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -6757,6 +6757,22 @@ def transform(self, func, *args, **kwargs):

     cls.transform = transform

+    # ----------------------------------------------------------------------
+    # Misc methods
+
+    _shared_docs['valid_index'] = """
+        Return index for %(position)s non-NA/null value.
+
+        Notes
+        -----
+        If all elements are non-NA/null, returns None.
+        Also returns None for empty %(klass)s.
+
+        Returns
+        -------
+        scalar : type of index
+        """
+

 def _doc_parms(cls):
     """Return a tuple of the doc parms."""
diff --git a/pandas/core/series.py b/pandas/core/series.py
index ac11c5f908fdcf..02690dec3e1c4c 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -2825,10 +2825,9 @@ def dropna(self, axis=0, inplace=False, **kwargs):
     valid = lambda self, inplace=False, **kwargs: self.dropna(inplace=inplace,
                                                               **kwargs)

+    @Appender(generic._shared_docs['valid_index'] % {
+        'position': 'first', 'klass': 'Series'})
     def first_valid_index(self):
-        """
-        Return label for first non-NA/null value
-        """
         if len(self) == 0:
             return None

@@ -2839,10 +2838,9 @@ def first_valid_index(self):
         else:
             return self.index[i]

+    @Appender(generic._shared_docs['valid_index'] % {
+        'position': 'last', 'klass': 'Series'})
     def last_valid_index(self):
-        """
-        Return label for last non-NA/null value
-        """
         if len(self) == 0:
             return None
diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py
index 19fbf854256c6e..26a2c6f9a50450 100644
--- a/pandas/tests/frame/test_timeseries.py
+++ b/pandas/tests/frame/test_timeseries.py
@@ -440,6 +440,11 @@ def test_first_last_valid(self):
         assert empty.last_valid_index() is None
         assert empty.first_valid_index() is None

+        # GH17400: no valid entries
+        frame[:] = nan
+        assert frame.last_valid_index() is None
+        assert frame.first_valid_index() is None
+
     def test_at_time_frame(self):
         rng = date_range('1/1/2000', '1/5/2000', freq='5min')
         ts = DataFrame(np.random.randn(len(rng), 2), index=rng)

From b555613259572640e173f45b170c41265a6a7d79 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Sat, 23 Sep 2017 10:00:50 -0700
Subject: [PATCH 129/188] Fix apparent copy/paste error skewness--> excess kurtosis (#17647)

---
 pandas/core/nanops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 858aed7fd3e237..388b2ecdff445d 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -565,7 +565,7 @@ def nanskew(values, axis=None, skipna=True):

 @disallow('M8', 'm8')
 def nankurt(values, axis=None, skipna=True):
-    """ Compute the sample skewness.
+    """ Compute the sample excess kurtosis.
The statistic computed here is the adjusted Fisher-Pearson standardized moment coefficient G2, computed directly from the second and fourth From 2eb568a9c968a3cffd4e585f644bd53e6e8a600b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 23 Sep 2017 10:36:28 -0700 Subject: [PATCH 130/188] Bitesize offsets (#17318) --- asv_bench/benchmarks/timeseries.py | 2 +- pandas/tseries/frequencies.py | 1 + pandas/tseries/offsets.py | 131 ++++++++++++----------------- 3 files changed, 58 insertions(+), 76 deletions(-) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index b7151ad2eaa999..779fc0bd20964a 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -56,7 +56,7 @@ def setup(self): self.no_freq = self.rng7[:50000].append(self.rng7[50002:]) self.d_freq = self.rng7[:50000].append(self.rng7[50000:]) - self.rng8 = date_range(start='1/1/1700', freq='B', periods=100000) + self.rng8 = date_range(start='1/1/1700', freq='B', periods=75000) self.b_freq = self.rng8[:50000].append(self.rng8[50000:]) def time_add_timedelta(self): diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 085a3a784557ba..b055c4b4cb27f0 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from datetime import timedelta from pandas.compat import long, zip from pandas import compat diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 452d30322b4cfa..ea37434e3a8d98 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from datetime import date, datetime, timedelta from pandas.compat import range from pandas import compat @@ -323,37 +324,42 @@ def _params(self): def __repr__(self): className = getattr(self, '_outputName', type(self).__name__) + + if abs(self.n) != 1: + plural = 's' + else: + plural = '' + + n_str = "" + if self.n != 1: + n_str = "%s * " % self.n + + out = '<%s' % n_str + className + plural + self._repr_attrs() + '>' + return out + + # TODO: Combine this with BusinessMixin version by defining a whitelisted + # set of attributes on each object rather than the existing behavior of + # iterating over internal ``__dict__`` + def _repr_attrs(self): exclude = set(['n', 'inc', 'normalize']) attrs = [] for attr in sorted(self.__dict__): - if ((attr == 'kwds' and len(self.kwds) == 0) or - attr.startswith('_')): + if attr.startswith('_'): continue - elif attr == 'kwds': + elif attr == 'kwds': # TODO: get rid of this kwds_new = {} for key in self.kwds: if not hasattr(self, key): kwds_new[key] = self.kwds[key] if len(kwds_new) > 0: - attrs.append('='.join((attr, repr(kwds_new)))) - else: - if attr not in exclude: - attrs.append('='.join((attr, repr(getattr(self, attr))))) - - plural = '' - if abs(self.n) != 1: - plural = 's' - - n_str = '' - if self.n != 1: - n_str = '{n} * '.format(n=self.n) + attrs.append('kwds=%s' % (kwds_new)) + elif attr not in exclude: + value = getattr(self, attr) + attrs.append('%s=%s' % (attr, value)) - attrs_str = '' + out = '' if attrs: - attrs_str = ': ' + ', '.join(attrs) - - repr_content = ''.join([n_str, className, plural, attrs_str]) - out = '<{content}>'.format(content=repr_content) + out += ': ' + ', '.join(attrs) return out @property @@ -507,8 +513,18 @@ def freqstr(self): else: fstr = code + try: + if self._offset: + fstr += self._offset_str() + except AttributeError: + # TODO: standardize `_offset` vs `offset` naming convention + pass + return fstr + 
def _offset_str(self): + return '' + @property def nanos(self): raise ValueError("{name} is a non-fixed frequency".format(name=self)) @@ -527,23 +543,11 @@ def _from_name(cls, suffix=None): class BusinessMixin(object): """ mixin to business types to provide related functions """ - # TODO: Combine this with DateOffset by defining a whitelisted set of - # attributes on each object rather than the existing behavior of iterating - # over internal ``__dict__`` - def __repr__(self): - className = getattr(self, '_outputName', self.__class__.__name__) - - plural = '' - if abs(self.n) != 1: - plural = 's' - - n_str = '' - if self.n != 1: - n_str = '{n} * '.format(n=self.n) - - repr_content = ''.join([n_str, className, plural, self._repr_attrs()]) - out = '<{content}>'.format(content=repr_content) - return out + @property + def offset(self): + """Alias for self._offset""" + # Alias for backward compat + return self._offset def _repr_attrs(self): if self.offset: @@ -572,6 +576,11 @@ def __getstate__(self): def __setstate__(self, state): """Reconstruct an instance from a pickled state""" + if 'offset' in state: + # Older versions have offset attribute instead of _offset + if '_offset' in state: # pragma: no cover + raise ValueError('Unexpected key `_offset`') + state['_offset'] = state.pop('offset') self.__dict__ = state if 'weekmask' in state and 'holidays' in state: calendar, holidays = _get_calendar(weekmask=self.weekmask, @@ -593,24 +602,7 @@ def __init__(self, n=1, normalize=False, **kwds): self.n = int(n) self.normalize = normalize self.kwds = kwds - self.offset = kwds.get('offset', timedelta(0)) - - @property - def freqstr(self): - try: - code = self.rule_code - except NotImplementedError: - return repr(self) - - if self.n != 1: - fstr = '{n}{code}'.format(n=self.n, code=code) - else: - fstr = code - - if self.offset: - fstr += self._offset_str() - - return fstr + self._offset = kwds.get('offset', timedelta(0)) def _offset_str(self): def get_str(td): @@ -643,9 +635,6 @@ def get_str(td): else: return '+' + repr(self.offset) - def isAnchored(self): - return (self.n == 1) - @apply_wraps def apply(self, other): if isinstance(other, datetime): @@ -709,7 +698,7 @@ def __init__(self, **kwds): kwds['start'] = self._validate_time(kwds.get('start', '09:00')) kwds['end'] = self._validate_time(kwds.get('end', '17:00')) self.kwds = kwds - self.offset = kwds.get('offset', timedelta(0)) + self._offset = kwds.get('offset', timedelta(0)) self.start = kwds.get('start', '09:00') self.end = kwds.get('end', '17:00') @@ -776,7 +765,7 @@ def _get_business_hours_by_sec(self): Return business hours in a day by seconds. 
""" if self._get_daytime_flag(): - # create dummy datetime to calcurate businesshours in a day + # create dummy datetime to calculate businesshours in a day dtstart = datetime(2014, 4, 1, self.start.hour, self.start.minute) until = datetime(2014, 4, 1, self.end.hour, self.end.minute) return (until - dtstart).total_seconds() @@ -811,7 +800,7 @@ def rollforward(self, dt): @apply_wraps def apply(self, other): - # calcurate here because offset is not immutable + # calculate here because offset is not immutable daytime = self._get_daytime_flag() businesshours = self._get_business_hours_by_sec() bhdelta = timedelta(seconds=businesshours) @@ -860,7 +849,7 @@ def apply(self, other): if n >= 0: bday_edge = self._prev_opening_time(other) bday_edge = bday_edge + bhdelta - # calcurate remainder + # calculate remainder bday_remain = result - bday_edge result = self._next_opening_time(other) result += bday_remain @@ -898,7 +887,7 @@ def onOffset(self, dt): def _onOffset(self, dt, businesshours): """ - Slight speedups using calcurated values + Slight speedups using calculated values """ # if self.normalize and not _is_normalized(dt): # return False @@ -975,7 +964,8 @@ def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', self.n = int(n) self.normalize = normalize self.kwds = kwds - self.offset = kwds.get('offset', timedelta(0)) + self._offset = kwds.get('offset', timedelta(0)) + calendar, holidays = _get_calendar(weekmask=weekmask, holidays=holidays, calendar=calendar) @@ -1337,9 +1327,6 @@ def _apply_index_days(self, i, roll): class BusinessMonthEnd(MonthOffset): """DateOffset increments between business EOM dates""" - def isAnchored(self): - return (self.n == 1) - @apply_wraps def apply(self, other): n = self.n @@ -1425,7 +1412,7 @@ def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', self.n = int(n) self.normalize = normalize self.kwds = kwds - self.offset = kwds.get('offset', timedelta(0)) + self._offset = kwds.get('offset', timedelta(0)) calendar, holidays = _get_calendar(weekmask=weekmask, holidays=holidays, @@ -1495,7 +1482,7 @@ def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', self.n = int(n) self.normalize = normalize self.kwds = kwds - self.offset = kwds.get('offset', timedelta(0)) + self._offset = kwds.get('offset', timedelta(0)) # _get_calendar does validation and possible transformation # of calendar and holidays. 
@@ -1966,9 +1953,6 @@ class QuarterEnd(QuarterOffset): _default_startingMonth = 3 _prefix = 'Q' - def isAnchored(self): - return (self.n == 1 and self.startingMonth is not None) - @apply_wraps def apply(self, other): n = self.n @@ -2004,9 +1988,6 @@ class QuarterBegin(QuarterOffset): _from_name_startingMonth = 1 _prefix = 'QS' - def isAnchored(self): - return (self.n == 1 and self.startingMonth is not None) - @apply_wraps def apply(self, other): n = self.n From ecd2ad9ff58fa37bbdb66a09736dfb14db5caa6b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 23 Sep 2017 14:52:11 -0400 Subject: [PATCH 131/188] TST: remove some more warnings (#17645) TST: parametrize stata tests --- pandas/core/dtypes/missing.py | 20 +- pandas/core/internals.py | 9 + pandas/tests/frame/test_analytics.py | 8 +- pandas/tests/io/test_stata.py | 333 ++++++++++++++------------- pandas/tests/test_window.py | 2 +- pandas/util/testing.py | 4 +- 6 files changed, 189 insertions(+), 187 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 101612893cb025..49b7b1d1d3a9b4 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -327,25 +327,7 @@ def array_equivalent(left, right, strict_nan=False): left = left.view('i8') right = right.view('i8') - # NaNs cannot occur otherwise. - try: - return np.array_equal(left, right) - except AttributeError: - # see gh-13388 - # - # NumPy v1.7.1 has a bug in its array_equal - # function that prevents it from correctly - # comparing two arrays with complex dtypes. - # This bug is corrected in v1.8.0, so remove - # this try-except block as soon as we stop - # supporting NumPy versions < 1.8.0 - if not is_dtype_equal(left.dtype, right.dtype): - return False - - left = left.tolist() - right = right.tolist() - - return left == right + return np.array_equal(left, right) def _infer_fill_value(val): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 83b382ec0ed723..6799d3b5746d0d 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1289,6 +1289,15 @@ def get_result(other): elif is_numeric_v_string_like(values, other): result = False + # avoid numpy warning of elementwise comparisons + elif func.__name__ == 'eq': + if is_list_like(other) and not isinstance(other, np.ndarray): + other = np.asarray(other) + + # if we can broadcast, then ok + if values.shape[-1] != other.shape[-1]: + return False + result = func(values, other) else: result = func(values, other) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 93514a8a422151..aac8f785f3d992 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -2,6 +2,7 @@ from __future__ import print_function +import warnings from datetime import timedelta from distutils.version import LooseVersion import sys @@ -102,7 +103,6 @@ def test_corr_int(self): # dtypes other than float64 #1761 df3 = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}) - # it works! 
df3.cov() df3.corr() @@ -117,7 +117,11 @@ def test_corr_int_and_boolean(self): expected = DataFrame(np.ones((2, 2)), index=[ 'a', 'b'], columns=['a', 'b']) for meth in ['pearson', 'kendall', 'spearman']: - tm.assert_frame_equal(df.corr(meth), expected) + + # RuntimeWarning + with warnings.catch_warnings(record=True): + result = df.corr(meth) + tm.assert_frame_equal(result, expected) def test_corr_cov_independent_index_column(self): # GH 14617 diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index d6bdb764f1c8e9..055a490bc6b5d8 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -23,6 +23,19 @@ PossiblePrecisionLoss, StataMissingValue) +@pytest.fixture +def dirpath(): + return tm.get_data_path() + + +@pytest.fixture +def parsed_114(dirpath): + dta14_114 = os.path.join(dirpath, 'stata5_114.dta') + parsed_114 = read_stata(dta14_114, convert_dates=True) + parsed_114.index.name = 'index' + return parsed_114 + + class TestStata(object): def setup_method(self, method): @@ -108,10 +121,12 @@ def test_data_method(self): parsed_114_read = rdr.read() tm.assert_frame_equal(parsed_114_data, parsed_114_read) - def test_read_dta1(self): + @pytest.mark.parametrize( + 'file', ['dta1_114', 'dta1_117']) + def test_read_dta1(self, file): - parsed_114 = self.read_dta(self.dta1_114) - parsed_117 = self.read_dta(self.dta1_117) + file = getattr(self, file) + parsed = self.read_dta(file) # Pandas uses np.nan as missing value. # Thus, all columns will be of type float, regardless of their name. @@ -123,8 +138,7 @@ def test_read_dta1(self): # the casting doesn't fail so need to match stata here expected['float_miss'] = expected['float_miss'].astype(np.float32) - tm.assert_frame_equal(parsed_114, expected) - tm.assert_frame_equal(parsed_117, expected) + tm.assert_frame_equal(parsed, expected) def test_read_dta2(self): if LooseVersion(sys.version) < '2.7': @@ -193,11 +207,12 @@ def test_read_dta2(self): tm.assert_frame_equal(parsed_117, expected, check_datetimelike_compat=True) - def test_read_dta3(self): - parsed_113 = self.read_dta(self.dta3_113) - parsed_114 = self.read_dta(self.dta3_114) - parsed_115 = self.read_dta(self.dta3_115) - parsed_117 = self.read_dta(self.dta3_117) + @pytest.mark.parametrize( + 'file', ['dta3_113', 'dta3_114', 'dta3_115', 'dta3_117']) + def test_read_dta3(self, file): + + file = getattr(self, file) + parsed = self.read_dta(file) # match stata here expected = self.read_csv(self.csv3) @@ -205,16 +220,14 @@ def test_read_dta3(self): expected['year'] = expected['year'].astype(np.int16) expected['quarter'] = expected['quarter'].astype(np.int8) - tm.assert_frame_equal(parsed_113, expected) - tm.assert_frame_equal(parsed_114, expected) - tm.assert_frame_equal(parsed_115, expected) - tm.assert_frame_equal(parsed_117, expected) + tm.assert_frame_equal(parsed, expected) + + @pytest.mark.parametrize( + 'file', ['dta4_113', 'dta4_114', 'dta4_115', 'dta4_117']) + def test_read_dta4(self, file): - def test_read_dta4(self): - parsed_113 = self.read_dta(self.dta4_113) - parsed_114 = self.read_dta(self.dta4_114) - parsed_115 = self.read_dta(self.dta4_115) - parsed_117 = self.read_dta(self.dta4_117) + file = getattr(self, file) + parsed = self.read_dta(file) expected = DataFrame.from_records( [ @@ -237,10 +250,7 @@ def test_read_dta4(self): for col in expected], axis=1) # stata doesn't save .category metadata - tm.assert_frame_equal(parsed_113, expected, check_categorical=False) - tm.assert_frame_equal(parsed_114, expected, check_categorical=False) - 
tm.assert_frame_equal(parsed_115, expected, check_categorical=False) - tm.assert_frame_equal(parsed_117, expected, check_categorical=False) + tm.assert_frame_equal(parsed, expected, check_categorical=False) # File containing strls def test_read_dta12(self): @@ -427,7 +437,13 @@ def test_read_write_dta13(self): tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) - def test_read_write_reread_dta14(self): + @pytest.mark.parametrize( + 'file', ['dta14_113', 'dta14_114', 'dta14_115', 'dta14_117']) + def test_read_write_reread_dta14(self, file, parsed_114): + file = getattr(self, file) + parsed = self.read_dta(file) + parsed.index.name = 'index' + expected = self.read_csv(self.csv14) cols = ['byte_', 'int_', 'long_', 'float_', 'double_'] for col in cols: @@ -436,18 +452,7 @@ def test_read_write_reread_dta14(self): expected['date_td'] = pd.to_datetime( expected['date_td'], errors='coerce') - parsed_113 = self.read_dta(self.dta14_113) - parsed_113.index.name = 'index' - parsed_114 = self.read_dta(self.dta14_114) - parsed_114.index.name = 'index' - parsed_115 = self.read_dta(self.dta14_115) - parsed_115.index.name = 'index' - parsed_117 = self.read_dta(self.dta14_117) - parsed_117.index.name = 'index' - - tm.assert_frame_equal(parsed_114, parsed_113) - tm.assert_frame_equal(parsed_114, parsed_115) - tm.assert_frame_equal(parsed_114, parsed_117) + tm.assert_frame_equal(parsed_114, parsed) with tm.ensure_clean() as path: parsed_114.to_stata(path, {'date_td': 'td'}) @@ -455,7 +460,10 @@ def test_read_write_reread_dta14(self): tm.assert_frame_equal( written_and_read_again.set_index('index'), parsed_114) - def test_read_write_reread_dta15(self): + @pytest.mark.parametrize( + 'file', ['dta15_113', 'dta15_114', 'dta15_115', 'dta15_117']) + def test_read_write_reread_dta15(self, file): + expected = self.read_csv(self.csv15) expected['byte_'] = expected['byte_'].astype(np.int8) expected['int_'] = expected['int_'].astype(np.int16) @@ -465,15 +473,10 @@ def test_read_write_reread_dta15(self): expected['date_td'] = expected['date_td'].apply( datetime.strptime, args=('%Y-%m-%d',)) - parsed_113 = self.read_dta(self.dta15_113) - parsed_114 = self.read_dta(self.dta15_114) - parsed_115 = self.read_dta(self.dta15_115) - parsed_117 = self.read_dta(self.dta15_117) + file = getattr(self, file) + parsed = self.read_dta(file) - tm.assert_frame_equal(expected, parsed_114) - tm.assert_frame_equal(parsed_113, parsed_114) - tm.assert_frame_equal(parsed_114, parsed_115) - tm.assert_frame_equal(parsed_114, parsed_117) + tm.assert_frame_equal(expected, parsed) def test_timestamp_and_label(self): original = DataFrame([(1,)], columns=['variable']) @@ -710,7 +713,9 @@ def test_missing_value_generator(self): ' Date: Sat, 23 Sep 2017 13:56:11 -0500 Subject: [PATCH 132/188] Categorical type (#16015) Closes #14711 Closes #15078 Closes #14676 --- doc/source/advanced.rst | 4 +- doc/source/api.rst | 5 +- doc/source/categorical.rst | 103 ++++- doc/source/merging.rst | 11 +- doc/source/whatsnew/v0.21.0.txt | 27 ++ pandas/core/categorical.py | 357 ++++++++++-------- pandas/core/dtypes/common.py | 38 +- pandas/core/dtypes/dtypes.py | 217 ++++++++++- pandas/core/indexes/base.py | 15 +- pandas/core/indexes/category.py | 54 ++- pandas/core/indexes/interval.py | 3 +- pandas/core/indexes/multi.py | 2 +- pandas/core/indexes/range.py | 2 +- pandas/core/internals.py | 20 +- pandas/core/series.py | 3 +- pandas/core/sorting.py | 3 +- pandas/core/util/hashing.py | 2 +- pandas/tests/dtypes/test_common.py | 10 +- 
pandas/tests/dtypes/test_dtypes.py | 141 ++++++- pandas/tests/frame/test_analytics.py | 3 + pandas/tests/indexes/test_category.py | 10 +- .../tests/io/json/test_json_table_schema.py | 5 +- pandas/tests/io/test_parquet.py | 2 + pandas/tests/io/test_pytables.py | 10 +- pandas/tests/reshape/test_merge.py | 4 +- pandas/tests/series/test_analytics.py | 11 +- pandas/tests/series/test_constructors.py | 21 ++ pandas/tests/series/test_dtypes.py | 34 +- pandas/tests/test_algos.py | 72 ++-- pandas/tests/test_categorical.py | 182 ++++++++- pandas/util/testing.py | 9 +- 31 files changed, 1092 insertions(+), 288 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 3bda8c7eacb61b..799d04859cc2ac 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -638,9 +638,11 @@ and allows efficient indexing and storage of an index with a large number of dup .. ipython:: python + from pandas.api.types import CategoricalDtype + df = pd.DataFrame({'A': np.arange(6), 'B': list('aabbca')}) - df['B'] = df['B'].astype('category', categories=list('cab')) + df['B'] = df['B'].astype(CategoricalDtype(list('cab'))) df df.dtypes df.B.cat.categories diff --git a/doc/source/api.rst b/doc/source/api.rst index 96c7f68f57aaaa..4ffeb5035912f5 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -646,7 +646,10 @@ strings and apply several methods to it. These can be accessed like Categorical ~~~~~~~~~~~ -If the Series is of dtype ``category``, ``Series.cat`` can be used to change the the categorical +.. autoclass:: api.types.CategoricalDtype + :members: categories, ordered + +If the Series is of dtype ``CategoricalDtype``, ``Series.cat`` can be used to change the categorical data. This accessor is similar to the ``Series.dt`` or ``Series.str`` and has the following usable methods and properties: diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index ff5e550ebd97f4..cadbc895354b71 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -89,12 +89,22 @@ By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to df["B"] = raw_cat df -You can also specify differently ordered categories or make the resulting data ordered, by passing these arguments to ``astype()``: +Anywhere above we passed a keyword ``dtype='category'``, we used the default behavior of + +1. categories are inferred from the data +2. categories are unordered. + +To control those behaviors, instead of passing ``'category'``, use an instance +of :class:`~pandas.api.types.CategoricalDtype`. .. ipython:: python - s = pd.Series(["a","b","c","a"]) - s_cat = s.astype("category", categories=["b","c","d"], ordered=False) + from pandas.api.types import CategoricalDtype + + s = pd.Series(["a", "b", "c", "a"]) + cat_type = CategoricalDtype(categories=["b", "c", "d"], + ordered=True) + s_cat = s.astype(cat_type) s_cat Categorical data has a specific ``category`` :ref:`dtype `: @@ -133,6 +143,75 @@ constructor to save the factorize step during normal constructor mode: splitter = np.random.choice([0,1], 5, p=[0.5,0.5]) s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"])) +.. _categorical.categoricaldtype: + +CategoricalDtype +---------------- + +.. versionchanged:: 0.21.0 + +A categorical's type is fully described by + +1. ``categories``: a sequence of unique values and no missing values +2. ``ordered``: a boolean + +This information can be stored in a :class:`~pandas.api.types.CategoricalDtype`. 
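+
+For example (a brief sketch; any values not listed in ``categories`` become
+``NaN`` once the dtype is applied):
+
+.. ipython:: python
+
+   cat_type = CategoricalDtype(categories=["a", "b"], ordered=True)
+   pd.Series(["a", "b", "a", "c"], dtype=cat_type)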
+The ``categories`` argument is optional, which implies that the actual categories +should be inferred from whatever is present in the data when the +:class:`pandas.Categorical` is created. The categories are assumed to be unordered +by default. + +.. ipython:: python + + from pandas.api.types import CategoricalDtype + + CategoricalDtype(['a', 'b', 'c']) + CategoricalDtype(['a', 'b', 'c'], ordered=True) + CategoricalDtype() + +A :class:`~pandas.api.types.CategoricalDtype` can be used in any place pandas +expects a `dtype`. For example :func:`pandas.read_csv`, +:func:`pandas.DataFrame.astype`, or in the Series constructor. + +.. note:: + + As a convenience, you can use the string ``'category'`` in place of a + :class:`~pandas.api.types.CategoricalDtype` when you want the default behavior of + the categories being unordered, and equal to the set values present in the + array. In other words, ``dtype='category'`` is equivalent to + ``dtype=CategoricalDtype()``. + +Equality Semantics +~~~~~~~~~~~~~~~~~~ + +Two instances of :class:`~pandas.api.types.CategoricalDtype` compare equal +whenever they have the same categories and orderedness. When comparing two +unordered categoricals, the order of the ``categories`` is not considered + +.. ipython:: python + + c1 = CategoricalDtype(['a', 'b', 'c'], ordered=False) + + # Equal, since order is not considered when ordered=False + c1 == CategoricalDtype(['b', 'c', 'a'], ordered=False) + + # Unequal, since the second CategoricalDtype is ordered + c1 == CategoricalDtype(['a', 'b', 'c'], ordered=True) + +All instances of ``CategoricalDtype`` compare equal to the string ``'category'`` + +.. ipython:: python + + c1 == 'category' + +.. warning:: + + Since ``dtype='category'`` is essentially ``CategoricalDtype(None, False)``, + and since all instances ``CategoricalDtype`` compare equal to ``'category'``, + all instances of ``CategoricalDtype`` compare equal to a + ``CategoricalDtype(None, False)``, regardless of ``categories`` or + ``ordered``. + Description ----------- @@ -184,7 +263,7 @@ It's also possible to pass in the categories in a specific order: .. ipython:: python - s = pd.Series(list('babc')).astype('category', categories=list('abcd')) + s = pd.Series(list('babc')).astype(CategoricalDtype(list('abcd'))) s # categories @@ -301,7 +380,9 @@ meaning and certain operations are possible. If the categorical is unordered, `` s = pd.Series(pd.Categorical(["a","b","c","a"], ordered=False)) s.sort_values(inplace=True) - s = pd.Series(["a","b","c","a"]).astype('category', ordered=True) + s = pd.Series(["a","b","c","a"]).astype( + CategoricalDtype(ordered=True) + ) s.sort_values(inplace=True) s s.min(), s.max() @@ -401,9 +482,15 @@ categories or a categorical with any list-like object, will raise a TypeError. .. ipython:: python - cat = pd.Series([1,2,3]).astype("category", categories=[3,2,1], ordered=True) - cat_base = pd.Series([2,2,2]).astype("category", categories=[3,2,1], ordered=True) - cat_base2 = pd.Series([2,2,2]).astype("category", ordered=True) + cat = pd.Series([1,2,3]).astype( + CategoricalDtype([3, 2, 1], ordered=True) + ) + cat_base = pd.Series([2,2,2]).astype( + CategoricalDtype([3, 2, 1], ordered=True) + ) + cat_base2 = pd.Series([2,2,2]).astype( + CategoricalDtype(ordered=True) + ) cat cat_base diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 72787ea97a7824..ad40c75a62722c 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -830,8 +830,10 @@ The left frame. .. 
ipython:: python + from pandas.api.types import CategoricalDtype + X = pd.Series(np.random.choice(['foo', 'bar'], size=(10,))) - X = X.astype('category', categories=['foo', 'bar']) + X = X.astype(CategoricalDtype(categories=['foo', 'bar'])) left = pd.DataFrame({'X': X, 'Y': np.random.choice(['one', 'two', 'three'], size=(10,))}) @@ -842,8 +844,11 @@ The right frame. .. ipython:: python - right = pd.DataFrame({'X': pd.Series(['foo', 'bar']).astype('category', categories=['foo', 'bar']), - 'Z': [1, 2]}) + right = pd.DataFrame({ + 'X': pd.Series(['foo', 'bar'], + dtype=CategoricalDtype(['foo', 'bar'])), + 'Z': [1, 2] + }) right right.dtypes diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 32e4294f06d6bb..261e12b8245094 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -10,6 +10,8 @@ users upgrade to this version. Highlights include: - Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here `. +- New user-facing :class:`pandas.api.types.CategoricalDtype` for specifying + categoricals independent of the data, see :ref:`here `. Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. @@ -89,6 +91,31 @@ This does not raise any obvious exceptions, but also does not create a new colum Setting a list-like data structure into a new attribute now raise a ``UserWarning`` about the potential for unexpected behavior. See :ref:`Attribute Access `. +.. _whatsnew_0210.enhancements.categorical_dtype: + +``CategoricalDtype`` for specifying categoricals +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`pandas.api.types.CategoricalDtype` has been added to the public API and +expanded to include the ``categories`` and ``ordered`` attributes. A +``CategoricalDtype`` can be used to specify the set of categories and +orderedness of an array, independent of the data themselves. This can be useful, +e.g., when converting string data to a ``Categorical`` (:issue:`14711`, +:issue:`15078`, :issue:`16015`): + +.. ipython:: python + + from pandas.api.types import CategoricalDtype + + s = pd.Series(['a', 'b', 'c', 'a']) # strings + dtype = CategoricalDtype(categories=['a', 'b', 'c', 'd'], ordered=True) + s.astype(dtype) + +The ``.dtype`` property of a ``Categorical``, ``CategoricalIndex`` or a +``Series`` with categorical type will now return an instance of ``CategoricalDtype``. + +See the :ref:`CategoricalDtype docs ` for more. + .. _whatsnew_0210.enhancements.other: Other Enhancements diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 6f7eafe43dbbb2..98d6d7a68017ad 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -23,7 +23,7 @@ is_datetimelike, is_categorical, is_categorical_dtype, - is_integer_dtype, is_bool, + is_integer_dtype, is_list_like, is_sequence, is_scalar, is_dict_like) @@ -140,33 +140,6 @@ def maybe_to_categorical(array): setter to change values in the categorical. """ -_categories_doc = """The categories of this categorical. - -Setting assigns new values to each category (effectively a rename of -each individual category). - -The assigned value has to be a list-like object. All items must be unique and -the number of items in the new categories must be the same as the number of -items in the old categories. - -Assigning to `categories` is a inplace operation! 
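A hedged sketch of the rename-by-assignment behavior described above (made-up values; assuming the 0.21.0-era API):

    import pandas as pd

    c = pd.Categorical(['a', 'b', 'a'])
    c.categories = ['x', 'y']   # in-place rename: 'a' -> 'x', 'b' -> 'y'
    # assigning a list of a different length raises ValueError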
- -Raises ------- -ValueError - If the new categories do not validate as categories or if the number of new - categories is unequal the number of old categories - -See also --------- -rename_categories -reorder_categories -add_categories -remove_categories -remove_unused_categories -set_categories -""" - class Categorical(PandasObject): """ @@ -193,6 +166,10 @@ class Categorical(PandasObject): ordered : boolean, (default False) Whether or not this categorical is treated as a ordered categorical. If not given, the resulting categorical will not be ordered. + dtype : CategoricalDtype + An instance of ``CategoricalDtype`` to use for this categorical + + .. versionadded:: 0.21.0 Attributes ---------- @@ -203,6 +180,11 @@ class Categorical(PandasObject): categorical, read only. ordered : boolean Whether or not this Categorical is ordered. + dtype : CategoricalDtype + The instance of ``CategoricalDtype`` storing the ``categories`` + and ``ordered``. + + .. versionadded:: 0.21.0 Raises ------ @@ -212,7 +194,6 @@ class Categorical(PandasObject): If an explicit ``ordered=True`` is given but no `categories` and the `values` are not sortable. - Examples -------- >>> from pandas import Categorical @@ -224,17 +205,17 @@ class Categorical(PandasObject): [a, b, c, a, b, c] Categories (3, object): [a < b < c] + Only ordered `Categoricals` can be sorted (according to the order + of the categories) and have a min and max value. + >>> a = Categorical(['a','b','c','a','b','c'], ['c', 'b', 'a'], ordered=True) >>> a.min() 'c' - """ - dtype = CategoricalDtype() - """The dtype (always "category")""" - """Whether or not this Categorical is ordered. - Only ordered `Categoricals` can be sorted (according to the order - of the categories) and have a min and max value. + Notes + ----- + See the :ref:`user guide ` for more. See also -------- @@ -242,23 +223,58 @@ class Categorical(PandasObject): Categorical.order Categorical.min Categorical.max + pandas.api.types.CategoricalDtype """ # For comparisons, so that numpy uses our implementation if the compare # ops, which raise __array_priority__ = 1000 + _dtype = CategoricalDtype() _typ = 'categorical' - def __init__(self, values, categories=None, ordered=False, fastpath=False): + def __init__(self, values, categories=None, ordered=None, dtype=None, + fastpath=False): + + # Ways of specifying the dtype (prioritized ordered) + # 1. dtype is a CategoricalDtype + # a.) with known categories, use dtype.categories + # b.) else with Categorical values, use values.dtype + # c.) else, infer from values + # d.) specifying dtype=CategoricalDtype and categories is an error + # 2. dtype is a string 'category' + # a.) use categories, ordered + # b.) use values.dtype + # c.) infer from values + # 3. dtype is None + # a.) use categories, ordered + # b.) use values.dtype + # c.) 
infer from values + + if dtype is not None: + if isinstance(dtype, compat.string_types): + if dtype == 'category': + dtype = CategoricalDtype(categories, ordered) + else: + raise ValueError("Unknown `dtype` {}".format(dtype)) + elif categories is not None or ordered is not None: + raise ValueError("Cannot specify both `dtype` and `categories`" + " or `ordered`.") + + categories = dtype.categories + ordered = dtype.ordered + + elif is_categorical(values): + dtype = values.dtype._from_categorical_dtype(values.dtype, + categories, ordered) + else: + dtype = CategoricalDtype(categories, ordered) - self._validate_ordered(ordered) + # At this point, dtype is always a CategoricalDtype + # if dtype.categories is None, we are inferring if fastpath: - # fast path self._codes = coerce_indexer_dtype(values, categories) - self._categories = self._validate_categories( - categories, fastpath=isinstance(categories, ABCIndexClass)) - self._ordered = ordered + self._dtype = dtype return # sanitize input @@ -275,6 +291,7 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): values = values.get_values() elif isinstance(values, (ABCIndexClass, ABCSeries)): + # we'll do inference later pass else: @@ -292,12 +309,12 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): # "object" dtype to prevent this. In the end objects will be # casted to int/... in the category assignment step. if len(values) == 0 or isna(values).any(): - dtype = 'object' + sanitize_dtype = 'object' else: - dtype = None - values = _sanitize_array(values, None, dtype=dtype) + sanitize_dtype = None + values = _sanitize_array(values, None, dtype=sanitize_dtype) - if categories is None: + if dtype.categories is None: try: codes, categories = factorize(values, sort=True) except TypeError: @@ -314,7 +331,9 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): raise NotImplementedError("> 1 ndim Categorical are not " "supported at this time") - categories = self._validate_categories(categories) + if dtype.categories is None: + # we're inferring from values + dtype = CategoricalDtype(categories, ordered) else: # there were two ways if categories are present @@ -324,14 +343,12 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): # - the new one, where each value is also in the categories array # (or np.nan) - # make sure that we always have the same type here, no matter what - # we get passed in - categories = self._validate_categories(categories) - codes = _get_codes_for_values(values, categories) + codes = _get_codes_for_values(values, dtype.categories) # TODO: check for old style usage. These warnings should be removes # after 0.18/ in 2016 - if is_integer_dtype(values) and not is_integer_dtype(categories): + if (is_integer_dtype(values) and + not is_integer_dtype(dtype.categories)): warn("Values and categories have different dtypes. Did you " "mean to use\n'Categorical.from_codes(codes, " "categories)'?", RuntimeWarning, stacklevel=2) @@ -342,9 +359,57 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): "mean to use\n'Categorical.from_codes(codes, " "categories)'?", RuntimeWarning, stacklevel=2) - self.set_ordered(ordered or False, inplace=True) - self._categories = categories - self._codes = coerce_indexer_dtype(codes, categories) + self._dtype = dtype + self._codes = coerce_indexer_dtype(codes, dtype.categories) + + @property + def categories(self): + """The categories of this categorical. 
+ + Setting assigns new values to each category (effectively a rename of + each individual category). + + The assigned value has to be a list-like object. All items must be + unique and the number of items in the new categories must be the same + as the number of items in the old categories. + + Assigning to `categories` is a inplace operation! + + Raises + ------ + ValueError + If the new categories do not validate as categories or if the + number of new categories is unequal the number of old categories + + See also + -------- + rename_categories + reorder_categories + add_categories + remove_categories + remove_unused_categories + set_categories + """ + return self.dtype.categories + + @categories.setter + def categories(self, categories): + new_dtype = CategoricalDtype(categories, ordered=self.ordered) + if (self.dtype.categories is not None and + len(self.dtype.categories) != len(new_dtype.categories)): + raise ValueError("new categories need to have the same number of " + "items as the old categories!") + self._dtype = new_dtype + + @property + def ordered(self): + """Whether the categories have an ordered relationship""" + return self.dtype.ordered + + @property + def dtype(self): + """The :ref:`~pandas.api.types.CategoricalDtype` for this instance""" + return self._dtype def __dir__(self): # Avoid IPython warnings for deprecated properties @@ -493,7 +558,7 @@ def from_codes(cls, codes, categories, ordered=False): raise ValueError( "codes need to be convertible to an arrays of integers") - categories = cls._validate_categories(categories) + categories = CategoricalDtype._validate_categories(categories) if len(codes) and (codes.max() >= len(categories) or codes.min() < -1): raise ValueError("codes need to be between -1 and " @@ -536,94 +601,38 @@ def _get_labels(self): labels = property(fget=_get_labels, fset=_set_codes) - _categories = None - - @classmethod - def _validate_ordered(cls, ordered): - """ - Validates that we have a valid ordered parameter. If - it is not a boolean, a TypeError will be raised. - - Parameters - ---------- - ordered : object - The parameter to be verified. - - Raises - ------ - TypeError - If 'ordered' is not a boolean. - """ - if not is_bool(ordered): - raise TypeError("'ordered' must either be 'True' or 'False'") - - @classmethod - def _validate_categories(cls, categories, fastpath=False): - """ - Validates that we have good categories - - Parameters - ---------- - fastpath : boolean (default: False) - Don't perform validation of the categories for uniqueness or nulls - - """ - if not isinstance(categories, ABCIndexClass): - dtype = None - if not hasattr(categories, "dtype"): - if not is_list_like(categories): - raise TypeError("`categories` must be list-like. " - "Got {} instead".format(repr(categories))) - categories = _convert_to_list_like(categories) - # On categories with NaNs, int values would be converted to - # float. Use "object" dtype to prevent this. - if isna(categories).any(): - without_na = np.array([x for x in categories - if notna(x)]) - with_na = np.array(categories) - if with_na.dtype != without_na.dtype: - dtype = "object" - - from pandas import Index - categories = Index(categories, dtype=dtype) - - if not fastpath: - - # Categories cannot contain NaN. - if categories.hasnans: - raise ValueError('Categorial categories cannot be null') - - # Categories must be unique. 
- if not categories.is_unique: - raise ValueError('Categorical categories must be unique') - - return categories - def _set_categories(self, categories, fastpath=False): - """ Sets new categories + """ Sets new categories inplace Parameters ---------- fastpath : boolean (default: False) Don't perform validation of the categories for uniqueness or nulls + Examples + -------- + >>> c = Categorical(['a', 'b']) + >>> c + [a, b] + Categories (2, object): [a, b] + + >>> c._set_categories(pd.Index(['a', 'c'])) + >>> c + [a, c] + Categories (2, object): [a, c] """ - categories = self._validate_categories(categories, fastpath=fastpath) - if (not fastpath and self._categories is not None and - len(categories) != len(self._categories)): + if fastpath: + new_dtype = CategoricalDtype._from_fastpath(categories, + self.ordered) + else: + new_dtype = CategoricalDtype(categories, ordered=self.ordered) + if (not fastpath and self.dtype.categories is not None and + len(new_dtype.categories) != len(self.dtype.categories)): raise ValueError("new categories need to have the same number of " "items than the old categories!") - self._categories = categories - - def _get_categories(self): - """ Gets the categories """ - # categories is an Index, which is immutable -> no need to copy - return self._categories - - categories = property(fget=_get_categories, fset=_set_categories, - doc=_categories_doc) + self._dtype = new_dtype def _codes_for_groupby(self, sort): """ @@ -665,7 +674,21 @@ def _codes_for_groupby(self, sort): return self.reorder_categories(cat.categories) - _ordered = None + def _set_dtype(self, dtype): + """Internal method for directly updating the CategoricalDtype + + Parameters + ---------- + dtype : CategoricalDtype + + Notes + ----- + We don't do any validation here. It's assumed that the dtype is + a (valid) instance of `CategoricalDtype`. + """ + codes = _recode_for_categories(self.codes, self.categories, + dtype.categories) + return type(self)(codes, dtype=dtype, fastpath=True) def set_ordered(self, value, inplace=False): """ @@ -680,9 +703,9 @@ def set_ordered(self, value, inplace=False): of this categorical with ordered set to the value """ inplace = validate_bool_kwarg(inplace, 'inplace') - self._validate_ordered(value) + new_dtype = CategoricalDtype(self.categories, ordered=value) cat = self if inplace else self.copy() - cat._ordered = value + cat._dtype = new_dtype if not inplace: return cat @@ -712,12 +735,6 @@ def as_unordered(self, inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') return self.set_ordered(False, inplace=inplace) - def _get_ordered(self): - """ Gets the ordered attribute """ - return self._ordered - - ordered = property(fget=_get_ordered) - def set_categories(self, new_categories, ordered=None, rename=False, inplace=False): """ Sets the categories to the specified new_categories. 
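For orientation, a small usage sketch of set_categories (hypothetical data, not part of the patch):

    import pandas as pd

    c = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b'])
    c2 = c.set_categories(['b', 'a', 'c'], ordered=True)
    # the values are re-coded against the new categories, and the
    # result becomes ordered with b < a < c
    c2.max()  # 'a'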
@@ -770,22 +787,21 @@ def set_categories(self, new_categories, ordered=None, rename=False, remove_unused_categories """ inplace = validate_bool_kwarg(inplace, 'inplace') - new_categories = self._validate_categories(new_categories) + if ordered is None: + ordered = self.dtype.ordered + new_dtype = CategoricalDtype(new_categories, ordered=ordered) + cat = self if inplace else self.copy() if rename: - if (cat._categories is not None and - len(new_categories) < len(cat._categories)): + if (cat.dtype.categories is not None and + len(new_dtype.categories) < len(cat.dtype.categories)): # remove all _codes which are larger and set to -1/NaN - self._codes[self._codes >= len(new_categories)] = -1 + self._codes[self._codes >= len(new_dtype.categories)] = -1 else: codes = _recode_for_categories(self.codes, self.categories, - new_categories) + new_dtype.categories) cat._codes = codes - cat._categories = new_categories - - if ordered is None: - ordered = self.ordered - cat.set_ordered(ordered, inplace=True) + cat._dtype = new_dtype if not inplace: return cat @@ -871,7 +887,7 @@ def reorder_categories(self, new_categories, ordered=None, inplace=False): set_categories """ inplace = validate_bool_kwarg(inplace, 'inplace') - if set(self._categories) != set(new_categories): + if set(self.dtype.categories) != set(new_categories): raise ValueError("items in new_categories are not the same as in " "old categories") return self.set_categories(new_categories, ordered=ordered, @@ -912,15 +928,17 @@ def add_categories(self, new_categories, inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') if not is_list_like(new_categories): new_categories = [new_categories] - already_included = set(new_categories) & set(self._categories) + already_included = set(new_categories) & set(self.dtype.categories) if len(already_included) != 0: msg = ("new categories must not include old categories: %s" % str(already_included)) raise ValueError(msg) - new_categories = list(self._categories) + list(new_categories) + new_categories = list(self.dtype.categories) + list(new_categories) + new_dtype = CategoricalDtype(new_categories, self.ordered) + cat = self if inplace else self.copy() - cat._categories = self._validate_categories(new_categories) - cat._codes = coerce_indexer_dtype(cat._codes, new_categories) + cat._dtype = new_dtype + cat._codes = coerce_indexer_dtype(cat._codes, new_dtype.categories) if not inplace: return cat @@ -960,8 +978,9 @@ def remove_categories(self, removals, inplace=False): removals = [removals] removal_set = set(list(removals)) - not_included = removal_set - set(self._categories) - new_categories = [c for c in self._categories if c not in removal_set] + not_included = removal_set - set(self.dtype.categories) + new_categories = [c for c in self.dtype.categories + if c not in removal_set] # GH 10156 if any(isna(removals)): @@ -1003,8 +1022,11 @@ def remove_unused_categories(self, inplace=False): if idx.size != 0 and idx[0] == -1: # na sentinel idx, inv = idx[1:], inv - 1 - cat._categories = cat.categories.take(idx) - cat._codes = coerce_indexer_dtype(inv, self._categories) + new_categories = cat.dtype.categories.take(idx) + new_dtype = CategoricalDtype._from_fastpath(new_categories, + ordered=self.ordered) + cat._dtype = new_dtype + cat._codes = coerce_indexer_dtype(inv, new_dtype.categories) if not inplace: return cat @@ -1105,7 +1127,7 @@ def __setstate__(self, state): # Provide compatibility with pre-0.15.0 Categoricals. 
         if '_categories' not in state and '_levels' in state:
-            state['_categories'] = self._validate_categories(state.pop(
+            state['_categories'] = self.dtype._validate_categories(state.pop(
                 '_levels'))
         if '_codes' not in state and 'labels' in state:
             state['_codes'] = coerce_indexer_dtype(
@@ -1120,6 +1142,11 @@ def __setstate__(self, state):
         else:
             state['_ordered'] = False

+        # 0.21.0 CategoricalDtype change
+        if '_dtype' not in state:
+            state['_dtype'] = CategoricalDtype(state['_categories'],
+                                               state['_ordered'])
+
         for k, v in compat.iteritems(state):
             setattr(self, k, v)

@@ -1129,7 +1156,7 @@ def T(self):

     @property
     def nbytes(self):
-        return self._codes.nbytes + self._categories.values.nbytes
+        return self._codes.nbytes + self.dtype.categories.values.nbytes

     def memory_usage(self, deep=False):
         """
@@ -1154,7 +1181,8 @@ def memory_usage(self, deep=False):
         --------
         numpy.ndarray.nbytes
         """
-        return self._codes.nbytes + self._categories.memory_usage(deep=deep)
+        return self._codes.nbytes + self.dtype.categories.memory_usage(
+            deep=deep)

     @Substitution(klass='Categorical')
     @Appender(_shared_docs['searchsorted'])
@@ -1285,7 +1313,7 @@ def value_counts(self, dropna=True):
             count = bincount(np.where(mask, code, ncat))
             ix = np.append(ix, -1)

-        ix = self._constructor(ix, categories=cat, ordered=obj.ordered,
+        ix = self._constructor(ix, dtype=self.dtype,
                                fastpath=True)

         return Series(count, index=CategoricalIndex(ix), dtype='int64')
@@ -1998,8 +2026,7 @@ def is_dtype_equal(self, other):
         """
         try:
-            return (self.categories.equals(other.categories) and
-                    self.ordered == other.ordered)
+            return hash(self.dtype) == hash(other.dtype)
         except (AttributeError, TypeError):
             return False

diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index c47e61dc446be2..f60c0d5ffdca0b 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -692,6 +692,40 @@ def is_dtype_equal(source, target):
         return False


+def is_dtype_union_equal(source, target):
+    """
+    Check whether two arrays have compatible dtypes to do a union.
+    numpy types are checked with ``is_dtype_equal``. Extension types are
+    checked separately.
+
+    Parameters
+    ----------
+    source : The first dtype to compare
+    target : The second dtype to compare
+
+    Returns
+    -------
+    boolean : Whether or not the two dtypes are union-compatible.
+
+    >>> is_dtype_union_equal("int", int)
+    True
+
+    >>> is_dtype_union_equal(CategoricalDtype(['a', 'b']),
+    ...                      CategoricalDtype(['b', 'c']))
+    True
+
+    >>> is_dtype_union_equal(CategoricalDtype(['a', 'b']),
+    ...                      CategoricalDtype(['b', 'c'], ordered=True))
+    False
+    """
+    source = _get_dtype(source)
+    target = _get_dtype(target)
+    if is_categorical_dtype(source) and is_categorical_dtype(target):
+        # union is only defined when orderedness matches
+        return source.ordered is target.ordered
+    return is_dtype_equal(source, target)
+
+
 def is_any_int_dtype(arr_or_dtype):
     """
     DEPRECATED: This function will be removed in a future version.
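A short sketch of the intended union semantics (assuming is_dtype_union_equal is importable from pandas.core.dtypes.common at this development stage):

    from pandas.core.dtypes.common import is_dtype_union_equal
    from pandas.api.types import CategoricalDtype

    a = CategoricalDtype(['a', 'b'])
    b = CategoricalDtype(['b', 'c'])
    # unordered categoricals can union even with different categories
    is_dtype_union_equal(a, b)   # True
    # mixing unordered and ordered is not union-compatible
    is_dtype_union_equal(a, CategoricalDtype(['b', 'c'], ordered=True))  # False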
@@ -1671,7 +1705,9 @@ def _coerce_to_dtype(dtype): """ if is_categorical_dtype(dtype): - dtype = CategoricalDtype() + categories = getattr(dtype, 'categories', None) + ordered = getattr(dtype, 'ordered', False) + dtype = CategoricalDtype(categories=categories, ordered=ordered) elif is_datetime64tz_dtype(dtype): dtype = DatetimeTZDtype(dtype) elif is_period_dtype(dtype): diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index dc2c56ea476f9d..d2487905caced2 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -3,6 +3,7 @@ import re import numpy as np from pandas import compat +from pandas.core.dtypes.generic import ABCIndexClass class ExtensionDtype(object): @@ -110,37 +111,161 @@ class CategoricalDtypeType(type): class CategoricalDtype(ExtensionDtype): """ - A np.dtype duck-typed class, suitable for holding a custom categorical - dtype. - - THIS IS NOT A REAL NUMPY DTYPE, but essentially a sub-class of np.object + Type for categorical data with the categories and orderedness + + .. versionchanged:: 0.21.0 + + Parameters + ---------- + categories : sequence, optional + Must be unique, and must not contain any nulls. + ordered : bool, default False + + Notes + ----- + This class is useful for specifying the type of a ``Categorical`` + independent of the values. See :ref:`categorical.categoricaldtype` + for more. + + Examples + -------- + >>> t = CategoricalDtype(categories=['b', 'a'], ordered=True) + >>> pd.Series(['a', 'b', 'a', 'c'], dtype=t) + 0 a + 1 b + 2 a + 3 NaN + dtype: category + Categories (2, object): [b < a] + + See Also + -------- + Categorical """ + # TODO: Document public vs. private API name = 'category' type = CategoricalDtypeType kind = 'O' str = '|O08' base = np.dtype('O') - _metadata = [] + _metadata = ['categories', 'ordered'] _cache = {} - def __new__(cls): + def __init__(self, categories=None, ordered=False): + self._finalize(categories, ordered, fastpath=False) - try: - return cls._cache[cls.name] - except KeyError: - c = object.__new__(cls) - cls._cache[cls.name] = c - return c + @classmethod + def _from_fastpath(cls, categories=None, ordered=False): + self = cls.__new__(cls) + self._finalize(categories, ordered, fastpath=True) + return self + + @classmethod + def _from_categorical_dtype(cls, dtype, categories=None, ordered=None): + if categories is ordered is None: + return dtype + if categories is None: + categories = dtype.categories + if ordered is None: + ordered = dtype.ordered + return cls(categories, ordered) + + def _finalize(self, categories, ordered, fastpath=False): + from pandas.core.indexes.base import Index + + if ordered is None: + ordered = False + + if categories is not None: + categories = Index(categories, tupleize_cols=False) + # validation + self._validate_categories(categories) + self._validate_ordered(ordered) + self._categories = categories + self._ordered = ordered + + def __setstate__(self, state): + self._categories = state.pop('categories', None) + self._ordered = state.pop('ordered', False) def __hash__(self): - # make myself hashable - return hash(str(self)) + # _hash_categories returns a uint64, so use the negative + # space for when we have unknown categories to avoid a conflict + if self.categories is None: + if self.ordered: + return -1 + else: + return -2 + # We *do* want to include the real self.ordered here + return int(self._hash_categories(self.categories, self.ordered)) def __eq__(self, other): if isinstance(other, compat.string_types): return other == self.name - return 
isinstance(other, CategoricalDtype) + if not (hasattr(other, 'ordered') and hasattr(other, 'categories')): + return False + elif self.categories is None or other.categories is None: + # We're forced into a suboptimal corner thanks to math and + # backwards compatibility. We require that `CDT(...) == 'category'` + # for all CDTs **including** `CDT(None, ...)`. Therefore, *all* + # CDT(., .) = CDT(None, False) and *all* + # CDT(., .) = CDT(None, True). + return True + elif self.ordered: + return other.ordered and self.categories.equals(other.categories) + elif other.ordered: + return False + else: + # both unordered; this could probably be optimized / cached + return hash(self) == hash(other) + + def __unicode__(self): + tpl = u'CategoricalDtype(categories={}ordered={})' + if self.categories is None: + data = u"None, " + else: + data = self.categories._format_data(name=self.__class__.__name__) + return tpl.format(data, self.ordered) + + @staticmethod + def _hash_categories(categories, ordered=True): + from pandas.core.util.hashing import ( + hash_array, _combine_hash_arrays, hash_tuples + ) + + if len(categories) and isinstance(categories[0], tuple): + # assumes if any individual category is a tuple, then all our. ATM + # I don't really want to support just some of the categories being + # tuples. + categories = list(categories) # breaks if a np.array of categories + cat_array = hash_tuples(categories) + else: + if categories.dtype == 'O': + types = [type(x) for x in categories] + if not len(set(types)) == 1: + # TODO: hash_array doesn't handle mixed types. It casts + # everything to a str first, which means we treat + # {'1', '2'} the same as {'1', 2} + # find a better solution + cat_array = np.array([hash(x) for x in categories]) + hashed = hash((tuple(categories), ordered)) + return hashed + cat_array = hash_array(np.asarray(categories), categorize=False) + if ordered: + cat_array = np.vstack([ + cat_array, np.arange(len(cat_array), dtype=cat_array.dtype) + ]) + else: + cat_array = [cat_array] + hashed = _combine_hash_arrays(iter(cat_array), + num_items=len(cat_array)) + if len(hashed) == 0: + # bug in Numpy<1.12 for length 0 arrays. Just return the correct + # value of 0 + return 0 + else: + return np.bitwise_xor.reduce(hashed) @classmethod def construct_from_string(cls, string): @@ -154,6 +279,68 @@ def construct_from_string(cls, string): raise TypeError("cannot construct a CategoricalDtype") + @staticmethod + def _validate_ordered(ordered): + """ + Validates that we have a valid ordered parameter. If + it is not a boolean, a TypeError will be raised. + + Parameters + ---------- + ordered : object + The parameter to be verified. + + Raises + ------ + TypeError + If 'ordered' is not a boolean. 
+ """ + from pandas.core.dtypes.common import is_bool + if not is_bool(ordered): + raise TypeError("'ordered' must either be 'True' or 'False'") + + @staticmethod + def _validate_categories(categories, fastpath=False): + """ + Validates that we have good categories + + Parameters + ---------- + categories : array-like + fastpath : bool + Whether to skip nan and uniqueness checks + + Returns + ------- + categories : Index + """ + from pandas import Index + + if not isinstance(categories, ABCIndexClass): + categories = Index(categories) + + if not fastpath: + + if categories.hasnans: + raise ValueError('Categorial categories cannot be null') + + if not categories.is_unique: + raise ValueError('Categorical categories must be unique') + + return categories + + @property + def categories(self): + """ + An ``Index`` containing the unique categories allowed. + """ + return self._categories + + @property + def ordered(self): + """Whether the categories have an ordered relationship""" + return self._ordered + class DatetimeTZDtypeType(type): """ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ca145eeaaa7b89..562a758f83edc7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -27,6 +27,7 @@ is_integer, is_float, is_dtype_equal, + is_dtype_union_equal, is_object_dtype, is_categorical_dtype, is_interval_dtype, @@ -847,7 +848,7 @@ def _formatter_func(self): """ return default_pprint - def _format_data(self): + def _format_data(self, name=None): """ Return the formatted data as a unicode string """ @@ -856,9 +857,11 @@ def _format_data(self): display_width, _ = get_console_size() if display_width is None: display_width = get_option('display.width') or 80 + if name is None: + name = self.__class__.__name__ - space1 = "\n%s" % (' ' * (len(self.__class__.__name__) + 1)) - space2 = "\n%s" % (' ' * (len(self.__class__.__name__) + 2)) + space1 = "\n%s" % (' ' * (len(name) + 1)) + space2 = "\n%s" % (' ' * (len(name) + 2)) n = len(self) sep = ',' @@ -2170,7 +2173,11 @@ def union(self, other): if len(self) == 0: return other._get_consensus_name(self) - if not is_dtype_equal(self.dtype, other.dtype): + # TODO: is_dtype_union_equal is a hack around + # 1. buggy set ops with duplicates (GH #13432) + # 2. 
CategoricalIndex lacking setops (GH #10186) + # Once those are fixed, this workaround can be removed + if not is_dtype_union_equal(self.dtype, other.dtype): this = self.astype('O') other = other.astype('O') return this.union(other) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 447087d3c75637..9a055afccd7997 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -58,16 +58,18 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None, fastpath=False, **kwargs): if fastpath: - return cls._simple_new(data, name=name) + return cls._simple_new(data, name=name, dtype=dtype) if name is None and hasattr(data, 'name'): name = data.name if isinstance(data, ABCCategorical): - data = cls._create_categorical(cls, data, categories, ordered) + data = cls._create_categorical(cls, data, categories, ordered, + dtype) elif isinstance(data, CategoricalIndex): data = data._data - data = cls._create_categorical(cls, data, categories, ordered) + data = cls._create_categorical(cls, data, categories, ordered, + dtype) else: # don't allow scalars @@ -114,7 +116,8 @@ def _create_from_codes(self, codes, categories=None, ordered=None, return CategoricalIndex(cat, name=name) @staticmethod - def _create_categorical(self, data, categories=None, ordered=None): + def _create_categorical(self, data, categories=None, ordered=None, + dtype=None): """ *this is an internal non-public method* @@ -125,6 +128,7 @@ def _create_categorical(self, data, categories=None, ordered=None): data : data for new Categorical categories : optional categories, defaults to existing ordered : optional ordered attribute, defaults to existing + dtype : CategoricalDtype, defaults to existing Returns ------- @@ -135,22 +139,30 @@ def _create_categorical(self, data, categories=None, ordered=None): data = data.values if not isinstance(data, ABCCategorical): - ordered = False if ordered is None else ordered + if ordered is None and dtype is None: + ordered = False from pandas.core.categorical import Categorical - data = Categorical(data, categories=categories, ordered=ordered) + data = Categorical(data, categories=categories, ordered=ordered, + dtype=dtype) else: + from pandas.core.dtypes.dtypes import CategoricalDtype + if categories is not None: - data = data.set_categories(categories) - if ordered is not None: + data = data.set_categories(categories, ordered=ordered) + elif ordered is not None and ordered != data.ordered: data = data.set_ordered(ordered) + if isinstance(dtype, CategoricalDtype): + # we want to silently ignore dtype='category' + data = data._set_dtype(dtype) return data @classmethod def _simple_new(cls, values, name=None, categories=None, ordered=None, - **kwargs): + dtype=None, **kwargs): result = object.__new__(cls) - values = cls._create_categorical(cls, values, categories, ordered) + values = cls._create_categorical(cls, values, categories, ordered, + dtype=dtype) result._data = values result.name = name for k, v in compat.iteritems(kwargs): @@ -161,16 +173,28 @@ def _simple_new(cls, values, name=None, categories=None, ordered=None, @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, values=None, categories=None, ordered=None, - **kwargs): + dtype=None, **kwargs): # categories and ordered can't be part of attributes, # as these are properties + # we want to reuse self.dtype if possible, i.e. neither are + # overridden. 
+ if dtype is not None and (categories is not None or + ordered is not None): + raise TypeError("Cannot specify both `dtype` and `categories` " + "or `ordered`") + + if categories is None and ordered is None: + dtype = self.dtype if dtype is None else dtype + return super(CategoricalIndex, self)._shallow_copy( + values=values, dtype=dtype, **kwargs) if categories is None: categories = self.categories if ordered is None: ordered = self.ordered - return super(CategoricalIndex, - self)._shallow_copy(values=values, categories=categories, - ordered=ordered, **kwargs) + + return super(CategoricalIndex, self)._shallow_copy( + values=values, categories=categories, + ordered=ordered, **kwargs) def _is_dtype_compat(self, other): """ @@ -236,7 +260,7 @@ def _format_attrs(self): ('ordered', self.ordered)] if self.name is not None: attrs.append(('name', ibase.default_pprint(self.name))) - attrs.append(('dtype', "'%s'" % self.dtype)) + attrs.append(('dtype', "'%s'" % self.dtype.name)) max_seq_items = get_option('display.max_seq_items') or len(self) if len(self) > max_seq_items: attrs.append(('length', len(self))) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 8120c93ad33643..55ed2342571ab8 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -985,9 +985,10 @@ def _format_native_types(self, na_rep='', quoting=None, **kwargs): na_rep=na_rep, justify='all').get_result() - def _format_data(self): + def _format_data(self, name=None): # TODO: integrate with categorical and make generic + # name argument is unused here; just for compat with base / categorical n = len(self) max_seq_items = min((get_option( 'display.max_seq_items') or n) // 10, 10) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 66209ecd3a0303..0b7c5f414b1789 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -490,7 +490,7 @@ def _format_attrs(self): def _format_space(self): return "\n%s" % (' ' * (len(self.__class__.__name__) + 1)) - def _format_data(self): + def _format_data(self, name=None): # we are formatting thru the attributes return None diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 16523257c2f77c..a3b899d58255b1 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -189,7 +189,7 @@ def _format_attrs(self): attrs.append(('name', ibase.default_pprint(self.name))) return attrs - def _format_data(self): + def _format_data(self, name=None): # we are formatting thru the attributes return None diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 6799d3b5746d0d..2046bae759b9ab 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -139,14 +139,14 @@ def is_categorical_astype(self, dtype): validate that we have a astypeable to categorical, returns a boolean if we are a categorical """ - if is_categorical_dtype(dtype): - if dtype == CategoricalDtype(): - return True - + if dtype is Categorical or dtype is CategoricalDtype: # this is a pd.Categorical, but is not # a valid type for astypeing raise TypeError("invalid type {0} for astype".format(dtype)) + elif is_categorical_dtype(dtype): + return True + return False def external_values(self, dtype=None): @@ -548,6 +548,18 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, # may need to convert to categorical # this is only called for non-categoricals if self.is_categorical_astype(dtype): + if (('categories' in kwargs or 'ordered' in kwargs) and + 
isinstance(dtype, CategoricalDtype)): + raise TypeError("Cannot specify a CategoricalDtype and also " + "`categories` or `ordered`. Use " + "`dtype=CategoricalDtype(categories, ordered)`" + " instead.") + kwargs = kwargs.copy() + categories = getattr(dtype, 'categories', None) + ordered = getattr(dtype, 'ordered', False) + + kwargs.setdefault('categories', categories) + kwargs.setdefault('ordered', ordered) return self.make_block(Categorical(self.values, **kwargs)) # astype processing diff --git a/pandas/core/series.py b/pandas/core/series.py index 02690dec3e1c4c..ea9aeefe3b6651 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2985,7 +2985,8 @@ def _try_cast(arr, take_fast_path): subarr = np.array(subarr, dtype=dtype, copy=copy) except (ValueError, TypeError): if is_categorical_dtype(dtype): - subarr = Categorical(arr) + subarr = Categorical(arr, dtype.categories, + ordered=dtype.ordered) elif dtype is not None and raise_cast_failure: raise else: diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 12e8d8aba91779..27252b9616a445 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -2,7 +2,6 @@ import numpy as np from pandas.compat import long, string_types, PY3 -from pandas.core.categorical import Categorical from pandas.core.dtypes.common import ( _ensure_platform_int, _ensure_int64, @@ -183,6 +182,8 @@ def indexer_from_factorized(labels, shape, compress=True): def lexsort_indexer(keys, orders=None, na_position='last'): + from pandas.core.categorical import Categorical + labels = [] shape = [] if isinstance(orders, bool): diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 07e993d7ef5092..0c82773b75c289 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -260,7 +260,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): # For categoricals, we hash the categories, then remap the codes to the # hash values. (This check is above the complex check so that we don't ask - # numpy if categorical is a subdtype of complex, as it will choke. + # numpy if categorical is a subdtype of complex, as it will choke). 
if is_categorical_dtype(dtype): return _hash_categorical(vals, encoding, hash_key) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 8a36f234484b4a..e0be34b14a97de 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -545,10 +545,12 @@ def test_is_complex_dtype(): (pd.Index([1, 2]), np.dtype('int64')), (pd.Index(['a', 'b']), np.dtype(object)), ('category', 'category'), - (pd.Categorical(['a', 'b']).dtype, CategoricalDtype()), - (pd.Categorical(['a', 'b']), CategoricalDtype()), - (pd.CategoricalIndex(['a', 'b']).dtype, CategoricalDtype()), - (pd.CategoricalIndex(['a', 'b']), CategoricalDtype()), + (pd.Categorical(['a', 'b']).dtype, CategoricalDtype(['a', 'b'])), + (pd.Categorical(['a', 'b']), CategoricalDtype(['a', 'b'])), + (pd.CategoricalIndex(['a', 'b']).dtype, CategoricalDtype(['a', 'b'])), + (pd.CategoricalIndex(['a', 'b']), CategoricalDtype(['a', 'b'])), + (CategoricalDtype(), CategoricalDtype()), + (CategoricalDtype(['a', 'b']), CategoricalDtype()), (pd.DatetimeIndex([1, 2]), np.dtype(' Date: Sat, 23 Sep 2017 12:42:54 -0700 Subject: [PATCH 133/188] Dont check for NaTType, just NaT (#17564) --- pandas/core/indexes/timedeltas.py | 2 +- pandas/io/packers.py | 5 ++--- pandas/tests/scalar/test_timedelta.py | 6 +++--- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index d7b7d56d74a3a9..12b7936503ad70 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -847,7 +847,7 @@ def insert(self, loc, item): pass freq = None - if isinstance(item, (Timedelta, libts.NaTType)): + if isinstance(item, Timedelta) or item is NaT: # check freq can be preserved on edge cases if self.freq is not None: diff --git a/pandas/io/packers.py b/pandas/io/packers.py index a2fc4db23700c2..92270b39f56ef5 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -56,7 +56,6 @@ Index, MultiIndex, Float64Index, Int64Index, Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT, Categorical, CategoricalIndex) -from pandas._libs.tslib import NaTType from pandas.core.sparse.api import SparseSeries, SparseDataFrame from pandas.core.sparse.array import BlockIndex, IntIndex from pandas.core.generic import NDFrame @@ -470,7 +469,7 @@ def encode(obj): } elif isinstance(obj, (datetime, date, np.datetime64, timedelta, - np.timedelta64, NaTType)): + np.timedelta64)) or obj is NaT: if isinstance(obj, Timestamp): tz = obj.tzinfo if tz is not None: @@ -482,7 +481,7 @@ def encode(obj): u'value': obj.value, u'freq': freq, u'tz': tz} - if isinstance(obj, NaTType): + if obj is NaT: return {u'typ': u'nat'} elif isinstance(obj, np.timedelta64): return {u'typ': u'timedelta64', diff --git a/pandas/tests/scalar/test_timedelta.py b/pandas/tests/scalar/test_timedelta.py index 2cabbfacf64161..b5a8ce24fa4f81 100644 --- a/pandas/tests/scalar/test_timedelta.py +++ b/pandas/tests/scalar/test_timedelta.py @@ -9,7 +9,7 @@ from pandas.core.tools.timedeltas import _coerce_scalar_to_timedelta_type as ct from pandas import (Timedelta, TimedeltaIndex, timedelta_range, Series, to_timedelta, compat) -from pandas._libs.tslib import iNaT, NaTType +from pandas._libs.tslib import iNaT, NaT class TestTimedeltas(object): @@ -579,7 +579,7 @@ def test_implementation_limits(self): assert max_td.value == np.iinfo(np.int64).max # Beyond lower limit, a NAT before the Overflow - assert isinstance(min_td - Timedelta(1, 'ns'), NaTType) + assert (min_td - Timedelta(1, 'ns')) is 
NaT with pytest.raises(OverflowError): min_td - Timedelta(2, 'ns') @@ -589,7 +589,7 @@ def test_implementation_limits(self): # Same tests using the internal nanosecond values td = Timedelta(min_td.value - 1, 'ns') - assert isinstance(td, NaTType) + assert td is NaT with pytest.raises(OverflowError): Timedelta(min_td.value - 2, 'ns') From 87e2f549a28874955b741c782ad99232e9669ad9 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 23 Sep 2017 12:44:52 -0700 Subject: [PATCH 134/188] Remove unused cimports (#17585) --- pandas/_libs/groupby.pyx | 2 - pandas/_libs/join.pyx | 2 - pandas/_libs/period.pyx | 30 ++- pandas/_libs/reshape.pyx | 2 - pandas/_libs/src/offsets.pyx | 367 ----------------------------------- setup.py | 4 +- 6 files changed, 15 insertions(+), 392 deletions(-) delete mode 100644 pandas/_libs/src/offsets.pyx diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 9500e685367c86..1cb7b18fa4f61b 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -7,8 +7,6 @@ cimport cython cnp.import_array() -cimport util - from numpy cimport (ndarray, double_t, int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 503bdda75875f7..33c3650fa04250 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -8,8 +8,6 @@ from cython cimport Py_ssize_t np.import_array() -cimport util - from numpy cimport (ndarray, int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t, float16_t, float32_t, float64_t) diff --git a/pandas/_libs/period.pyx b/pandas/_libs/period.pyx index 49353f7b0491c0..75164748128e26 100644 --- a/pandas/_libs/period.pyx +++ b/pandas/_libs/period.pyx @@ -27,13 +27,12 @@ from datetime cimport ( INT32_MIN) -cimport util, lib +cimport util from util cimport is_period_object, is_string_object -from lib cimport is_null_datetimelike, is_period -from pandas._libs import tslib, lib -from pandas._libs.tslib import (Timedelta, Timestamp, iNaT, - NaT) +from lib cimport is_null_datetimelike +from pandas._libs import tslib +from pandas._libs.tslib import Timestamp, iNaT, NaT from tslibs.timezones cimport ( is_utc, is_tzlocal, get_utcoffset, _get_dst_info, maybe_get_tz) from tslib cimport _nat_scalar_rules @@ -485,7 +484,7 @@ def extract_freq(ndarray[object] values): try: # now Timestamp / NaT has freq attr - if is_period(p): + if is_period_object(p): return p.freq except AttributeError: pass @@ -728,8 +727,7 @@ cdef class _Period(object): return hash((self.ordinal, self.freqstr)) def _add_delta(self, other): - if isinstance(other, (timedelta, np.timedelta64, - offsets.Tick, Timedelta)): + if isinstance(other, (timedelta, np.timedelta64, offsets.Tick)): offset = frequencies.to_offset(self.freq.rule_code) if isinstance(offset, offsets.Tick): nanos = tslib._delta_to_nanoseconds(other) @@ -754,12 +752,11 @@ cdef class _Period(object): def __add__(self, other): if is_period_object(self): if isinstance(other, (timedelta, np.timedelta64, - offsets.DateOffset, - Timedelta)): + offsets.DateOffset)): return self._add_delta(other) elif other is NaT: return NaT - elif lib.is_integer(other): + elif util.is_integer_object(other): ordinal = self.ordinal + other * self.freq.n return Period(ordinal=ordinal, freq=self.freq) else: # pragma: no cover @@ -772,11 +769,10 @@ cdef class _Period(object): def __sub__(self, other): if is_period_object(self): if isinstance(other, (timedelta, np.timedelta64, - offsets.DateOffset, - Timedelta)): + offsets.DateOffset)): neg_other = -other return self + 
neg_other - elif lib.is_integer(other): + elif util.is_integer_object(other): ordinal = self.ordinal - other * self.freq.n return Period(ordinal=ordinal, freq=self.freq) elif is_period_object(other): @@ -1159,7 +1155,7 @@ class Period(_Period): raise ValueError(("Only value or ordinal but not both should be " "given but not both")) elif ordinal is not None: - if not lib.is_integer(ordinal): + if not util.is_integer_object(ordinal): raise ValueError("Ordinal must be an integer") if freq is None: raise ValueError('Must supply freq for ordinal value') @@ -1196,8 +1192,8 @@ class Period(_Period): elif is_null_datetimelike(value) or value in tslib._nat_strings: ordinal = iNaT - elif is_string_object(value) or lib.is_integer(value): - if lib.is_integer(value): + elif is_string_object(value) or util.is_integer_object(value): + if util.is_integer_object(value): value = str(value) value = value.upper() dt, _, reso = parse_time_string(value, freq) diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index d6996add374a95..db2e8b43d1ead7 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -8,8 +8,6 @@ from cython cimport Py_ssize_t np.import_array() -cimport util - from numpy cimport (ndarray, int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t, float16_t, float32_t, float64_t) diff --git a/pandas/_libs/src/offsets.pyx b/pandas/_libs/src/offsets.pyx deleted file mode 100644 index c963e256d0aa5b..00000000000000 --- a/pandas/_libs/src/offsets.pyx +++ /dev/null @@ -1,367 +0,0 @@ - -ctypedef enum time_res: - r_min = 0 - r_microsecond - r_second - r_minute - r_hour - r_day - r_month - r_year - r_max = 98 - r_invalid = 99 - - -cdef conversion_factor(time_res res1, time_res res2): - cdef: - time_res min_res, max_res - int64_t factor - - min_res = min(res1, res2) - max_res = max(res1, res2) - factor = 1 - - if min_res == max_res: - return factor - - while min_res < max_res: - if min_res < r_microsecond: - raise "Cannot convert from less than us" - elif min_res == r_microsecond: - factor *= 1000000 - min_res = r_second - elif min_res == r_second: - factor *= 60 - min_res = r_minute - elif min_res == r_minute: - factor *= 60 - min_res = r_hour - elif min_res == r_hour: - factor *= 24 - min_res = r_day - else: - raise "Cannot convert to month or year" - - return factor - -# Logic to generate ranges -# ----------------------------------------------------------------------------- - -cdef inline int64_t weekend_adjustment(int64_t dow, int bkwd): - if dow > 4: # sat or sun? - if bkwd: # roll back 1 or 2 days - return (4 - dow) - else: # roll forward 2 or 1 days - return (7 - dow) - return 0 - -cdef int64_t us_in_day = conversion_factor(r_microsecond, r_day) - -cdef class _Offset: - """ - Base class to generate timestamps. Set the anchor, and then move offsets - with next & prev. Retrieve timestamp with ts attribute. - """ - cdef: - int64_t t, dow, biz, dayoffset - object start - _TSObject ts - - def __cinit__(self): - self.t=0 - self.dow=0 - self.biz=0 - self.dayoffset=0 - - cpdef anchor(self, object start=None): - if start is not None: - self.start = start - self.ts = convert_to_tsobject(self.start, None, None) - self._setup() - - cdef _setup(self): - pass - - cpdef next(self): - pass - - cpdef __next__(self): - """wrapper around next""" - return self.next() - - cpdef prev(self): - pass - - cdef int64_t _ts(self): - """ - Access the current timestamp value, with a possible weekday - adjustment. 
- """ - cdef int64_t adj - - if self.biz != 0: - adj = weekend_adjustment(self.dow, self.biz < 0) - return self.t + us_in_day * adj - else: - return self.t - - cdef int64_t _get_anchor(self): - """ - Retrieve an anchor relating to current offset we're on. - """ - return self.t - self.dayoffset * us_in_day - - property ts: - def __get__(self): - return self._ts() - -cdef class YearOffset(_Offset): - """ - Generate annual timestamps from provided start time; apply dayoffset to - each timestamp. If biz > 0, we choose the next business day at each time; - previous if < 0. - - Parameters - ---------- - dayoffset : int - biz : int - """ - cdef: - int64_t y, ly - - def __init__(self, int64_t dayoffset=0, int64_t biz=0, object anchor=None): - self.dayoffset = dayoffset - self.biz = biz - - if anchor is not None: - self.anchor(anchor) - - cdef _setup(self): - cdef _TSObject ts = self.ts - - self.t = ts.value + self.dayoffset * us_in_day - self.y = ts.dts.year - - self.ly = (ts.dts.month > 2 or - ts.dts.month == 2 and ts.dts.day == 29) - - if self.biz != 0: - self.dow = (ts_dayofweek(ts) + self.dayoffset) % 7 - - cpdef next(self): - cdef int64_t days - - days = 365 + is_leapyear(self.y + self.ly) - - self.t += days * us_in_day - self.y += 1 - - if self.biz != 0: - self.dow = (self.dow + days) % 7 - - cpdef prev(self): - cdef int64_t days - - days = 365 + is_leapyear(self.y - (1 - self.ly)) - - self.t -= days * us_in_day - self.y -= 1 - - if self.biz != 0: - self.dow = (self.dow - days) % 7 - -cdef class MonthOffset(_Offset): - """ - Generate monthly timestamps from provided start time, and apply dayoffset - to each timestamp. Stride to construct strided timestamps (eg quarterly). - If biz > 0, we choose the next business day at each time; previous if < 0. - - Parameters - ---------- - dayoffset : int - stride : int, > 0 - biz : int - """ - cdef: - Py_ssize_t stride, ly, m - int64_t y - - def __init__(self, int64_t dayoffset=0, Py_ssize_t stride=1, - int64_t biz=0, object anchor=None): - self.dayoffset = dayoffset - self.stride = stride - self.biz = biz - - if stride <= 0: - raise ValueError("Stride must be positive") - - if anchor is not None: - self.anchor(anchor) - - cdef _setup(self): - cdef _TSObject ts = self.ts - - self.t = ts.value + (self.dayoffset * us_in_day) - - # for day counting - self.m = ts.dts.month - 1 - self.y = ts.dts.year - self.ly = is_leapyear(self.y) - - if self.biz != 0: - self.dow = (ts_dayofweek(ts) + self.dayoffset) % 7 - - cpdef next(self): - cdef: - int64_t tmp, days - Py_ssize_t j - - days = 0 - for j in range(0, self.stride): - if self.m >= 12: - self.m -= 12 - self.y += 1 - self.ly = is_leapyear(self.y) - days += days_per_month_table[self.ly][self.m] - self.m += 1 - - self.t += days * us_in_day - - if self.biz != 0: - self.dow = (self.dow + days) % 7 - - cpdef prev(self): - cdef: - int64_t tmp, days - Py_ssize_t j - - days = 0 - for j in range(0, self.stride): - self.m -= 1 - if self.m < 0: - self.m += 12 - self.y -= 1 - self.ly = is_leapyear(self.y) - days += days_per_month_table[self.ly][self.m] - - self.t -= days * us_in_day - - if self.biz != 0: - self.dow = (self.dow - days) % 7 - -cdef class DayOfMonthOffset(_Offset): - """ - Generate relative monthly timestamps from month & year of provided start - time. For example, fridays of the third week of each month (week=3, day=4); - or, thursdays of the last week of each month (week=-1, day=3). 
- - Parameters - ---------- - week : int - day : int, 0 to 6 - """ - cdef: - Py_ssize_t ly, m - int64_t y, day, week - - def __init__(self, int64_t week=0, int64_t day=0, object anchor=None): - self.week = week - self.day = day - - if self.day < 0 or self.day > 6: - raise ValueError("Day offset must be 0 to 6") - - if anchor is not None: - self.anchor(anchor) - - cdef _setup(self): - cdef _TSObject ts = self.ts - - # rewind to beginning of month - self.t = ts.value - (ts.dts.day - 1) * us_in_day - self.dow = dayofweek(ts.dts.year, ts.dts.month, 1) - - # for day counting - self.m = ts.dts.month - 1 - self.y = ts.dts.year - self.ly = is_leapyear(self.y) - - cpdef next(self): - cdef: - int64_t tmp, days - - days = days_per_month_table[self.ly][self.m] - self.t += days * us_in_day - self.dow = (self.dow + days) % 7 - - self.m += 1 - if self.m >= 12: - self.m -= 12 - self.y += 1 - self.ly = is_leapyear(self.y) - - cpdef prev(self): - cdef: - int64_t tmp, days - - days = days_per_month_table[self.ly][(self.m - 1) % 12] - self.t -= days * us_in_day - self.dow = (self.dow - days) % 7 - - self.m -= 1 - if self.m < 0: - self.m += 12 - self.y -= 1 - self.ly = is_leapyear(self.y) - - cdef int64_t _ts(self): - """ - Overwrite default adjustment - """ - cdef int64_t adj = (self.week * 7) + (self.day - self.dow) % 7 - return self.t + us_in_day * adj - -cdef class DayOffset(_Offset): - """ - Generate daily timestamps beginning with first valid time >= start time. If - biz != 0, we skip weekends. Stride, to construct weekly timestamps. - - Parameters - ---------- - stride : int, > 0 - biz : boolean - """ - cdef: - Py_ssize_t stride - - def __init__(self, int64_t stride=1, int64_t biz=0, object anchor=None): - self.stride = stride - self.biz = biz - - if self.stride <= 0: - raise ValueError("Stride must be positive") - - if anchor is not None: - self.anchor(anchor) - - cdef _setup(self): - cdef _TSObject ts = self.ts - self.t = ts.value - if self.biz != 0: - self.dow = ts_dayofweek(ts) - - cpdef next(self): - self.t += (self.stride * us_in_day) - if self.biz != 0: - self.dow = (self.dow + self.stride) % 7 - if self.dow >= 5: - self.t += (7 - self.dow) * us_in_day - self.dow = 0 - - cpdef prev(self): - self.t -= (self.stride * us_in_day) - if self.biz != 0: - self.dow = (self.dow - self.stride) % 7 - if self.dow >= 5: - self.t += (4 - self.dow) * us_in_day - self.dow = 4 diff --git a/setup.py b/setup.py index d28c4ba8be5b00..555cf9dc4a9b39 100755 --- a/setup.py +++ b/setup.py @@ -512,7 +512,7 @@ def pxd(name): 'pxdfiles': ['_libs/src/util', '_libs/hashtable'], 'depends': _pxi_dep['join']}, '_libs.reshape': {'pyxfile': '_libs/reshape', - 'depends': _pxi_dep['reshape']}, + 'depends': _pxi_dep['reshape'], 'include': []}, '_libs.interval': {'pyxfile': '_libs/interval', 'pxdfiles': ['_libs/hashtable'], 'depends': _pxi_dep['interval']}, @@ -528,7 +528,7 @@ def pxd(name): 'pandas/_libs/src/parser/io.c']}, '_libs.sparse': {'pyxfile': '_libs/sparse', 'depends': (['pandas/_libs/sparse.pyx'] + - _pxi_dep['sparse'])}, + _pxi_dep['sparse']), 'include': []}, '_libs.testing': {'pyxfile': '_libs/testing', 'depends': ['pandas/_libs/testing.pyx']}, '_libs.hashing': {'pyxfile': '_libs/hashing', From d43aba82e218cd8187769a07c487709aa86de693 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 23 Sep 2017 12:46:05 -0700 Subject: [PATCH 135/188] de-privatize timezone functions (#17543) --- pandas/_libs/period.pyx | 6 ++-- pandas/_libs/tslib.pyx | 32 ++++++++----------- pandas/_libs/tslibs/timezones.pxd | 4 +-- 
pandas/_libs/tslibs/timezones.pyx | 26 +++++++-------- pandas/core/indexes/datetimes.py | 20 ++++++------ pandas/core/tools/datetimes.py | 3 +- pandas/io/pytables.py | 7 ++-- .../indexes/datetimes/test_date_range.py | 2 +- pandas/tests/indexes/datetimes/test_setops.py | 4 +-- pandas/tests/io/test_pytables.py | 2 +- pandas/tests/scalar/test_period.py | 12 +++---- pandas/tests/scalar/test_timestamp.py | 4 +-- pandas/tests/series/test_indexing.py | 2 +- pandas/tests/tseries/test_offsets.py | 3 +- pandas/tests/tseries/test_timezones.py | 22 +++++++------ 15 files changed, 75 insertions(+), 74 deletions(-) diff --git a/pandas/_libs/period.pyx b/pandas/_libs/period.pyx index 75164748128e26..943f925ec5b04a 100644 --- a/pandas/_libs/period.pyx +++ b/pandas/_libs/period.pyx @@ -34,7 +34,7 @@ from lib cimport is_null_datetimelike from pandas._libs import tslib from pandas._libs.tslib import Timestamp, iNaT, NaT from tslibs.timezones cimport ( - is_utc, is_tzlocal, get_utcoffset, _get_dst_info, maybe_get_tz) + is_utc, is_tzlocal, get_utcoffset, get_dst_info, maybe_get_tz) from tslib cimport _nat_scalar_rules from tslibs.frequencies cimport get_freq_code @@ -556,7 +556,7 @@ cdef _reso_local(ndarray[int64_t] stamps, object tz): reso = curr_reso else: # Adjust datetime64 timestamp, recompute datetimestruct - trans, deltas, typ = _get_dst_info(tz) + trans, deltas, typ = get_dst_info(tz) _pos = trans.searchsorted(stamps, side='right') - 1 if _pos.dtype != np.int64: @@ -623,7 +623,7 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, dts.us, dts.ps, freq) else: # Adjust datetime64 timestamp, recompute datetimestruct - trans, deltas, typ = _get_dst_info(tz) + trans, deltas, typ = get_dst_info(tz) _pos = trans.searchsorted(stamps, side='right') - 1 if _pos.dtype != np.int64: diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 077603af96947c..c629ccbd8e1fd4 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -100,16 +100,10 @@ iNaT = NPY_NAT from tslibs.timezones cimport ( - is_utc, is_tzlocal, _is_fixed_offset, + is_utc, is_tzlocal, is_fixed_offset, treat_tz_as_dateutil, treat_tz_as_pytz, get_timezone, get_utcoffset, maybe_get_tz, - _get_dst_info - ) -from tslibs.timezones import ( # noqa - get_timezone, get_utcoffset, maybe_get_tz, - _p_tz_cache_key, dst_cache, - _unbox_utcoffsets, - _dateutil_gettz + get_dst_info ) @@ -168,7 +162,7 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box=False): pandas_datetime_to_datetimestruct( value, PANDAS_FR_ns, &dts) result[i] = func_create(value, dts, tz, freq) - elif is_tzlocal(tz) or _is_fixed_offset(tz): + elif is_tzlocal(tz) or is_fixed_offset(tz): for i in range(n): value = arr[i] if value == NPY_NAT: @@ -182,7 +176,7 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box=False): dt = Timestamp(dt) result[i] = dt else: - trans, deltas, typ = _get_dst_info(tz) + trans, deltas, typ = get_dst_info(tz) for i in range(n): @@ -1641,12 +1635,12 @@ cdef inline void _localize_tso(_TSObject obj, object tz): obj.tzinfo = tz else: # Adjust datetime64 timestamp, recompute datetimestruct - trans, deltas, typ = _get_dst_info(tz) + trans, deltas, typ = get_dst_info(tz) pos = trans.searchsorted(obj.value, side='right') - 1 # static/pytz/dateutil specific code - if _is_fixed_offset(tz): + if is_fixed_offset(tz): # statictzinfo if len(deltas) > 0 and obj.value != NPY_NAT: pandas_datetime_to_datetimestruct(obj.value + deltas[0], @@ -4066,7 +4060,7 @@ def tz_convert(ndarray[int64_t] vals, object 
tz1, object tz2): * 1000000000) utc_dates[i] = v - delta else: - trans, deltas, typ = _get_dst_info(tz1) + trans, deltas, typ = get_dst_info(tz1) # all-NaT tt = vals[vals!=NPY_NAT] @@ -4108,7 +4102,7 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): return result # Convert UTC to other timezone - trans, deltas, typ = _get_dst_info(tz2) + trans, deltas, typ = get_dst_info(tz2) # use first non-NaT element # if all-NaT, return all-NaT @@ -4172,7 +4166,7 @@ cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2): delta = int(get_utcoffset(tz1, dt).total_seconds()) * 1000000000 utc_date = val - delta elif get_timezone(tz1) != 'UTC': - trans, deltas, typ = _get_dst_info(tz1) + trans, deltas, typ = get_dst_info(tz1) pos = trans.searchsorted(val, side='right') - 1 if pos < 0: raise ValueError('First time before start of DST info') @@ -4191,7 +4185,7 @@ cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2): return utc_date + delta # Convert UTC to other timezone - trans, deltas, typ = _get_dst_info(tz2) + trans, deltas, typ = get_dst_info(tz2) pos = trans.searchsorted(utc_date, side='right') - 1 if pos < 0: @@ -4261,7 +4255,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, "Length of ambiguous bool-array must be the same size as vals") ambiguous_array = np.asarray(ambiguous) - trans, deltas, typ = _get_dst_info(tz) + trans, deltas, typ = get_dst_info(tz) tdata = trans.data ntrans = len(trans) @@ -4967,7 +4961,7 @@ cdef _normalize_local(ndarray[int64_t] stamps, object tz): result[i] = _normalized_stamp(&dts) else: # Adjust datetime64 timestamp, recompute datetimestruct - trans, deltas, typ = _get_dst_info(tz) + trans, deltas, typ = get_dst_info(tz) _pos = trans.searchsorted(stamps, side='right') - 1 if _pos.dtype != np.int64: @@ -5023,7 +5017,7 @@ def dates_normalized(ndarray[int64_t] stamps, tz=None): if (dt.hour + dt.minute + dt.second + dt.microsecond) > 0: return False else: - trans, deltas, typ = _get_dst_info(tz) + trans, deltas, typ = get_dst_info(tz) for i in range(n): # Adjust datetime64 timestamp, recompute datetimestruct diff --git a/pandas/_libs/tslibs/timezones.pxd b/pandas/_libs/tslibs/timezones.pxd index fac0018a78bc2e..e5d1343e1c9843 100644 --- a/pandas/_libs/tslibs/timezones.pxd +++ b/pandas/_libs/tslibs/timezones.pxd @@ -13,6 +13,6 @@ cpdef object get_timezone(object tz) cpdef object maybe_get_tz(object tz) cpdef get_utcoffset(tzinfo, obj) -cdef bint _is_fixed_offset(object tz) +cdef bint is_fixed_offset(object tz) -cdef object _get_dst_info(object tz) +cdef object get_dst_info(object tz) diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 346da41e7073be..48d82996a0bd0f 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -13,9 +13,9 @@ from dateutil.tz import ( import sys if sys.platform == 'win32' or sys.platform == 'cygwin': # equiv pd.compat.is_platform_windows() - from dateutil.zoneinfo import gettz as _dateutil_gettz + from dateutil.zoneinfo import gettz as dateutil_gettz else: - from dateutil.tz import gettz as _dateutil_gettz + from dateutil.tz import gettz as dateutil_gettz from pytz.tzinfo import BaseTzInfo as _pytz_BaseTzInfo @@ -100,7 +100,7 @@ cpdef inline object maybe_get_tz(object tz): tz = _dateutil_tzlocal() elif tz.startswith('dateutil/'): zone = tz[9:] - tz = _dateutil_gettz(zone) + tz = dateutil_gettz(zone) # On Python 3 on Windows, the filename is not always set correctly. 
if isinstance(tz, _dateutil_tzfile) and '.tar.gz' in tz._filename: tz._filename = zone @@ -113,14 +113,14 @@ cpdef inline object maybe_get_tz(object tz): def _p_tz_cache_key(tz): """ Python interface for cache function to facilitate testing.""" - return _tz_cache_key(tz) + return tz_cache_key(tz) # Timezone data caches, key is the pytz string or dateutil file name. dst_cache = {} -cdef inline object _tz_cache_key(object tz): +cdef inline object tz_cache_key(object tz): """ Return the key in the cache for the timezone info object or None if unknown. @@ -163,7 +163,7 @@ cpdef get_utcoffset(tzinfo, obj): return tzinfo.utcoffset(obj) -cdef inline bint _is_fixed_offset(object tz): +cdef inline bint is_fixed_offset(object tz): if treat_tz_as_dateutil(tz): if len(tz._trans_idx) == 0 and len(tz._trans_list) == 0: return 1 @@ -178,7 +178,7 @@ cdef inline bint _is_fixed_offset(object tz): return 1 -cdef object _get_utc_trans_times_from_dateutil_tz(object tz): +cdef object get_utc_trans_times_from_dateutil_tz(object tz): """ Transition times in dateutil timezones are stored in local non-dst time. This code converts them to UTC. It's the reverse of the code @@ -193,7 +193,7 @@ cdef object _get_utc_trans_times_from_dateutil_tz(object tz): return new_trans -cpdef ndarray _unbox_utcoffsets(object transinfo): +cpdef ndarray unbox_utcoffsets(object transinfo): cdef: Py_ssize_t i, sz ndarray[int64_t] arr @@ -211,7 +211,7 @@ cpdef ndarray _unbox_utcoffsets(object transinfo): # Daylight Savings -cdef object _get_dst_info(object tz): +cdef object get_dst_info(object tz): """ return a tuple of : (UTC times of DST transitions, @@ -219,7 +219,7 @@ cdef object _get_dst_info(object tz): string of type of transitions) """ - cache_key = _tz_cache_key(tz) + cache_key = tz_cache_key(tz) if cache_key is None: num = int(get_utcoffset(tz, None).total_seconds()) * 1000000000 return (np.array([NPY_NAT + 1], dtype=np.int64), @@ -235,13 +235,13 @@ cdef object _get_dst_info(object tz): trans[0] = NPY_NAT + 1 except Exception: pass - deltas = _unbox_utcoffsets(tz._transition_info) + deltas = unbox_utcoffsets(tz._transition_info) typ = 'pytz' elif treat_tz_as_dateutil(tz): if len(tz._trans_list): # get utc trans times - trans_list = _get_utc_trans_times_from_dateutil_tz(tz) + trans_list = get_utc_trans_times_from_dateutil_tz(tz) trans = np.hstack([ np.array([0], dtype='M8[s]'), # place holder for first item np.array(trans_list, dtype='M8[s]')]).astype( @@ -255,7 +255,7 @@ cdef object _get_dst_info(object tz): deltas *= 1000000000 typ = 'dateutil' - elif _is_fixed_offset(tz): + elif is_fixed_offset(tz): trans = np.array([NPY_NAT + 1], dtype=np.int64) deltas = np.array([tz._ttinfo_std.offset], dtype='i8') * 1000000000 diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 6b1b61c2798f4f..39dc24642235ba 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -50,6 +50,7 @@ from pandas._libs import (lib, index as libindex, tslib as libts, algos as libalgos, join as libjoin, Timestamp, period as libperiod) +from pandas._libs.tslibs import timezones def _utc(): @@ -372,7 +373,7 @@ def __new__(cls, data=None, tz = subarr.tz else: if tz is not None: - tz = libts.maybe_get_tz(tz) + tz = timezones.maybe_get_tz(tz) if (not isinstance(data, DatetimeIndex) or getattr(data, 'tz', None) is None): @@ -447,17 +448,18 @@ def _generate(cls, start, end, periods, name, offset, raise TypeError('Start and end cannot both be tz-aware with ' 'different timezones') - inferred_tz = 
libts.maybe_get_tz(inferred_tz) + inferred_tz = timezones.maybe_get_tz(inferred_tz) # these may need to be localized - tz = libts.maybe_get_tz(tz) + tz = timezones.maybe_get_tz(tz) if tz is not None: date = start or end if date.tzinfo is not None and hasattr(tz, 'localize'): tz = tz.localize(date.replace(tzinfo=None)).tzinfo if tz is not None and inferred_tz is not None: - if not libts.get_timezone(inferred_tz) == libts.get_timezone(tz): + if not (timezones.get_timezone(inferred_tz) == + timezones.get_timezone(tz)): raise AssertionError("Inferred time zone not equal to passed " "time zone") @@ -593,7 +595,7 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, result._data = values result.name = name result.offset = freq - result.tz = libts.maybe_get_tz(tz) + result.tz = timezones.maybe_get_tz(tz) result._reset_identity() return result @@ -607,7 +609,7 @@ def tzinfo(self): @cache_readonly def _timezone(self): """ Comparable timezone both for pytz / dateutil""" - return libts.get_timezone(self.tzinfo) + return timezones.get_timezone(self.tzinfo) def _has_same_tz(self, other): zzone = self._timezone @@ -616,7 +618,7 @@ def _has_same_tz(self, other): if isinstance(other, np.datetime64): # convert to Timestamp as np.datetime64 doesn't have tz attr other = Timestamp(other) - vzone = libts.get_timezone(getattr(other, 'tzinfo', '__no_tz__')) + vzone = timezones.get_timezone(getattr(other, 'tzinfo', '__no_tz__')) return zzone == vzone @classmethod @@ -1779,7 +1781,7 @@ def tz_convert(self, tz): TypeError If DatetimeIndex is tz-naive. """ - tz = libts.maybe_get_tz(tz) + tz = timezones.maybe_get_tz(tz) if self.tz is None: # tz naive, use tz_localize @@ -1839,7 +1841,7 @@ def tz_localize(self, tz, ambiguous='raise', errors='raise'): else: raise TypeError("Already tz-aware, use tz_convert to convert.") else: - tz = libts.maybe_get_tz(tz) + tz = timezones.maybe_get_tz(tz) # Convert to UTC new_dates = libts.tz_localize_to_utc(self.asi8, tz, diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 9dde26f43ad337..95fe3ab83c2abf 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -3,6 +3,7 @@ from collections import MutableMapping from pandas._libs import lib, tslib +from pandas._libs.tslibs.timezones import get_timezone from pandas.core.dtypes.common import ( _ensure_object, @@ -44,7 +45,7 @@ def _infer_tzinfo(start, end): def _infer(a, b): tz = a.tzinfo if b and b.tzinfo: - if not (tslib.get_timezone(tz) == tslib.get_timezone(b.tzinfo)): + if not (get_timezone(tz) == get_timezone(b.tzinfo)): raise AssertionError('Inputs must both have the same timezone,' ' {timezone1} != {timezone2}' .format(timezone1=tz, timezone2=b.tzinfo)) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 9f819a4463bed4..4d300b200971ac 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -46,7 +46,8 @@ from pandas.core.config import get_option from pandas.core.computation.pytables import Expr, maybe_expression -from pandas._libs import tslib, algos, lib +from pandas._libs import algos, lib +from pandas._libs.tslibs import timezones from distutils.version import LooseVersion @@ -4379,7 +4380,7 @@ def _get_info(info, name): def _get_tz(tz): """ for a tz-aware type, return an encoded zone """ - zone = tslib.get_timezone(tz) + zone = timezones.get_timezone(tz) if zone is None: zone = tz.utcoffset().total_seconds() return zone @@ -4401,7 +4402,7 @@ def _set_tz(values, tz, preserve_UTC=False, coerce=False): if tz is not None: name = 
getattr(values, 'name', None) values = values.ravel() - tz = tslib.get_timezone(_ensure_decoded(tz)) + tz = timezones.get_timezone(_ensure_decoded(tz)) values = DatetimeIndex(values, name=name) if values.tz is None: values = values.tz_localize('UTC').tz_convert(tz) diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 8d86bebdd4d5e4..c373942cb4c63c 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -394,7 +394,7 @@ def test_range_tz_dateutil(self): # see gh-2906 # Use maybe_get_tz to fix filename in tz under dateutil. - from pandas._libs.tslib import maybe_get_tz + from pandas._libs.tslibs.timezones import maybe_get_tz tz = lambda x: maybe_get_tz('dateutil/' + x) start = datetime(2011, 1, 1, tzinfo=tz('US/Eastern')) diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index f43c010f59b9e7..4ffd2e1cd1e615 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -325,8 +325,8 @@ def test_month_range_union_tz_pytz(self): def test_month_range_union_tz_dateutil(self): tm._skip_if_windows_python_3() - from pandas._libs.tslib import _dateutil_gettz as timezone - tz = timezone('US/Eastern') + from pandas._libs.tslibs.timezones import dateutil_gettz + tz = dateutil_gettz('US/Eastern') early_start = datetime(2011, 1, 1) early_end = datetime(2011, 3, 1) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 2a6d16fb39cc33..ff21afc11d2205 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -5427,7 +5427,7 @@ def test_append_with_timezones_dateutil(self): # use maybe_get_tz instead of dateutil.tz.gettz to handle the windows # filename issues. 
- from pandas._libs.tslib import maybe_get_tz + from pandas._libs.tslibs.timezones import maybe_get_tz gettz = lambda x: maybe_get_tz('dateutil/' + x) # as columns diff --git a/pandas/tests/scalar/test_period.py b/pandas/tests/scalar/test_period.py index a167c9c738b0bf..c17a216df44cbd 100644 --- a/pandas/tests/scalar/test_period.py +++ b/pandas/tests/scalar/test_period.py @@ -245,29 +245,29 @@ def test_timestamp_tz_arg(self): assert p.tz == exp.tz def test_timestamp_tz_arg_dateutil(self): - from pandas._libs.tslib import _dateutil_gettz as gettz - from pandas._libs.tslib import maybe_get_tz + from pandas._libs.tslibs.timezones import dateutil_gettz + from pandas._libs.tslibs.timezones import maybe_get_tz for case in ['dateutil/Europe/Brussels', 'dateutil/Asia/Tokyo', 'dateutil/US/Pacific']: p = Period('1/1/2005', freq='M').to_timestamp( tz=maybe_get_tz(case)) exp = Timestamp('1/1/2005', tz='UTC').tz_convert(case) assert p == exp - assert p.tz == gettz(case.split('/', 1)[1]) + assert p.tz == dateutil_gettz(case.split('/', 1)[1]) assert p.tz == exp.tz p = Period('1/1/2005', freq='M').to_timestamp(freq='3H', tz=maybe_get_tz(case)) exp = Timestamp('1/1/2005', tz='UTC').tz_convert(case) assert p == exp - assert p.tz == gettz(case.split('/', 1)[1]) + assert p.tz == dateutil_gettz(case.split('/', 1)[1]) assert p.tz == exp.tz def test_timestamp_tz_arg_dateutil_from_string(self): - from pandas._libs.tslib import _dateutil_gettz as gettz + from pandas._libs.tslibs.timezones import dateutil_gettz p = Period('1/1/2005', freq='M').to_timestamp(tz='dateutil/Europe/Brussels') - assert p.tz == gettz('Europe/Brussels') + assert p.tz == dateutil_gettz('Europe/Brussels') def test_timestamp_mult(self): p = pd.Period('2011-01', freq='M') diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py index 8d47ce4802ac65..c1b9f858a08de3 100644 --- a/pandas/tests/scalar/test_timestamp.py +++ b/pandas/tests/scalar/test_timestamp.py @@ -17,7 +17,7 @@ import pandas.util.testing as tm from pandas.tseries import offsets, frequencies from pandas._libs import tslib, period -from pandas._libs.tslib import get_timezone +from pandas._libs.tslibs.timezones import get_timezone from pandas.compat import lrange, long from pandas.util.testing import assert_series_equal @@ -1295,7 +1295,7 @@ def test_timestamp_to_datetime_explicit_pytz(self): def test_timestamp_to_datetime_explicit_dateutil(self): tm._skip_if_windows_python_3() - from pandas._libs.tslib import _dateutil_gettz as gettz + from pandas._libs.tslibs.timezones import dateutil_gettz as gettz rng = date_range('20090415', '20090519', tz=gettz('US/Eastern')) stamp = rng[0] diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index 45a92f6d6f50b0..91187b709463aa 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -387,7 +387,7 @@ def test_getitem_setitem_datetime_tz_pytz(self): def test_getitem_setitem_datetime_tz_dateutil(self): from dateutil.tz import tzutc - from pandas._libs.tslib import _dateutil_gettz as gettz + from pandas._libs.tslibs.timezones import dateutil_gettz as gettz tz = lambda x: tzutc() if x == 'UTC' else gettz( x) # handle special case for utc in dateutil diff --git a/pandas/tests/tseries/test_offsets.py b/pandas/tests/tseries/test_offsets.py index cd2c29ffe3ac6b..543d21e162f048 100644 --- a/pandas/tests/tseries/test_offsets.py +++ b/pandas/tests/tseries/test_offsets.py @@ -33,6 +33,7 @@ to_datetime, DateParseError) import pandas.tseries.offsets 
as offsets from pandas.io.pickle import read_pickle +from pandas._libs.tslibs import timezones from pandas._libs.tslib import normalize_date, NaT, Timestamp, Timedelta import pandas._libs.tslib as tslib import pandas.util.testing as tm @@ -288,7 +289,7 @@ def _check_offsetfunc_works(self, offset, funcname, dt, expected, for tz in self.timezones: expected_localize = expected.tz_localize(tz) - tz_obj = tslib.maybe_get_tz(tz) + tz_obj = timezones.maybe_get_tz(tz) dt_tz = tslib._localize_pydatetime(dt, tz_obj) result = func(dt_tz) diff --git a/pandas/tests/tseries/test_timezones.py b/pandas/tests/tseries/test_timezones.py index ac1a338d2844d4..e7b470e01e2af3 100644 --- a/pandas/tests/tseries/test_timezones.py +++ b/pandas/tests/tseries/test_timezones.py @@ -18,6 +18,7 @@ from pandas.core.indexes.datetimes import bdate_range, date_range from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas._libs import tslib +from pandas._libs.tslibs import timezones from pandas import (Index, Series, DataFrame, isna, Timestamp, NaT, DatetimeIndex, to_datetime) from pandas.util.testing import (assert_frame_equal, assert_series_equal, @@ -943,7 +944,7 @@ def tz(self, tz): Use tslib.maybe_get_tz so that we get the filename on the tz right on windows. See #7337. """ - return tslib.maybe_get_tz('dateutil/' + tz) + return timezones.maybe_get_tz('dateutil/' + tz) def tzstr(self, tz): """ Construct a timezone string from a string. Overridden in subclass @@ -962,7 +963,7 @@ def test_utc_with_system_utc(self): # Skipped on win32 due to dateutil bug tm._skip_if_windows() - from pandas._libs.tslib import maybe_get_tz + from pandas._libs.tslibs.timezones import maybe_get_tz # from system utc to real utc ts = Timestamp('2001-01-05 11:56', tz=maybe_get_tz('dateutil/UTC')) @@ -1133,7 +1134,7 @@ def test_tzlocal(self): assert ts.tz == dateutil.tz.tzlocal() assert "tz='tzlocal()')" in repr(ts) - tz = tslib.maybe_get_tz('tzlocal()') + tz = timezones.maybe_get_tz('tzlocal()') assert tz == dateutil.tz.tzlocal() # get offset using normal datetime for test @@ -1176,12 +1177,13 @@ def test_cache_keys_are_distinct_for_pytz_vs_dateutil(self): if tz_name == 'UTC': # skip utc as it's a special case in dateutil continue - tz_p = tslib.maybe_get_tz(tz_name) - tz_d = tslib.maybe_get_tz('dateutil/' + tz_name) + tz_p = timezones.maybe_get_tz(tz_name) + tz_d = timezones.maybe_get_tz('dateutil/' + tz_name) if tz_d is None: # skip timezones that dateutil doesn't know about. 
continue - assert tslib._p_tz_cache_key(tz_p) != tslib._p_tz_cache_key(tz_d) + assert (timezones._p_tz_cache_key(tz_p) != + timezones._p_tz_cache_key(tz_d)) class TestTimeZones(object): @@ -1764,13 +1766,13 @@ def compare_local_to_utc(tz_didx, utc_didx): # Check empty array result = tslib.tz_convert(np.array([], dtype=np.int64), - tslib.maybe_get_tz('US/Eastern'), - tslib.maybe_get_tz('Asia/Tokyo')) + timezones.maybe_get_tz('US/Eastern'), + timezones.maybe_get_tz('Asia/Tokyo')) tm.assert_numpy_array_equal(result, np.array([], dtype=np.int64)) # Check all-NaT array result = tslib.tz_convert(np.array([tslib.iNaT], dtype=np.int64), - tslib.maybe_get_tz('US/Eastern'), - tslib.maybe_get_tz('Asia/Tokyo')) + timezones.maybe_get_tz('US/Eastern'), + timezones.maybe_get_tz('Asia/Tokyo')) tm.assert_numpy_array_equal(result, np.array( [tslib.iNaT], dtype=np.int64)) From c95eb3897f2f4d0893f7baa381b2b151805a960e Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Sun, 24 Sep 2017 05:56:12 -0400 Subject: [PATCH 136/188] DOC: revise What's New for inferring compression from non-string paths (#17338) Refs https://github.com/pandas-dev/pandas/issues/17262 Refs https://github.com/pandas-dev/pandas/pull/17206#issuecomment-322586996 --- doc/source/whatsnew/v0.21.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 261e12b8245094..32dbeb32154e68 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -137,7 +137,7 @@ Other Enhancements - :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`) - Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here `. (:issue:`15838`, :issue:`17438`) - :func:`DataFrame.add_prefix` and :func:`DataFrame.add_suffix` now accept strings containing the '%' character. (:issue:`17151`) -- `read_*` methods can now infer compression from non-string paths, such as ``pathlib.Path`` objects (:issue:`17206`). +- Read/write methods that infer compression (:func:`read_csv`, :func:`read_table`, :func:`read_pickle`, and :meth:`~DataFrame.to_pickle`) can now infer from non-string paths, such as ``pathlib.Path`` objects (:issue:`17206`). - :func:`pd.read_sas()` now recognizes much more of the most frequently used date (datetime) formats in SAS7BDAT files (:issue:`15871`). - :func:`DataFrame.items` and :func:`Series.items` is now present in both Python 2 and 3 and is lazy in all cases (:issue:`13918`, :issue:`17213`) - :func:`Styler.where` has been implemented. It is as a convenience for :func:`Styler.applymap` and enables simple DataFrame styling on the Jupyter notebook (:issue:`17474`). 
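The revised note in this patch covers ``read_csv``, ``read_table``, ``read_pickle``, and ``to_pickle``. A minimal sketch of the inferred-compression behavior it describes, assuming a writable working directory and an illustrative file name:

```python
import gzip
import pathlib

import pandas as pd

# Write a small gzipped CSV; 'data.csv.gz' is an illustrative name.
path = pathlib.Path("data.csv.gz")
with gzip.open(path, "wt") as f:
    f.write("a,b\n1,2\n3,4\n")

# compression='infer' (the default) now also works for non-string paths,
# picking gzip from the '.gz' suffix of the pathlib.Path object.
df = pd.read_csv(path)
```
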
From ae16bf99467d7d26abe506ba95079b07442860a8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 24 Sep 2017 13:48:00 +0200 Subject: [PATCH 137/188] API: harmonize drop/reindex/rename args (GH12392) - drop (#17644) * API: harmonize drop/reindex/rename args (GH12392) - drop * fixups * add versionadded --- doc/source/whatsnew/v0.21.0.txt | 18 ++++ pandas/core/generic.py | 102 +++++++++++++----- .../tests/frame/test_axis_select_reindex.py | 35 ++++++ 3 files changed, 129 insertions(+), 26 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 32dbeb32154e68..21abdccd2996c9 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -91,6 +91,24 @@ This does not raise any obvious exceptions, but also does not create a new colum Setting a list-like data structure into a new attribute now raise a ``UserWarning`` about the potential for unexpected behavior. See :ref:`Attribute Access `. +``drop`` now also accepts index/columns keywords +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :meth:`~DataFrame.drop` method has gained ``index``/``columns`` keywords as an +alternative to specify the ``axis`` and to make it similar in usage to ``reindex`` +(:issue:`12392`). + +For example: + +.. ipython:: python + + df = pd.DataFrame(np.arange(8).reshape(2,4), + columns=['A', 'B', 'C', 'D']) + df + df.drop(['B', 'C'], axis=1) + # the following is now equivalent + df.drop(columns=['B', 'C']) + .. _whatsnew_0210.enhancements.categorical_dtype: ``CategoricalDtype`` for specifying categoricals diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 241204ef555f6e..3d55e07df6eacb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2333,14 +2333,23 @@ def reindex_like(self, other, method=None, copy=True, limit=None, return self.reindex(**d) - def drop(self, labels, axis=0, level=None, inplace=False, errors='raise'): + def drop(self, labels=None, axis=0, index=None, columns=None, level=None, + inplace=False, errors='raise'): """ Return new object with labels in requested axis removed. Parameters ---------- labels : single label or list-like + Index or column labels to drop. axis : int or axis name + Whether to drop labels from the index (0 / 'index') or + columns (1 / 'columns'). + index, columns : single label or list-like + Alternative to specifying `axis` (``labels, axis=1`` is + equivalent to ``columns=labels``). + + .. versionadded:: 0.21.0 level : int or level name, default None For MultiIndex inplace : bool, default False @@ -2354,36 +2363,80 @@ def drop(self, labels, axis=0, level=None, inplace=False, errors='raise'): Examples -------- - >>> df = pd.DataFrame([[1, 2, 3, 4], - ... [5, 6, 7, 8], - ... [9, 1, 2, 3], - ... [4, 5, 6, 7] - ... ], - ... columns=list('ABCD')) + >>> df = pd.DataFrame(np.arange(12).reshape(3,4), + columns=['A', 'B', 'C', 'D']) >>> df - A B C D - 0 1 2 3 4 - 1 5 6 7 8 - 2 9 1 2 3 - 3 4 5 6 7 + A B C D + 0 0 1 2 3 + 1 4 5 6 7 + 2 8 9 10 11 + + Drop columns + + >>> df.drop(['B', 'C'], axis=1) + A D + 0 0 3 + 1 4 7 + 2 8 11 + + >>> df.drop(columns=['B', 'C']) + A D + 0 0 3 + 1 4 7 + 2 8 11 Drop a row by index >>> df.drop([0, 1]) - A B C D - 2 9 1 2 3 - 3 4 5 6 7 + A B C D + 2 8 9 10 11 - Drop columns + Notes + ----- + Specifying both `labels` and `index` or `columns` will raise a + ValueError. 
- >>> df.drop(['A', 'B'], axis=1) - C D - 0 3 4 - 1 7 8 - 2 2 3 - 3 6 7 """ inplace = validate_bool_kwarg(inplace, 'inplace') + + if labels is not None: + if index is not None or columns is not None: + raise ValueError("Cannot specify both 'labels' and " + "'index'/'columns'") + axis_name = self._get_axis_name(axis) + axes = {axis_name: labels} + elif index is not None or columns is not None: + axes, _ = self._construct_axes_from_arguments((index, columns), {}) + else: + raise ValueError("Need to specify at least one of 'labels', " + "'index' or 'columns'") + + obj = self + + for axis, labels in axes.items(): + if labels is not None: + obj = obj._drop_axis(labels, axis, level=level, errors=errors) + + if inplace: + self._update_inplace(obj) + else: + return obj + + def _drop_axis(self, labels, axis, level=None, errors='raise'): + """ + Drop labels from specified axis. Used in the ``drop`` method + internally. + + Parameters + ---------- + labels : single label or list-like + axis : int or axis name + level : int or level name, default None + For MultiIndex + errors : {'ignore', 'raise'}, default 'raise' + If 'ignore', suppress error and existing labels are dropped. + + """ axis = self._get_axis_number(axis) axis_name = self._get_axis_name(axis) axis, axis_ = self._get_axis(axis), axis @@ -2416,10 +2469,7 @@ def drop(self, labels, axis=0, level=None, inplace=False, errors='raise'): result = self.loc[tuple(slicer)] - if inplace: - self._update_inplace(result) - else: - return result + return result def _update_inplace(self, result, verify_is_copy=True): """ diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index e76869bf6712b5..fb9b8c2ed7affe 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -146,6 +146,41 @@ def test_drop_multiindex_not_lexsorted(self): tm.assert_frame_equal(result, expected) + def test_drop_api_equivalence(self): + # equivalence of the labels/axis and index/columns API's (GH12392) + df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]], + index=['a', 'b', 'c'], + columns=['d', 'e', 'f']) + + res1 = df.drop('a') + res2 = df.drop(index='a') + tm.assert_frame_equal(res1, res2) + + res1 = df.drop('d', 1) + res2 = df.drop(columns='d') + tm.assert_frame_equal(res1, res2) + + res1 = df.drop(labels='e', axis=1) + res2 = df.drop(columns='e') + tm.assert_frame_equal(res1, res2) + + res1 = df.drop(['a'], axis=0) + res2 = df.drop(index=['a']) + tm.assert_frame_equal(res1, res2) + + res1 = df.drop(['a'], axis=0).drop(['d'], axis=1) + res2 = df.drop(index=['a'], columns=['d']) + tm.assert_frame_equal(res1, res2) + + with pytest.raises(ValueError): + df.drop(labels='a', index='b') + + with pytest.raises(ValueError): + df.drop(labels='a', columns='b') + + with pytest.raises(ValueError): + df.drop(axis=1) + def test_merge_join_different_levels(self): # GH 9455 From 1f5127144d9c2697445882b81505997a4a67d67e Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 24 Sep 2017 06:13:37 -0700 Subject: [PATCH 138/188] API: Warn about dups in names for read_csv (#17346) xref gh-17095. 
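For illustration, a short hedged example of the new behavior, mirroring the ``names=["a", "b", "a"]`` case in the tests below (the exact warning wording may change before the future error is introduced):

```python
from io import StringIO

import pandas as pd

data = "0,1,2\n3,4,5"

# Duplicates in `names` now trigger a UserWarning before the usual
# mangling kicks in, so the result still has columns ['a', 'b', 'a.1'].
df = pd.read_csv(StringIO(data), names=["a", "b", "a"])
```
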
--- doc/source/io.rst | 4 +-- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/io/parsers.py | 33 ++++++++++++++++-- pandas/tests/io/parser/common.py | 14 -------- pandas/tests/io/parser/dtypes.py | 9 ++--- pandas/tests/io/parser/mangle_dupes.py | 46 ++++++++++++++++++++------ 6 files changed, 74 insertions(+), 33 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index ab1ad74ee8516b..d6abed6e9d1ad6 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -113,8 +113,8 @@ header : int or list of ints, default ``'infer'`` rather than the first line of the file. names : array-like, default ``None`` List of column names to use. If file contains no header row, then you should - explicitly pass ``header=None``. Duplicates in this list are not allowed unless - ``mangle_dupe_cols=True``, which is the default. + explicitly pass ``header=None``. Duplicates in this list will cause + a ``UserWarning`` to be issued. index_col : int or sequence or ``False``, default ``None`` Column to use as the row labels of the DataFrame. If a sequence is given, a MultiIndex is used. If you have a malformed file with delimiters at the end of diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 21abdccd2996c9..49d2c1767807c4 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -467,6 +467,7 @@ Other API Changes - The Categorical constructor no longer accepts a scalar for the ``categories`` keyword. (:issue:`16022`) - Accessing a non-existent attribute on a closed :class:`~pandas.HDFStore` will now raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`) +- :func:`read_csv` now issues a ``UserWarning`` if the ``names`` parameter contains duplicates (:issue:`17095`) - :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`) - :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`) - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`). diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index d9e83176d0d6e3..ed15d4295d6881 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -84,8 +84,8 @@ rather than the first line of the file. names : array-like, default None List of column names to use. If file contains no header row, then you - should explicitly pass header=None. Duplicates in this list are not - allowed unless mangle_dupe_cols=True, which is the default. + should explicitly pass header=None. Duplicates in this list will cause + a ``UserWarning`` to be issued. index_col : int or sequence or False, default None Column to use as the row labels of the DataFrame. If a sequence is given, a MultiIndex is used. If you have a malformed file with delimiters at the end @@ -385,6 +385,32 @@ def _validate_integer(name, val, min_val=0): return val +def _validate_names(names): + """ + Check if the `names` parameter contains duplicates. + + If duplicates are found, we issue a warning before returning. + + Parameters + ---------- + names : array-like or None + An array containing a list of the names used for the output DataFrame. + + Returns + ------- + names : array-like or None + The original `names` parameter. + """ + + if names is not None: + if len(names) != len(set(names)): + msg = ("Duplicate names specified. 
This " + "will raise an error in the future.") + warnings.warn(msg, UserWarning, stacklevel=3) + + return names + + def _read(filepath_or_buffer, kwds): """Generic reader of line files.""" encoding = kwds.get('encoding', None) @@ -407,6 +433,9 @@ def _read(filepath_or_buffer, kwds): chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1) nrows = _validate_integer('nrows', kwds.get('nrows', None)) + # Check for duplicates in names. + _validate_names(kwds.get("names", None)) + # Create the parser. parser = TextFileReader(filepath_or_buffer, **kwds) diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index cfc4a1d7c55eb0..e85d3ad294655c 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -1357,20 +1357,6 @@ def test_euro_decimal_format(self): assert df2['Number2'].dtype == float assert df2['Number3'].dtype == float - def test_read_duplicate_names(self): - # See gh-7160 - data = "a,b,a\n0,1,2\n3,4,5" - df = self.read_csv(StringIO(data)) - expected = DataFrame([[0, 1, 2], [3, 4, 5]], - columns=['a', 'b', 'a.1']) - tm.assert_frame_equal(df, expected) - - data = "0,1,2\n3,4,5" - df = self.read_csv(StringIO(data), names=["a", "b", "a"]) - expected = DataFrame([[0, 1, 2], [3, 4, 5]], - columns=['a', 'b', 'a.1']) - tm.assert_frame_equal(df, expected) - def test_inf_parsing(self): data = """\ ,A diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py index 7311c9200f269a..402fa0817595c7 100644 --- a/pandas/tests/io/parser/dtypes.py +++ b/pandas/tests/io/parser/dtypes.py @@ -204,10 +204,11 @@ def test_empty_with_dup_column_pass_dtype_by_indexes(self): result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) tm.assert_frame_equal(result, expected, check_index_type=False) - data = '' - result = self.read_csv(StringIO(data), names=['one', 'one'], - dtype={0: 'u1', 1: 'f'}) - tm.assert_frame_equal(result, expected, check_index_type=False) + with tm.assert_produces_warning(UserWarning, check_stacklevel=False): + data = '' + result = self.read_csv(StringIO(data), names=['one', 'one'], + dtype={0: 'u1', 1: 'f'}) + tm.assert_frame_equal(result, expected, check_index_type=False) def test_raise_on_passed_int_dtype_with_nas(self): # see gh-2631 diff --git a/pandas/tests/io/parser/mangle_dupes.py b/pandas/tests/io/parser/mangle_dupes.py index e2efb1377f8b0a..6df69eb475bf76 100644 --- a/pandas/tests/io/parser/mangle_dupes.py +++ b/pandas/tests/io/parser/mangle_dupes.py @@ -7,6 +7,9 @@ """ from pandas.compat import StringIO +from pandas import DataFrame + +import pandas.util.testing as tm class DupeColumnTests(object): @@ -25,6 +28,21 @@ def test_basic(self): mangle_dupe_cols=True) assert list(df.columns) == expected + def test_basic_names(self): + # See gh-7160 + data = "a,b,a\n0,1,2\n3,4,5" + expected = DataFrame([[0, 1, 2], [3, 4, 5]], + columns=["a", "b", "a.1"]) + + df = self.read_csv(StringIO(data)) + tm.assert_frame_equal(df, expected) + + with tm.assert_produces_warning(UserWarning, check_stacklevel=False): + data = "0,1,2\n3,4,5" + df = self.read_csv(StringIO(data), + names=["a", "b", "a"]) + tm.assert_frame_equal(df, expected) + def test_thorough_mangle_columns(self): # see gh-17060 data = "a,a,a.1\n1,2,3" @@ -45,20 +63,26 @@ def test_thorough_mangle_names(self): # see gh-17095 data = "a,b,b\n1,2,3" names = ["a.1", "a.1", "a.1.1"] - df = self.read_csv(StringIO(data), sep=",", names=names, - mangle_dupe_cols=True) - assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"] + + with 
tm.assert_produces_warning(UserWarning, check_stacklevel=False): + df = self.read_csv(StringIO(data), sep=",", names=names, + mangle_dupe_cols=True) + assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"] data = "a,b,c,d,e,f\n1,2,3,4,5,6" names = ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"] - df = self.read_csv(StringIO(data), sep=",", names=names, - mangle_dupe_cols=True) - assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1", - "a.1.1.1.1", "a.1.1.1.1.1"] + + with tm.assert_produces_warning(UserWarning, check_stacklevel=False): + df = self.read_csv(StringIO(data), sep=",", names=names, + mangle_dupe_cols=True) + assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1", + "a.1.1.1.1", "a.1.1.1.1.1"] data = "a,b,c,d,e,f,g\n1,2,3,4,5,6,7" names = ["a", "a", "a.3", "a.1", "a.2", "a", "a"] - df = self.read_csv(StringIO(data), sep=",", names=names, - mangle_dupe_cols=True) - assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1", - "a.2", "a.2.1", "a.3.1"] + + with tm.assert_produces_warning(UserWarning, check_stacklevel=False): + df = self.read_csv(StringIO(data), sep=",", names=names, + mangle_dupe_cols=True) + assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1", + "a.2", "a.2.1", "a.3.1"] From f8bf12916e21bc03992f14b01a77355e180cdab9 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 24 Sep 2017 06:15:29 -0700 Subject: [PATCH 139/188] cut/paste AccessorProperty and PandasDelegate to core.accessor (#17651) --- pandas/core/accessor.py | 95 ++++++++++++++++++++++++++++++++ pandas/core/base.py | 94 ------------------------------- pandas/core/categorical.py | 5 +- pandas/core/frame.py | 5 +- pandas/core/indexes/accessors.py | 5 +- pandas/core/indexes/base.py | 4 +- pandas/core/indexes/category.py | 3 +- pandas/core/series.py | 11 ++-- pandas/tests/test_base.py | 5 +- 9 files changed, 117 insertions(+), 110 deletions(-) diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 9f8556d1e69616..c8476841bfce47 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -5,6 +5,7 @@ that can be mixed into or pinned onto other pandas classes. """ +from pandas.core.common import AbstractMethodError class DirNamesMixin(object): @@ -33,3 +34,97 @@ def __dir__(self): rv = set(dir(type(self))) rv = (rv - self._dir_deletions()) | self._dir_additions() return sorted(rv) + + +class AccessorProperty(object): + """Descriptor for implementing accessor properties like Series.str + """ + + def __init__(self, accessor_cls, construct_accessor=None): + self.accessor_cls = accessor_cls + self.construct_accessor = (construct_accessor or + accessor_cls._make_accessor) + self.__doc__ = accessor_cls.__doc__ + + def __get__(self, instance, owner=None): + if instance is None: + # this ensures that Series.str. 
is well defined + return self.accessor_cls + return self.construct_accessor(instance) + + def __set__(self, instance, value): + raise AttributeError("can't set attribute") + + def __delete__(self, instance): + raise AttributeError("can't delete attribute") + + +class PandasDelegate(object): + """ an abstract base class for delegating methods/properties """ + + @classmethod + def _make_accessor(cls, data): + raise AbstractMethodError("_make_accessor should be implemented" + "by subclass and return an instance" + "of `cls`.") + + def _delegate_property_get(self, name, *args, **kwargs): + raise TypeError("You cannot access the " + "property {name}".format(name=name)) + + def _delegate_property_set(self, name, value, *args, **kwargs): + raise TypeError("The property {name} cannot be set".format(name=name)) + + def _delegate_method(self, name, *args, **kwargs): + raise TypeError("You cannot call method {name}".format(name=name)) + + @classmethod + def _add_delegate_accessors(cls, delegate, accessors, typ, + overwrite=False): + """ + add accessors to cls from the delegate class + + Parameters + ---------- + cls : the class to add the methods/properties to + delegate : the class to get methods/properties & doc-strings + acccessors : string list of accessors to add + typ : 'property' or 'method' + overwrite : boolean, default False + overwrite the method/property in the target class if it exists + """ + + def _create_delegator_property(name): + + def _getter(self): + return self._delegate_property_get(name) + + def _setter(self, new_values): + return self._delegate_property_set(name, new_values) + + _getter.__name__ = name + _setter.__name__ = name + + return property(fget=_getter, fset=_setter, + doc=getattr(delegate, name).__doc__) + + def _create_delegator_method(name): + + def f(self, *args, **kwargs): + return self._delegate_method(name, *args, **kwargs) + + f.__name__ = name + f.__doc__ = getattr(delegate, name).__doc__ + + return f + + for name in accessors: + + if typ == 'property': + f = _create_delegator_property(name) + else: + f = _create_delegator_method(name) + + # don't overwrite existing methods/properties + if overwrite or not hasattr(cls, name): + setattr(cls, name, f) diff --git a/pandas/core/base.py b/pandas/core/base.py index be021f3621c735..19f67286426450 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -153,100 +153,6 @@ def __setattr__(self, key, value): object.__setattr__(self, key, value) -class PandasDelegate(PandasObject): - """ an abstract base class for delegating methods/properties """ - - @classmethod - def _make_accessor(cls, data): - raise AbstractMethodError("_make_accessor should be implemented" - "by subclass and return an instance" - "of `cls`.") - - def _delegate_property_get(self, name, *args, **kwargs): - raise TypeError("You cannot access the " - "property {name}".format(name=name)) - - def _delegate_property_set(self, name, value, *args, **kwargs): - raise TypeError("The property {name} cannot be set".format(name=name)) - - def _delegate_method(self, name, *args, **kwargs): - raise TypeError("You cannot call method {name}".format(name=name)) - - @classmethod - def _add_delegate_accessors(cls, delegate, accessors, typ, - overwrite=False): - """ - add accessors to cls from the delegate class - - Parameters - ---------- - cls : the class to add the methods/properties to - delegate : the class to get methods/properties & doc-strings - acccessors : string list of accessors to add - typ : 'property' or 'method' - overwrite : boolean, default False 
- overwrite the method/property in the target class if it exists - """ - - def _create_delegator_property(name): - - def _getter(self): - return self._delegate_property_get(name) - - def _setter(self, new_values): - return self._delegate_property_set(name, new_values) - - _getter.__name__ = name - _setter.__name__ = name - - return property(fget=_getter, fset=_setter, - doc=getattr(delegate, name).__doc__) - - def _create_delegator_method(name): - - def f(self, *args, **kwargs): - return self._delegate_method(name, *args, **kwargs) - - f.__name__ = name - f.__doc__ = getattr(delegate, name).__doc__ - - return f - - for name in accessors: - - if typ == 'property': - f = _create_delegator_property(name) - else: - f = _create_delegator_method(name) - - # don't overwrite existing methods/properties - if overwrite or not hasattr(cls, name): - setattr(cls, name, f) - - -class AccessorProperty(object): - """Descriptor for implementing accessor properties like Series.str - """ - - def __init__(self, accessor_cls, construct_accessor=None): - self.accessor_cls = accessor_cls - self.construct_accessor = (construct_accessor or - accessor_cls._make_accessor) - self.__doc__ = accessor_cls.__doc__ - - def __get__(self, instance, owner=None): - if instance is None: - # this ensures that Series.str. is well defined - return self.accessor_cls - return self.construct_accessor(instance) - - def __set__(self, instance, value): - raise AttributeError("can't set attribute") - - def __delete__(self, instance): - raise AttributeError("can't delete attribute") - - class GroupByError(Exception): pass diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 98d6d7a68017ad..743bae2fd2848c 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -30,7 +30,8 @@ from pandas.core.common import is_null_slice, _maybe_box_datetimelike from pandas.core.algorithms import factorize, take_1d, unique1d -from pandas.core.base import (PandasObject, PandasDelegate, +from pandas.core.accessor import PandasDelegate +from pandas.core.base import (PandasObject, NoNewAttributesMixin, _shared_docs) import pandas.core.common as com from pandas.core.missing import interpolate_2d @@ -2065,7 +2066,7 @@ def repeat(self, repeats, *args, **kwargs): # The Series.cat accessor -class CategoricalAccessor(PandasDelegate, NoNewAttributesMixin): +class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): """ Accessor object for categorical properties of the Series values. 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 346eeb8d2642cd..899ae99d5deb1f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -90,7 +90,7 @@ from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex -import pandas.core.base as base +from pandas.core import accessor import pandas.core.common as com import pandas.core.nanops as nanops import pandas.core.ops as ops @@ -5897,7 +5897,8 @@ def isin(self, values): # ---------------------------------------------------------------------- # Add plotting methods to DataFrame - plot = base.AccessorProperty(gfx.FramePlotMethods, gfx.FramePlotMethods) + plot = accessor.AccessorProperty(gfx.FramePlotMethods, + gfx.FramePlotMethods) hist = gfx.hist_frame boxplot = gfx.boxplot_frame diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 88297ac70984dd..2176338574304a 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -11,7 +11,8 @@ is_timedelta64_dtype, is_categorical_dtype, is_list_like) -from pandas.core.base import PandasDelegate, NoNewAttributesMixin +from pandas.core.accessor import PandasDelegate +from pandas.core.base import NoNewAttributesMixin, PandasObject from pandas.core.indexes.datetimes import DatetimeIndex from pandas._libs.period import IncompatibleFrequency # noqa from pandas.core.indexes.period import PeriodIndex @@ -81,7 +82,7 @@ def maybe_to_datetimelike(data, copy=False): "datetimelike index".format(type(data))) -class Properties(PandasDelegate, NoNewAttributesMixin): +class Properties(PandasDelegate, PandasObject, NoNewAttributesMixin): def __init__(self, values, index, name, orig=None): self.values = values diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 562a758f83edc7..f28ff9697e517f 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -57,7 +57,7 @@ import pandas.core.sorting as sorting from pandas.io.formats.printing import pprint_thing from pandas.core.ops import _comp_method_OBJECT_ARRAY -from pandas.core import strings +from pandas.core import strings, accessor from pandas.core.config import get_option @@ -159,7 +159,7 @@ class Index(IndexOpsMixin, PandasObject): _accessors = frozenset(['str']) # String Methods - str = base.AccessorProperty(strings.StringMethods) + str = accessor.AccessorProperty(strings.StringMethods) def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, tupleize_cols=True, **kwargs): diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 9a055afccd7997..8b680127723c32 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -19,6 +19,7 @@ from pandas.util._decorators import Appender, cache_readonly from pandas.core.config import get_option from pandas.core.indexes.base import Index, _index_shared_docs +from pandas.core import accessor import pandas.core.base as base import pandas.core.missing as missing import pandas.core.indexes.base as ibase @@ -27,7 +28,7 @@ _index_doc_kwargs.update(dict(target_klass='CategoricalIndex')) -class CategoricalIndex(Index, base.PandasDelegate): +class CategoricalIndex(Index, accessor.PandasDelegate): """ Immutable Index implementing an ordered, sliceable set. 
CategoricalIndex diff --git a/pandas/core/series.py b/pandas/core/series.py index ea9aeefe3b6651..db8ee2529ef577 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -62,6 +62,7 @@ from pandas.compat import zip, u, OrderedDict, StringIO from pandas.compat.numpy import function as nv +from pandas.core import accessor import pandas.core.ops as ops import pandas.core.algorithms as algorithms @@ -2901,19 +2902,19 @@ def to_period(self, freq=None, copy=True): # ------------------------------------------------------------------------- # Datetimelike delegation methods - dt = base.AccessorProperty(CombinedDatetimelikeProperties) + dt = accessor.AccessorProperty(CombinedDatetimelikeProperties) # ------------------------------------------------------------------------- # Categorical methods - cat = base.AccessorProperty(CategoricalAccessor) + cat = accessor.AccessorProperty(CategoricalAccessor) # String Methods - str = base.AccessorProperty(strings.StringMethods) + str = accessor.AccessorProperty(strings.StringMethods) # ---------------------------------------------------------------------- # Add plotting methods to Series - plot = base.AccessorProperty(gfx.SeriesPlotMethods, - gfx.SeriesPlotMethods) + plot = accessor.AccessorProperty(gfx.SeriesPlotMethods, + gfx.SeriesPlotMethods) hist = gfx.hist_series diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 38d78b12b31aa5..5bfd8eb7eae248 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -18,7 +18,8 @@ CategoricalIndex, Timestamp) from pandas.compat import StringIO, PYPY, long from pandas.compat.numpy import np_array_datetime64_compat -from pandas.core.base import PandasDelegate, NoNewAttributesMixin +from pandas.core.accessor import PandasDelegate +from pandas.core.base import PandasObject, NoNewAttributesMixin from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin from pandas._libs.tslib import iNaT @@ -105,7 +106,7 @@ def bar(self, *args, **kwargs): """ a test bar method """ pass - class Delegate(PandasDelegate): + class Delegate(PandasDelegate, PandasObject): def __init__(self, obj): self.obj = obj From 965c1c89b6df471d88dc0e1188fb8cbc0d89f867 Mon Sep 17 00:00:00 2001 From: Bob Haffner Date: Sun, 24 Sep 2017 08:22:13 -0500 Subject: [PATCH 140/188] preserve kwargs order on assign func for py36plus - #14207 (#17632) --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/frame.py | 23 ++++++++++++++--------- pandas/tests/frame/test_mutate_columns.py | 16 +++++++++++++--- 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 49d2c1767807c4..1365901c2ce5e3 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -162,6 +162,7 @@ Other Enhancements - :func:`MultiIndex.is_monotonic_decreasing` has been implemented. Previously returned ``False`` in all cases. (:issue:`16554`) - :func:`Categorical.rename_categories` now accepts a dict-like argument as `new_categories` and only updates the categories found in that dict. (:issue:`17336`) - :func:`read_excel` raises ``ImportError`` with a better message if ``xlrd`` is not installed. (:issue:`17613`) +- :meth:`DataFrame.assign` will preserve the original order of ``**kwargs`` for Python 3.6+ users instead of sorting the column names .. 
_whatsnew_0210.api_breaking: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 899ae99d5deb1f..912dbdb9de7059 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -82,6 +82,7 @@ from pandas.compat import (range, map, zip, lrange, lmap, lzip, StringIO, u, OrderedDict, raise_with_traceback) from pandas import compat +from pandas.compat import PY36 from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution from pandas.util._validators import validate_bool_kwarg @@ -2575,12 +2576,12 @@ def assign(self, **kwargs): Notes ----- - Since ``kwargs`` is a dictionary, the order of your - arguments may not be preserved. To make things predicatable, - the columns are inserted in alphabetical order, at the end of - your DataFrame. Assigning multiple columns within the same - ``assign`` is possible, but you cannot reference other columns - created within the same ``assign`` call. + For python 3.6 and above, the columns are inserted in the order of + **kwargs. For python 3.5 and earlier, since **kwargs is unordered, + the columns are inserted in alphabetical order at the end of your + DataFrame. Assigning multiple columns within the same ``assign`` + is possible, but you cannot reference other columns created within + the same ``assign`` call. Examples -------- @@ -2620,14 +2621,18 @@ def assign(self, **kwargs): data = self.copy() # do all calculations first... - results = {} + results = OrderedDict() for k, v in kwargs.items(): results[k] = com._apply_if_callable(v, data) + # preserve order for 3.6 and later, but sort by key for 3.5 and earlier + if PY36: + results = results.items() + else: + results = sorted(results.items()) # ... and then assign - for k, v in sorted(results.items()): + for k, v in results: data[k] = v - return data def _sanitize_column(self, key, value, broadcast=True): diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 4462260a290d9b..0043475702f94b 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -4,6 +4,7 @@ import pytest from pandas.compat import range, lrange import numpy as np +from pandas.compat import PY36 from pandas import DataFrame, Series, Index, MultiIndex @@ -61,14 +62,23 @@ def test_assign_multiple(self): [3, 6, 9, 3, 6]], columns=list('ABCDE')) assert_frame_equal(result, expected) - def test_assign_alphabetical(self): + def test_assign_order(self): # GH 9818 df = DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) result = df.assign(D=df.A + df.B, C=df.A - df.B) - expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], - columns=list('ABCD')) + + if PY36: + expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]], + columns=list('ABDC')) + else: + expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], + columns=list('ABCD')) assert_frame_equal(result, expected) result = df.assign(C=df.A - df.B, D=df.A + df.B) + + expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], + columns=list('ABCD')) + assert_frame_equal(result, expected) def test_assign_bad(self): From e0743a1b9725c1bb63c738f6e730f52e269095ac Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 24 Sep 2017 14:42:53 -0400 Subject: [PATCH 141/188] TST: install cython from pip for 3.6_NUMPY_DEV build (#17657) --- ci/requirements-3.6_NUMPY_DEV.build | 1 - ci/requirements-3.6_NUMPY_DEV.build.sh | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ci/requirements-3.6_NUMPY_DEV.build b/ci/requirements-3.6_NUMPY_DEV.build index 
900c050f1cc9ef..336fbe86b57d88 100644 --- a/ci/requirements-3.6_NUMPY_DEV.build +++ b/ci/requirements-3.6_NUMPY_DEV.build @@ -1,3 +1,2 @@ python=3.6* pytz -cython diff --git a/ci/requirements-3.6_NUMPY_DEV.build.sh b/ci/requirements-3.6_NUMPY_DEV.build.sh index 90ed04f8f0c17a..fd79142c5cebbe 100644 --- a/ci/requirements-3.6_NUMPY_DEV.build.sh +++ b/ci/requirements-3.6_NUMPY_DEV.build.sh @@ -14,4 +14,7 @@ pip install --pre --upgrade --timeout=60 -f $PRE_WHEELS numpy scipy # install dateutil from master pip install -U git+git://github.com/dateutil/dateutil.git +# cython via pip +pip install cython + true From 6da85b30d989855fe2a1f5d1323189f0fc639e60 Mon Sep 17 00:00:00 2001 From: Licht Takeuchi Date: Mon, 25 Sep 2017 04:06:42 +0900 Subject: [PATCH 142/188] TST: Fix repeat parameter overwritten the sparse asv test (#17659) --- asv_bench/benchmarks/sparse.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index 7259e8cdb7d614..b958f5e0e5c342 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -1,4 +1,4 @@ -from itertools import repeat +import itertools from .pandas_vb_common import * import scipy.sparse @@ -33,7 +33,7 @@ def time_sparse_from_scipy(self): SparseDataFrame(scipy.sparse.rand(1000, 1000, 0.005)) def time_sparse_from_dict(self): - SparseDataFrame(dict(zip(range(1000), repeat([0])))) + SparseDataFrame(dict(zip(range(1000), itertools.repeat([0])))) class sparse_series_from_coo(object): From 0d06216e9aad8572350395b524a591e93c094836 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Mon, 25 Sep 2017 09:13:10 +0200 Subject: [PATCH 143/188] DOC: fixed errors in doc string for Categorical + cleanup (#17655) --- pandas/core/categorical.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 743bae2fd2848c..8b055e9ae59c3a 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -197,34 +197,34 @@ class Categorical(PandasObject): Examples -------- - >>> from pandas import Categorical - >>> Categorical([1, 2, 3, 1, 2, 3]) + >>> pd.Categorical([1, 2, 3, 1, 2, 3]) [1, 2, 3, 1, 2, 3] - Categories (3, int64): [1 < 2 < 3] + Categories (3, int64): [1, 2, 3] - >>> Categorical(['a', 'b', 'c', 'a', 'b', 'c']) + >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c']) [a, b, c, a, b, c] - Categories (3, object): [a < b < c] + Categories (3, object): [a, b, c] - Only ordered `Categoricals` can be sorted (according to the order - of the categories) and have a min and max value. + Ordered `Categoricals` can be sorted according to the custom order + of the categories and can have a min and max value. - >>> a = Categorical(['a','b','c','a','b','c'], ['c', 'b', 'a'], - ordered=True) - >>> a.min() + >>> c = pd.Categorical(['a','b','c','a','b','c'], ordered=True, + ... categories=['c', 'b', 'a']) + >>> c + [a, b, c, a, b, c] + Categories (3, object): [c < b < a] + >>> c.min() 'c' Notes ----- - See the :ref:`user guide ` for more. + See the `user guide + `_ for more. 
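One point the revised docstring states but does not demonstrate: ``min``/``max`` are only defined for ordered categoricals, and an unordered one raises (a quick check; the exact message varies by version):

    import pandas as pd

    unordered = pd.Categorical(['a', 'b', 'c', 'a'])
    try:
        unordered.min()
    except TypeError as err:
        print(err)  # suggests .as_ordered() to opt in to an ordering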
See also -------- - Categorical.sort - Categorical.order - Categorical.min - Categorical.max pandas.api.types.CategoricalDtype + CategoricalIndex : An Index with an underlying ``Categorical`` """ # For comparisons, so that numpy uses our implementation if the compare From 35bcd260a6e01fdb41c0e8b73c47db286250694b Mon Sep 17 00:00:00 2001 From: topper-123 Date: Mon, 25 Sep 2017 10:12:34 +0200 Subject: [PATCH 144/188] DOC: Added example to MultiIndex doc string (#17653) --- pandas/core/indexes/multi.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 0b7c5f414b1789..8c6b26c9070a9c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -68,6 +68,33 @@ class MultiIndex(Index): Copy the meta-data verify_integrity : boolean, default True Check that the levels/labels are consistent and valid + + Examples + --------- + A new ``MultiIndex`` is typically constructed using one of the helper + methods :meth:`MultiIndex.from_arrays``, :meth:`MultiIndex.from_product`` + and :meth:`MultiIndex.from_tuples``. For example (using ``.from_arrays``): + + >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] + >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + MultiIndex(levels=[[1, 2], ['blue', 'red']], + labels=[[0, 0, 1, 1], [1, 0, 1, 0]], + names=['number', 'color']) + + See further examples for how to construct a MultiIndex in the doc strings + of the mentioned helper methods. + + Notes + ----- + See the `user guide + `_ for more. + + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex + MultiIndex.from_product : Create a MultiIndex from the cartesian product + of iterables + MultiIndex.from_tuples : Convert list of tuples to a MultiIndex """ # initialize to zero-length tuples to make everything work From 0e2ce9a6001b8b9b8e5ba7ab2e57ea9201c74e8f Mon Sep 17 00:00:00 2001 From: topper-123 Date: Mon, 25 Sep 2017 10:13:55 +0200 Subject: [PATCH 145/188] DOC: Change plot style to matplotlib default from ggplot (#17462) --- doc/source/10min.rst | 2 +- doc/source/computation.rst | 2 +- doc/source/cookbook.rst | 2 +- doc/source/dsintro.rst | 2 +- doc/source/gotchas.rst | 2 +- doc/source/groupby.rst | 2 +- doc/source/missing_data.rst | 2 +- doc/source/visualization.rst | 28 ++++++++++++++++++---------- 8 files changed, 25 insertions(+), 17 deletions(-) diff --git a/doc/source/10min.rst b/doc/source/10min.rst index ef6b2d6ef2c904..0a23f490e66283 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -11,7 +11,7 @@ np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) import matplotlib - matplotlib.style.use('ggplot') + # matplotlib.style.use('default') pd.options.display.max_rows = 15 #### portions of this were borrowed from the diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 14cfdbc3648375..466ac3c9cbf51b 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -8,7 +8,7 @@ np.set_printoptions(precision=4, suppress=True) import pandas as pd import matplotlib - matplotlib.style.use('ggplot') + # matplotlib.style.use('default') import matplotlib.pyplot as plt plt.close('all') pd.options.display.max_rows=15 diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 5bb3ba75fe51bc..f13e5e67de07e6 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -20,7 +20,7 @@ pd.options.display.max_rows=15 import matplotlib - matplotlib.style.use('ggplot') + # 
matplotlib.style.use('default') np.set_printoptions(precision=4, suppress=True) diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index ec0a1c7a00bf74..e5c7637ddb4993 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -10,7 +10,7 @@ pd.options.display.max_rows = 15 import matplotlib - matplotlib.style.use('ggplot') + # matplotlib.style.use('default') import matplotlib.pyplot as plt plt.close('all') diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst index 9e6f98923fca6c..8ae830d7fd76b2 100644 --- a/doc/source/gotchas.rst +++ b/doc/source/gotchas.rst @@ -14,7 +14,7 @@ Frequently Asked Questions (FAQ) import pandas as pd pd.options.display.max_rows = 15 import matplotlib - matplotlib.style.use('ggplot') + # matplotlib.style.use('default') import matplotlib.pyplot as plt plt.close('all') diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 91d806ca5dd4f8..175ea281226062 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -10,7 +10,7 @@ import pandas as pd pd.options.display.max_rows = 15 import matplotlib - matplotlib.style.use('ggplot') + # matplotlib.style.use('default') import matplotlib.pyplot as plt plt.close('all') from collections import OrderedDict diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index b33b5c304853ae..07740d66a21865 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -7,7 +7,7 @@ import pandas as pd pd.options.display.max_rows=15 import matplotlib - matplotlib.style.use('ggplot') + # matplotlib.style.use('default') import matplotlib.pyplot as plt .. _missing_data: diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index 82ad8de93514e2..7db3b63fd8f08a 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -10,7 +10,7 @@ np.set_printoptions(precision=4, suppress=True) pd.options.display.max_rows = 15 import matplotlib - matplotlib.style.use('ggplot') + # matplotlib.style.use('default') import matplotlib.pyplot as plt plt.close('all') @@ -24,13 +24,6 @@ We use the standard convention for referencing the matplotlib API: import matplotlib.pyplot as plt -The plots in this document are made using matplotlib's ``ggplot`` style (new in version 1.4): - -.. code-block:: python - - import matplotlib - matplotlib.style.use('ggplot') - We provide the basics in pandas to easily create decent looking plots. See the :ref:`ecosystem ` section for visualization libraries that go beyond the basics documented here. @@ -134,7 +127,7 @@ For example, a bar plot can be created the following way: plt.figure(); @savefig bar_plot_ex.png - df.iloc[5].plot(kind='bar'); plt.axhline(0, color='k') + df.iloc[5].plot(kind='bar'); .. versionadded:: 0.17.0 @@ -154,7 +147,7 @@ and :ref:`DataFrame.boxplot() ` methods, which use a separate Finally, there are several :ref:`plotting functions ` in ``pandas.plotting`` that take a :class:`Series` or :class:`DataFrame` as an argument. These -include +include: * :ref:`Scatter Matrix ` * :ref:`Andrews Curves ` @@ -1049,6 +1042,21 @@ be colored differently. Plot Formatting --------------- +Setting the plot style +~~~~~~~~~~~~~~~~~~~~~~ + +From version 1.5 and up, matplotlib offers a range of preconfigured plotting styles. Setting the +style can be used to easily give plots the general look that you want. +Setting the style is as easy as calling ``matplotlib.style.use(my_plot_style)`` before +creating your plot. For example you could do ``matplotlib.style.use('ggplot')`` for ggplot-style +plots. 
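The new section is easy to try out interactively; a minimal sketch (the available style names vary with the installed matplotlib version):

    import matplotlib.style
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd

    print(matplotlib.style.available[:5])  # e.g. ['bmh', 'classic', ...]
    matplotlib.style.use('ggplot')         # opt back in to the old look

    pd.Series(np.random.randn(100).cumsum()).plot()
    plt.show()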
+ +You can see the various available style names at ``matplotlib.style.available`` and it's very +easy to try them out. + +General plot style arguments +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Most plotting methods have a set of keyword arguments that control the layout and formatting of the returned plot: From 42195dbdc4e3c703f336dc618aa64f6efc4e4977 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 25 Sep 2017 03:07:46 -0700 Subject: [PATCH 146/188] Separate out strptime.pyx from tslib (#17342) --- pandas/_libs/__init__.py | 1 + pandas/_libs/src/datetime.pxd | 15 + pandas/_libs/src/datetime/np_datetime.c | 6 + pandas/_libs/src/datetime/np_datetime.h | 3 + pandas/_libs/tslib.pyx | 610 +--------------------- pandas/_libs/tslibs/strptime.pyx | 640 ++++++++++++++++++++++++ pandas/core/tools/datetimes.py | 5 +- setup.py | 5 +- 8 files changed, 675 insertions(+), 610 deletions(-) create mode 100644 pandas/_libs/tslibs/strptime.pyx diff --git a/pandas/_libs/__init__.py b/pandas/_libs/__init__.py index ab3832d0292ba4..b4c3ff8008015f 100644 --- a/pandas/_libs/__init__.py +++ b/pandas/_libs/__init__.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # flake8: noqa from .tslib import iNaT, NaT, Timestamp, Timedelta, OutOfBoundsDatetime diff --git a/pandas/_libs/src/datetime.pxd b/pandas/_libs/src/datetime.pxd index 23620e790c1323..86c8f3bfc74f3b 100644 --- a/pandas/_libs/src/datetime.pxd +++ b/pandas/_libs/src/datetime.pxd @@ -94,6 +94,7 @@ cdef extern from "datetime/np_datetime.h": PANDAS_DATETIMEUNIT fr, pandas_datetimestruct *result) nogil int days_per_month_table[2][12] + pandas_datetimestruct _NS_MIN_DTS, _NS_MAX_DTS int dayofweek(int y, int m, int d) nogil int is_leapyear(int64_t year) nogil @@ -161,3 +162,17 @@ cdef inline int64_t _date_to_datetime64(object val, dts.hour = dts.min = dts.sec = dts.us = 0 dts.ps = dts.as = 0 return pandas_datetimestruct_to_datetime(PANDAS_FR_ns, dts) + + +cdef inline bint check_dts_bounds(pandas_datetimestruct *dts): + """Returns True if an error needs to be raised""" + cdef: + bint error = False + + if (dts.year <= 1677 and + cmp_pandas_datetimestruct(dts, &_NS_MIN_DTS) == -1): + error = True + elif (dts.year >= 2262 and + cmp_pandas_datetimestruct(dts, &_NS_MAX_DTS) == 1): + error = True + return error diff --git a/pandas/_libs/src/datetime/np_datetime.c b/pandas/_libs/src/datetime/np_datetime.c index 84584189888636..ffb901981f939f 100644 --- a/pandas/_libs/src/datetime/np_datetime.c +++ b/pandas/_libs/src/datetime/np_datetime.c @@ -40,6 +40,12 @@ This file is derived from NumPy 1.7. 
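The ``_NS_MIN_DTS``/``_NS_MAX_DTS`` structs that the new ``check_dts_bounds`` helper compares against correspond to ``Timestamp.min``/``Timestamp.max`` on the Python side (values shown are for the default nanosecond resolution):

    import pandas as pd

    print(pd.Timestamp.min)  # 1677-09-21 00:12:43.145225
    print(pd.Timestamp.max)  # 2262-04-11 23:47:16.854775807

    try:
        pd.Timestamp('1500-01-01')
    except ValueError as err:  # OutOfBoundsDatetime subclasses ValueError
        print(err)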
See NUMPY_LICENSE.txt #define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask #endif +const pandas_datetimestruct _NS_MIN_DTS = { + 1677, 9, 21, 0, 12, 43, 145225, 0, 0}; +const pandas_datetimestruct _NS_MAX_DTS = { + 2262, 4, 11, 23, 47, 16, 854775, 807000, 0}; + + const int days_per_month_table[2][12] = { {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}}; diff --git a/pandas/_libs/src/datetime/np_datetime.h b/pandas/_libs/src/datetime/np_datetime.h index 97ec5782b625b5..a20bff60126aac 100644 --- a/pandas/_libs/src/datetime/np_datetime.h +++ b/pandas/_libs/src/datetime/np_datetime.h @@ -54,6 +54,9 @@ typedef struct { int num; } pandas_datetime_metadata; +extern const pandas_datetimestruct _NS_MIN_DTS; +extern const pandas_datetimestruct _NS_MAX_DTS; + // stuff pandas needs // ---------------------------------------------------------------------------- diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index c629ccbd8e1fd4..d4ca5af09367eb 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -50,6 +50,7 @@ from datetime cimport ( npy_datetime, is_leapyear, dayofweek, + check_dts_bounds, PANDAS_FR_ns, PyDateTime_Check, PyDate_Check, PyDateTime_IMPORT, @@ -69,6 +70,7 @@ from khash cimport ( cimport cython import re +import time # dateutil compat from dateutil.tz import (tzoffset, tzlocal as _dateutil_tzlocal, @@ -1691,21 +1693,10 @@ class OutOfBoundsDatetime(ValueError): pass cdef inline _check_dts_bounds(pandas_datetimestruct *dts): - cdef: - bint error = False - - if dts.year <= 1677 and cmp_pandas_datetimestruct(dts, &_NS_MIN_DTS) == -1: - error = True - elif ( - dts.year >= 2262 and - cmp_pandas_datetimestruct(dts, &_NS_MAX_DTS) == 1): - error = True - - if error: + if check_dts_bounds(dts): fmt = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec) - raise OutOfBoundsDatetime( 'Out of bounds nanosecond timestamp: %s' % fmt) @@ -3515,284 +3506,6 @@ cpdef convert_to_timedelta64(object ts, object unit): return ts.astype('timedelta64[ns]') -def array_strptime(ndarray[object] values, object fmt, - bint exact=True, errors='raise'): - """ - Parameters - ---------- - values : ndarray of string-like objects - fmt : string-like regex - exact : matches must be exact if True, search if False - coerce : if invalid values found, coerce to NaT - """ - - cdef: - Py_ssize_t i, n = len(values) - pandas_datetimestruct dts - ndarray[int64_t] iresult - int year, month, day, minute, hour, second, weekday, julian, tz - int week_of_year, week_of_year_start - int64_t us, ns - object val, group_key, ampm, found - dict found_key - bint is_raise = errors=='raise' - bint is_ignore = errors=='ignore' - bint is_coerce = errors=='coerce' - - assert is_raise or is_ignore or is_coerce - - global _TimeRE_cache, _regex_cache - with _cache_lock: - if _getlang() != _TimeRE_cache.locale_time.lang: - _TimeRE_cache = TimeRE() - _regex_cache.clear() - if len(_regex_cache) > _CACHE_MAX_SIZE: - _regex_cache.clear() - locale_time = _TimeRE_cache.locale_time - format_regex = _regex_cache.get(fmt) - if not format_regex: - try: - format_regex = _TimeRE_cache.compile(fmt) - # KeyError raised when a bad format is found; can be specified as - # \\, in which case it was a stray % but with a space after it - except KeyError, err: - bad_directive = err.args[0] - if bad_directive == "\\": - bad_directive = "%" - del err - raise ValueError("'%s' is a bad directive in format '%s'" % - (bad_directive, fmt)) - # IndexError 
only occurs when the format string is "%" - except IndexError: - raise ValueError("stray %% in format '%s'" % fmt) - _regex_cache[fmt] = format_regex - - result = np.empty(n, dtype='M8[ns]') - iresult = result.view('i8') - - dts.us = dts.ps = dts.as = 0 - - cdef dict _parse_code_table = { - 'y': 0, - 'Y': 1, - 'm': 2, - 'B': 3, - 'b': 4, - 'd': 5, - 'H': 6, - 'I': 7, - 'M': 8, - 'S': 9, - 'f': 10, - 'A': 11, - 'a': 12, - 'w': 13, - 'j': 14, - 'U': 15, - 'W': 16, - 'Z': 17, - 'p': 18 # just an additional key, works only with I - } - cdef int parse_code - - for i in range(n): - val = values[i] - if util.is_string_object(val): - if val in _nat_strings: - iresult[i] = NPY_NAT - continue - else: - if _checknull_with_nat(val): - iresult[i] = NPY_NAT - continue - else: - val = str(val) - - # exact matching - if exact: - found = format_regex.match(val) - if not found: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise ValueError("time data %r does not match " - "format %r (match)" % (values[i], fmt)) - if len(val) != found.end(): - if is_coerce: - iresult[i] = NPY_NAT - continue - raise ValueError("unconverted data remains: %s" % - values[i][found.end():]) - - # search - else: - found = format_regex.search(val) - if not found: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise ValueError("time data %r does not match format " - "%r (search)" % (values[i], fmt)) - - year = 1900 - month = day = 1 - hour = minute = second = ns = us = 0 - tz = -1 - # Default to -1 to signify that values not known; not critical to have, - # though - week_of_year = -1 - week_of_year_start = -1 - # weekday and julian defaulted to -1 so as to signal need to calculate - # values - weekday = julian = -1 - found_dict = found.groupdict() - for group_key in found_dict.iterkeys(): - # Directives not explicitly handled below: - # c, x, X - # handled by making out of other directives - # U, W - # worthless without day of the week - parse_code = _parse_code_table[group_key] - - if parse_code == 0: - year = int(found_dict['y']) - # Open Group specification for strptime() states that a %y - #value in the range of [00, 68] is in the century 2000, while - #[69,99] is in the century 1900 - if year <= 68: - year += 2000 - else: - year += 1900 - elif parse_code == 1: - year = int(found_dict['Y']) - elif parse_code == 2: - month = int(found_dict['m']) - elif parse_code == 3: - # elif group_key == 'B': - month = locale_time.f_month.index(found_dict['B'].lower()) - elif parse_code == 4: - # elif group_key == 'b': - month = locale_time.a_month.index(found_dict['b'].lower()) - elif parse_code == 5: - # elif group_key == 'd': - day = int(found_dict['d']) - elif parse_code == 6: - # elif group_key == 'H': - hour = int(found_dict['H']) - elif parse_code == 7: - hour = int(found_dict['I']) - ampm = found_dict.get('p', '').lower() - # If there was no AM/PM indicator, we'll treat this like AM - if ampm in ('', locale_time.am_pm[0]): - # We're in AM so the hour is correct unless we're - # looking at 12 midnight. - # 12 midnight == 12 AM == hour 0 - if hour == 12: - hour = 0 - elif ampm == locale_time.am_pm[1]: - # We're in PM so we need to add 12 to the hour unless - # we're looking at 12 noon. 
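The 12-hour-clock handling being removed here reappears verbatim in the new module later in this patch; the standard library follows the same rule, which makes it easy to sanity-check:

    from datetime import datetime

    # %I is the 12-hour clock and %p the AM/PM marker: 12 AM -> hour 0,
    # 12 PM -> hour 12, and other PM hours get 12 added.
    for s in ('12:30 AM', '01:15 AM', '12:00 PM', '07:45 PM'):
        print(s, '->', datetime.strptime(s, '%I:%M %p').time())
    # 00:30:00  01:15:00  12:00:00  19:45:00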
- # 12 noon == 12 PM == hour 12 - if hour != 12: - hour += 12 - elif parse_code == 8: - minute = int(found_dict['M']) - elif parse_code == 9: - second = int(found_dict['S']) - elif parse_code == 10: - s = found_dict['f'] - # Pad to always return nanoseconds - s += "0" * (9 - len(s)) - us = long(s) - ns = us % 1000 - us = us / 1000 - elif parse_code == 11: - weekday = locale_time.f_weekday.index(found_dict['A'].lower()) - elif parse_code == 12: - weekday = locale_time.a_weekday.index(found_dict['a'].lower()) - elif parse_code == 13: - weekday = int(found_dict['w']) - if weekday == 0: - weekday = 6 - else: - weekday -= 1 - elif parse_code == 14: - julian = int(found_dict['j']) - elif parse_code == 15 or parse_code == 16: - week_of_year = int(found_dict[group_key]) - if group_key == 'U': - # U starts week on Sunday. - week_of_year_start = 6 - else: - # W starts week on Monday. - week_of_year_start = 0 - elif parse_code == 17: - # Since -1 is default value only need to worry about setting tz - # if it can be something other than -1. - found_zone = found_dict['Z'].lower() - for value, tz_values in enumerate(locale_time.timezone): - if found_zone in tz_values: - # Deal w/ bad locale setup where timezone names are the - # same and yet time.daylight is true; too ambiguous to - # be able to tell what timezone has daylight savings - if (time.tzname[0] == time.tzname[1] and - time.daylight and found_zone not in ( - "utc", "gmt")): - break - else: - tz = value - break - # If we know the wk of the year and what day of that wk, we can figure - # out the Julian day of the year. - if julian == -1 and week_of_year != -1 and weekday != -1: - week_starts_Mon = True if week_of_year_start == 0 else False - julian = _calc_julian_from_U_or_W(year, week_of_year, weekday, - week_starts_Mon) - # Cannot pre-calculate datetime_date() since can change in Julian - # calculation and thus could have different value for the day of the wk - # calculation. - try: - if julian == -1: - # Need to add 1 to result since first day of the year is 1, not - # 0. - julian = datetime_date(year, month, day).toordinal() - \ - datetime_date(year, 1, 1).toordinal() + 1 - else: # Assume that if they bothered to include Julian day it will - # be accurate. - datetime_result = datetime_date.fromordinal( - (julian - 1) + datetime_date(year, 1, 1).toordinal()) - year = datetime_result.year - month = datetime_result.month - day = datetime_result.day - except ValueError: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise - if weekday == -1: - weekday = datetime_date(year, month, day).weekday() - - dts.year = year - dts.month = month - dts.day = day - dts.hour = hour - dts.min = minute - dts.sec = second - dts.us = us - dts.ps = ns * 1000 - - iresult[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) - try: - _check_dts_bounds(&dts) - except ValueError: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise - - return result - - #---------------------------------------------------------------------- # NaT methods/property setups @@ -5176,320 +4889,3 @@ def shift_months(int64_t[:] dtindex, int months, object day=None): raise ValueError("day must be None, 'start' or 'end'") return np.asarray(out) - -#---------------------------------------------------------------------- -# Don't even ask - -"""Strptime-related classes and functions. 
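In practice all of this vendored machinery is reached through ``pd.to_datetime`` with an explicit ``format``; for example (output abridged):

    import pandas as pd

    # errors='coerce' maps unparseable strings to NaT rather than raising,
    # mirroring the is_coerce branches in array_strptime.
    idx = pd.to_datetime(['2017-09-25 10:00', 'not a date'],
                         format='%Y-%m-%d %H:%M', errors='coerce')
    print(idx)  # DatetimeIndex(['2017-09-25 10:00:00', 'NaT'], ...)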
- -CLASSES: - LocaleTime -- Discovers and stores locale-specific time information - TimeRE -- Creates regexes for pattern matching a string of text containing - time information - -FUNCTIONS: - _getlang -- Figure out what language is being used for the locale - strptime -- Calculates the time struct represented by the passed-in string - -""" -import time -import locale -import calendar -from re import compile as re_compile -from re import IGNORECASE -from re import escape as re_escape -from datetime import date as datetime_date - -# Python 2 vs Python 3 -try: - from thread import allocate_lock as _thread_allocate_lock -except: - try: - from _thread import allocate_lock as _thread_allocate_lock - except: - try: - from dummy_thread import allocate_lock as _thread_allocate_lock - except: - from _dummy_thread import allocate_lock as _thread_allocate_lock - -__all__ = [] - - -def _getlang(): - # Figure out what the current language is set to. - return locale.getlocale(locale.LC_TIME) - - -class LocaleTime(object): - """Stores and handles locale-specific information related to time. - - ATTRIBUTES: - f_weekday -- full weekday names (7-item list) - a_weekday -- abbreviated weekday names (7-item list) - f_month -- full month names (13-item list; dummy value in [0], which - is added by code) - a_month -- abbreviated month names (13-item list, dummy value in - [0], which is added by code) - am_pm -- AM/PM representation (2-item list) - LC_date_time -- format string for date/time representation (string) - LC_date -- format string for date representation (string) - LC_time -- format string for time representation (string) - timezone -- daylight- and non-daylight-savings timezone representation - (2-item list of sets) - lang -- Language used by instance (2-item tuple) - """ - - def __init__(self): - """Set all attributes. - - Order of methods called matters for dependency reasons. - - The locale language is set at the offset and then checked again before - exiting. This is to make sure that the attributes were not set with a - mix of information from more than one locale. This would most likely - happen when using threads where one thread calls a locale-dependent - function while another thread changes the locale while the function in - the other thread is still running. Proper coding would call for - locks to prevent changing the locale while locale-dependent code is - running. The check here is done in case someone does not think about - doing this. - - Only other possible issue is if someone changed the timezone and did - not call tz.tzset . That is an issue for the programmer, though, - since changing the timezone is worthless without that call. - - """ - self.lang = _getlang() - self.__calc_weekday() - self.__calc_month() - self.__calc_am_pm() - self.__calc_timezone() - self.__calc_date_time() - if _getlang() != self.lang: - raise ValueError("locale changed during initialization") - - def __pad(self, seq, front): - # Add '' to seq to either the front (is True), else the back. - seq = list(seq) - if front: - seq.insert(0, '') - else: - seq.append('') - return seq - - def __calc_weekday(self): - # Set self.a_weekday and self.f_weekday using the calendar - # module. - a_weekday = [calendar.day_abbr[i].lower() for i in range(7)] - f_weekday = [calendar.day_name[i].lower() for i in range(7)] - self.a_weekday = a_weekday - self.f_weekday = f_weekday - - def __calc_month(self): - # Set self.f_month and self.a_month using the calendar module. 
- a_month = [calendar.month_abbr[i].lower() for i in range(13)] - f_month = [calendar.month_name[i].lower() for i in range(13)] - self.a_month = a_month - self.f_month = f_month - - def __calc_am_pm(self): - # Set self.am_pm by using time.strftime(). - - # The magic date (1999,3,17,hour,44,55,2,76,0) is not really that - # magical; just happened to have used it everywhere else where a - # static date was needed. - am_pm = [] - for hour in (01, 22): - time_tuple = time.struct_time( - (1999, 3, 17, hour, 44, 55, 2, 76, 0)) - am_pm.append(time.strftime("%p", time_tuple).lower()) - self.am_pm = am_pm - - def __calc_date_time(self): - # Set self.date_time, self.date, & self.time by using - # time.strftime(). - - # Use (1999,3,17,22,44,55,2,76,0) for magic date because the amount of - # overloaded numbers is minimized. The order in which searches for - # values within the format string is very important; it eliminates - # possible ambiguity for what something represents. - time_tuple = time.struct_time((1999, 3, 17, 22, 44, 55, 2, 76, 0)) - date_time = [None, None, None] - date_time[0] = time.strftime("%c", time_tuple).lower() - date_time[1] = time.strftime("%x", time_tuple).lower() - date_time[2] = time.strftime("%X", time_tuple).lower() - replacement_pairs = [('%', '%%'), (self.f_weekday[2], '%A'), - (self.f_month[3], - '%B'), (self.a_weekday[2], '%a'), - (self.a_month[3], '%b'), (self.am_pm[1], '%p'), - ('1999', '%Y'), ('99', '%y'), ('22', '%H'), - ('44', '%M'), ('55', '%S'), ('76', '%j'), - ('17', '%d'), ('03', '%m'), ('3', '%m'), - # '3' needed for when no leading zero. - ('2', '%w'), ('10', '%I')] - replacement_pairs.extend([(tz, "%Z") for tz_values in self.timezone - for tz in tz_values]) - for offset, directive in ((0, '%c'), (1, '%x'), (2, '%X')): - current_format = date_time[offset] - for old, new in replacement_pairs: - # Must deal with possible lack of locale info - # manifesting itself as the empty string (e.g., Swedish's - # lack of AM/PM info) or a platform returning a tuple of empty - # strings (e.g., MacOS 9 having timezone as ('','')). - if old: - current_format = current_format.replace(old, new) - # If %W is used, then Sunday, 2005-01-03 will fall on week 0 since - # 2005-01-03 occurs before the first Monday of the year. Otherwise - # %U is used. - time_tuple = time.struct_time((1999, 1, 3, 1, 1, 1, 6, 3, 0)) - if '00' in time.strftime(directive, time_tuple): - U_W = '%W' - else: - U_W = '%U' - date_time[offset] = current_format.replace('11', U_W) - self.LC_date_time = date_time[0] - self.LC_date = date_time[1] - self.LC_time = date_time[2] - - def __calc_timezone(self): - # Set self.timezone by using time.tzname. - # Do not worry about possibility of time.tzname[0] == timetzname[1] - # and time.daylight; handle that in strptime . - try: - time.tzset() - except AttributeError: - pass - no_saving = frozenset(["utc", "gmt", time.tzname[0].lower()]) - if time.daylight: - has_saving = frozenset([time.tzname[1].lower()]) - else: - has_saving = frozenset() - self.timezone = (no_saving, has_saving) - - -class TimeRE(dict): - """Handle conversion from format directives to regexes.""" - - def __init__(self, locale_time=None): - """Create keys/values. - - Order of execution is important for dependency reasons. 
- - """ - if locale_time: - self.locale_time = locale_time - else: - self.locale_time = LocaleTime() - base = super(TimeRE, self) - base.__init__({ - # The " \d" part of the regex is to make %c from ANSI C work - 'd': r"(?P3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])", - 'f': r"(?P[0-9]{1,9})", - 'H': r"(?P2[0-3]|[0-1]\d|\d)", - 'I': r"(?P1[0-2]|0[1-9]|[1-9])", - 'j': (r"(?P36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|" - r"[1-9]\d|0[1-9]|[1-9])"), - 'm': r"(?P1[0-2]|0[1-9]|[1-9])", - 'M': r"(?P[0-5]\d|\d)", - 'S': r"(?P6[0-1]|[0-5]\d|\d)", - 'U': r"(?P5[0-3]|[0-4]\d|\d)", - 'w': r"(?P[0-6])", - # W is set below by using 'U' - 'y': r"(?P\d\d)", - #XXX: Does 'Y' need to worry about having less or more than - # 4 digits? - 'Y': r"(?P\d\d\d\d)", - 'A': self.__seqToRE(self.locale_time.f_weekday, 'A'), - 'a': self.__seqToRE(self.locale_time.a_weekday, 'a'), - 'B': self.__seqToRE(self.locale_time.f_month[1:], 'B'), - 'b': self.__seqToRE(self.locale_time.a_month[1:], 'b'), - 'p': self.__seqToRE(self.locale_time.am_pm, 'p'), - 'Z': self.__seqToRE([tz for tz_names in self.locale_time.timezone - for tz in tz_names], - 'Z'), - '%': '%'}) - base.__setitem__('W', base.__getitem__('U').replace('U', 'W')) - base.__setitem__('c', self.pattern(self.locale_time.LC_date_time)) - base.__setitem__('x', self.pattern(self.locale_time.LC_date)) - base.__setitem__('X', self.pattern(self.locale_time.LC_time)) - - def __seqToRE(self, to_convert, directive): - """Convert a list to a regex string for matching a directive. - - Want possible matching values to be from longest to shortest. This - prevents the possibility of a match occuring for a value that also - a substring of a larger value that should have matched (e.g., 'abc' - matching when 'abcdef' should have been the match). - - """ - to_convert = sorted(to_convert, key=len, reverse=True) - for value in to_convert: - if value != '': - break - else: - return '' - regex = '|'.join([re_escape(stuff) for stuff in to_convert]) - regex = '(?P<%s>%s' % (directive, regex) - return '%s)' % regex - - def pattern(self, format): - """Return regex pattern for the format string. - - Need to make sure that any characters that might be interpreted as - regex syntax are escaped. - - """ - processed_format = '' - # The sub() call escapes all characters that might be misconstrued - # as regex syntax. Cannot use re.escape since we have to deal with - # format directives (%m, etc.). - regex_chars = re_compile(r"([\\.^$*+?\(\){}\[\]|])") - format = regex_chars.sub(r"\\\1", format) - whitespace_replacement = re_compile(r'\s+') - format = whitespace_replacement.sub(r'\\s+', format) - while '%' in format: - directive_index = format.index('%') +1 - processed_format = "%s%s%s" % (processed_format, - format[:directive_index -1], - self[format[directive_index]]) - format = format[directive_index +1:] - return "%s%s" % (processed_format, format) - - def compile(self, format): - """Return a compiled re object for the format string.""" - return re_compile(self.pattern(format), IGNORECASE) - -_cache_lock = _thread_allocate_lock() -# DO NOT modify _TimeRE_cache or _regex_cache without acquiring the cache lock -# first! 
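The lock-plus-bounded-dict caching idiom used here (and carried over unchanged into the new module below) is worth spelling out; a simplified stand-alone sketch with the same crude eviction policy (names are illustrative):

    import re
    from threading import Lock

    _cache_lock = Lock()
    _regex_cache = {}
    _CACHE_MAX_SIZE = 5

    def get_cached_regex(pattern):
        # Take the lock for every lookup so a concurrent clear() cannot
        # race with the insert; eviction is a wholesale clear, as in tslib.
        with _cache_lock:
            regex = _regex_cache.get(pattern)
            if regex is None:
                if len(_regex_cache) >= _CACHE_MAX_SIZE:
                    _regex_cache.clear()
                regex = re.compile(pattern, re.IGNORECASE)
                _regex_cache[pattern] = regex
            return regex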
-_TimeRE_cache = TimeRE() -_CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache -_regex_cache = {} - -cdef _calc_julian_from_U_or_W(int year, int week_of_year, - int day_of_week, int week_starts_Mon): - """Calculate the Julian day based on the year, week of the year, and day of - the week, with week_start_day representing whether the week of the year - assumes the week starts on Sunday or Monday (6 or 0).""" - - cdef: - int first_weekday, week_0_length, days_to_week - - first_weekday = datetime_date(year, 1, 1).weekday() - # If we are dealing with the %U directive (week starts on Sunday), it's - # easier to just shift the view to Sunday being the first day of the - # week. - if not week_starts_Mon: - first_weekday = (first_weekday + 1) % 7 - day_of_week = (day_of_week + 1) % 7 - # Need to watch out for a week 0 (when the first day of the year is not - # the same as that specified by %U or %W). - week_0_length = (7 - first_weekday) % 7 - if week_of_year == 0: - return 1 + day_of_week - first_weekday - else: - days_to_week = week_0_length + (7 * (week_of_year - 1)) - return 1 + days_to_week + day_of_week - -# def _strptime_time(data_string, format="%a %b %d %H:%M:%S %Y"): -# return _strptime(data_string, format)[0] diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx new file mode 100644 index 00000000000000..20b24d6be9a581 --- /dev/null +++ b/pandas/_libs/tslibs/strptime.pyx @@ -0,0 +1,640 @@ +# -*- coding: utf-8 -*- +# cython: profile=False +"""Strptime-related classes and functions. +""" +import time +import locale +import calendar +import re + + +# Python 2 vs Python 3 +try: + from thread import allocate_lock as _thread_allocate_lock +except: + try: + from _thread import allocate_lock as _thread_allocate_lock + except: + try: + from dummy_thread import allocate_lock as _thread_allocate_lock + except: + from _dummy_thread import allocate_lock as _thread_allocate_lock + + +from cython cimport Py_ssize_t +from cpython cimport PyFloat_Check + +cimport cython + +import numpy as np +cimport numpy as np +from numpy cimport ndarray, int64_t + +from datetime import date as datetime_date +from datetime cimport datetime + +# This is src/datetime.pxd +from datetime cimport ( + PANDAS_FR_ns, + check_dts_bounds, + pandas_datetimestruct, + pandas_datetimestruct_to_datetime) + +from util cimport is_string_object, get_nat + +cdef int64_t NPY_NAT = get_nat() + +cdef set _nat_strings = set(['NaT', 'nat', 'NAT', 'nan', 'NaN', 'NAN']) + + +# TODO: Consolidate with other implementations +cdef inline bint _checknull_with_nat(object val): + """ utility to check if a value is a nat or not """ + return (val is None or + (PyFloat_Check(val) and val != val) or + (isinstance(val, datetime) and not val == val)) + + +def array_strptime(ndarray[object] values, object fmt, + bint exact=True, errors='raise'): + """ + Calculates the datetime structs represented by the passed array of strings + + Parameters + ---------- + values : ndarray of string-like objects + fmt : string-like regex + exact : matches must be exact if True, search if False + coerce : if invalid values found, coerce to NaT + """ + + cdef: + Py_ssize_t i, n = len(values) + pandas_datetimestruct dts + ndarray[int64_t] iresult + int year, month, day, minute, hour, second, weekday, julian, tz + int week_of_year, week_of_year_start + int64_t us, ns + object val, group_key, ampm, found + dict found_key + bint is_raise = errors=='raise' + bint is_ignore = errors=='ignore' + bint is_coerce = errors=='coerce' + + assert 
is_raise or is_ignore or is_coerce + + global _TimeRE_cache, _regex_cache + with _cache_lock: + if _getlang() != _TimeRE_cache.locale_time.lang: + _TimeRE_cache = TimeRE() + _regex_cache.clear() + if len(_regex_cache) > _CACHE_MAX_SIZE: + _regex_cache.clear() + locale_time = _TimeRE_cache.locale_time + format_regex = _regex_cache.get(fmt) + if not format_regex: + try: + format_regex = _TimeRE_cache.compile(fmt) + # KeyError raised when a bad format is found; can be specified as + # \\, in which case it was a stray % but with a space after it + except KeyError, err: + bad_directive = err.args[0] + if bad_directive == "\\": + bad_directive = "%" + del err + raise ValueError("'%s' is a bad directive in format '%s'" % + (bad_directive, fmt)) + # IndexError only occurs when the format string is "%" + except IndexError: + raise ValueError("stray %% in format '%s'" % fmt) + _regex_cache[fmt] = format_regex + + result = np.empty(n, dtype='M8[ns]') + iresult = result.view('i8') + + dts.us = dts.ps = dts.as = 0 + + cdef dict _parse_code_table = { + 'y': 0, + 'Y': 1, + 'm': 2, + 'B': 3, + 'b': 4, + 'd': 5, + 'H': 6, + 'I': 7, + 'M': 8, + 'S': 9, + 'f': 10, + 'A': 11, + 'a': 12, + 'w': 13, + 'j': 14, + 'U': 15, + 'W': 16, + 'Z': 17, + 'p': 18 # just an additional key, works only with I + } + cdef int parse_code + + for i in range(n): + val = values[i] + if is_string_object(val): + if val in _nat_strings: + iresult[i] = NPY_NAT + continue + else: + if _checknull_with_nat(val): + iresult[i] = NPY_NAT + continue + else: + val = str(val) + + # exact matching + if exact: + found = format_regex.match(val) + if not found: + if is_coerce: + iresult[i] = NPY_NAT + continue + raise ValueError("time data %r does not match " + "format %r (match)" % (values[i], fmt)) + if len(val) != found.end(): + if is_coerce: + iresult[i] = NPY_NAT + continue + raise ValueError("unconverted data remains: %s" % + values[i][found.end():]) + + # search + else: + found = format_regex.search(val) + if not found: + if is_coerce: + iresult[i] = NPY_NAT + continue + raise ValueError("time data %r does not match format " + "%r (search)" % (values[i], fmt)) + + year = 1900 + month = day = 1 + hour = minute = second = ns = us = 0 + tz = -1 + # Default to -1 to signify that values not known; not critical to have, + # though + week_of_year = -1 + week_of_year_start = -1 + # weekday and julian defaulted to -1 so as to signal need to calculate + # values + weekday = julian = -1 + found_dict = found.groupdict() + for group_key in found_dict.iterkeys(): + # Directives not explicitly handled below: + # c, x, X + # handled by making out of other directives + # U, W + # worthless without day of the week + parse_code = _parse_code_table[group_key] + + if parse_code == 0: + year = int(found_dict['y']) + # Open Group specification for strptime() states that a %y + #value in the range of [00, 68] is in the century 2000, while + #[69,99] is in the century 1900 + if year <= 68: + year += 2000 + else: + year += 1900 + elif parse_code == 1: + year = int(found_dict['Y']) + elif parse_code == 2: + month = int(found_dict['m']) + elif parse_code == 3: + # elif group_key == 'B': + month = locale_time.f_month.index(found_dict['B'].lower()) + elif parse_code == 4: + # elif group_key == 'b': + month = locale_time.a_month.index(found_dict['b'].lower()) + elif parse_code == 5: + # elif group_key == 'd': + day = int(found_dict['d']) + elif parse_code == 6: + # elif group_key == 'H': + hour = int(found_dict['H']) + elif parse_code == 7: + hour = int(found_dict['I']) 
+ ampm = found_dict.get('p', '').lower() + # If there was no AM/PM indicator, we'll treat this like AM + if ampm in ('', locale_time.am_pm[0]): + # We're in AM so the hour is correct unless we're + # looking at 12 midnight. + # 12 midnight == 12 AM == hour 0 + if hour == 12: + hour = 0 + elif ampm == locale_time.am_pm[1]: + # We're in PM so we need to add 12 to the hour unless + # we're looking at 12 noon. + # 12 noon == 12 PM == hour 12 + if hour != 12: + hour += 12 + elif parse_code == 8: + minute = int(found_dict['M']) + elif parse_code == 9: + second = int(found_dict['S']) + elif parse_code == 10: + s = found_dict['f'] + # Pad to always return nanoseconds + s += "0" * (9 - len(s)) + us = long(s) + ns = us % 1000 + us = us / 1000 + elif parse_code == 11: + weekday = locale_time.f_weekday.index(found_dict['A'].lower()) + elif parse_code == 12: + weekday = locale_time.a_weekday.index(found_dict['a'].lower()) + elif parse_code == 13: + weekday = int(found_dict['w']) + if weekday == 0: + weekday = 6 + else: + weekday -= 1 + elif parse_code == 14: + julian = int(found_dict['j']) + elif parse_code == 15 or parse_code == 16: + week_of_year = int(found_dict[group_key]) + if group_key == 'U': + # U starts week on Sunday. + week_of_year_start = 6 + else: + # W starts week on Monday. + week_of_year_start = 0 + elif parse_code == 17: + # Since -1 is default value only need to worry about setting tz + # if it can be something other than -1. + found_zone = found_dict['Z'].lower() + for value, tz_values in enumerate(locale_time.timezone): + if found_zone in tz_values: + # Deal w/ bad locale setup where timezone names are the + # same and yet time.daylight is true; too ambiguous to + # be able to tell what timezone has daylight savings + if (time.tzname[0] == time.tzname[1] and + time.daylight and found_zone not in ( + "utc", "gmt")): + break + else: + tz = value + break + # If we know the wk of the year and what day of that wk, we can figure + # out the Julian day of the year. + if julian == -1 and week_of_year != -1 and weekday != -1: + week_starts_Mon = True if week_of_year_start == 0 else False + julian = _calc_julian_from_U_or_W(year, week_of_year, weekday, + week_starts_Mon) + # Cannot pre-calculate datetime_date() since can change in Julian + # calculation and thus could have different value for the day of the wk + # calculation. + try: + if julian == -1: + # Need to add 1 to result since first day of the year is 1, not + # 0. + julian = datetime_date(year, month, day).toordinal() - \ + datetime_date(year, 1, 1).toordinal() + 1 + else: # Assume that if they bothered to include Julian day it will + # be accurate. 
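(The ``else`` branch of this ``try`` block continues just below.) The ordinal round-trip it relies on is easy to verify with the standard library:

    from datetime import date

    year, month, day = 1999, 3, 17
    julian = (date(year, month, day).toordinal() -
              date(year, 1, 1).toordinal() + 1)
    print(julian)  # 76 -- 1-based day of the year

    # and back again, as in the %j branch:
    print(date.fromordinal(julian - 1 + date(year, 1, 1).toordinal()))
    # 1999-03-17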
+ datetime_result = datetime_date.fromordinal( + (julian - 1) + datetime_date(year, 1, 1).toordinal()) + year = datetime_result.year + month = datetime_result.month + day = datetime_result.day + except ValueError: + if is_coerce: + iresult[i] = NPY_NAT + continue + raise + if weekday == -1: + weekday = datetime_date(year, month, day).weekday() + + dts.year = year + dts.month = month + dts.day = day + dts.hour = hour + dts.min = minute + dts.sec = second + dts.us = us + dts.ps = ns * 1000 + + iresult[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) + if check_dts_bounds(&dts): + if is_coerce: + iresult[i] = NPY_NAT + continue + else: + from pandas._libs.tslib import OutOfBoundsDatetime + fmt = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (dts.year, dts.month, + dts.day, dts.hour, + dts.min, dts.sec) + raise OutOfBoundsDatetime( + 'Out of bounds nanosecond timestamp: %s' % fmt) + + return result + + +"""_getlang, LocaleTime, TimeRE, _calc_julian_from_U_or_W are vendored +from the standard library, see +https://github.com/python/cpython/blob/master/Lib/_strptime.py +The original module-level docstring follows. + +Strptime-related classes and functions. +CLASSES: + LocaleTime -- Discovers and stores locale-specific time information + TimeRE -- Creates regexes for pattern matching a string of text containing + time information +FUNCTIONS: + _getlang -- Figure out what language is being used for the locale + strptime -- Calculates the time struct represented by the passed-in string +""" + + +def _getlang(): + """Figure out what language is being used for the locale""" + return locale.getlocale(locale.LC_TIME) + + +class LocaleTime(object): + """Stores and handles locale-specific information related to time. + + ATTRIBUTES: + f_weekday -- full weekday names (7-item list) + a_weekday -- abbreviated weekday names (7-item list) + f_month -- full month names (13-item list; dummy value in [0], which + is added by code) + a_month -- abbreviated month names (13-item list, dummy value in + [0], which is added by code) + am_pm -- AM/PM representation (2-item list) + LC_date_time -- format string for date/time representation (string) + LC_date -- format string for date representation (string) + LC_time -- format string for time representation (string) + timezone -- daylight- and non-daylight-savings timezone representation + (2-item list of sets) + lang -- Language used by instance (2-item tuple) + """ + + def __init__(self): + """Set all attributes. + + Order of methods called matters for dependency reasons. + + The locale language is set at the offset and then checked again before + exiting. This is to make sure that the attributes were not set with a + mix of information from more than one locale. This would most likely + happen when using threads where one thread calls a locale-dependent + function while another thread changes the locale while the function in + the other thread is still running. Proper coding would call for + locks to prevent changing the locale while locale-dependent code is + running. The check here is done in case someone does not think about + doing this. + + Only other possible issue is if someone changed the timezone and did + not call tz.tzset . That is an issue for the programmer, though, + since changing the timezone is worthless without that call. 
+ + """ + self.lang = _getlang() + self.__calc_weekday() + self.__calc_month() + self.__calc_am_pm() + self.__calc_timezone() + self.__calc_date_time() + if _getlang() != self.lang: + raise ValueError("locale changed during initialization") + + def __pad(self, seq, front): + # Add '' to seq to either the front (is True), else the back. + seq = list(seq) + if front: + seq.insert(0, '') + else: + seq.append('') + return seq + + def __calc_weekday(self): + # Set self.a_weekday and self.f_weekday using the calendar + # module. + a_weekday = [calendar.day_abbr[i].lower() for i in range(7)] + f_weekday = [calendar.day_name[i].lower() for i in range(7)] + self.a_weekday = a_weekday + self.f_weekday = f_weekday + + def __calc_month(self): + # Set self.f_month and self.a_month using the calendar module. + a_month = [calendar.month_abbr[i].lower() for i in range(13)] + f_month = [calendar.month_name[i].lower() for i in range(13)] + self.a_month = a_month + self.f_month = f_month + + def __calc_am_pm(self): + # Set self.am_pm by using time.strftime(). + + # The magic date (1999,3,17,hour,44,55,2,76,0) is not really that + # magical; just happened to have used it everywhere else where a + # static date was needed. + am_pm = [] + for hour in (01, 22): + time_tuple = time.struct_time( + (1999, 3, 17, hour, 44, 55, 2, 76, 0)) + am_pm.append(time.strftime("%p", time_tuple).lower()) + self.am_pm = am_pm + + def __calc_date_time(self): + # Set self.date_time, self.date, & self.time by using + # time.strftime(). + + # Use (1999,3,17,22,44,55,2,76,0) for magic date because the amount of + # overloaded numbers is minimized. The order in which searches for + # values within the format string is very important; it eliminates + # possible ambiguity for what something represents. + time_tuple = time.struct_time((1999, 3, 17, 22, 44, 55, 2, 76, 0)) + date_time = [None, None, None] + date_time[0] = time.strftime("%c", time_tuple).lower() + date_time[1] = time.strftime("%x", time_tuple).lower() + date_time[2] = time.strftime("%X", time_tuple).lower() + replacement_pairs = [('%', '%%'), (self.f_weekday[2], '%A'), + (self.f_month[3], + '%B'), (self.a_weekday[2], '%a'), + (self.a_month[3], '%b'), (self.am_pm[1], '%p'), + ('1999', '%Y'), ('99', '%y'), ('22', '%H'), + ('44', '%M'), ('55', '%S'), ('76', '%j'), + ('17', '%d'), ('03', '%m'), ('3', '%m'), + # '3' needed for when no leading zero. + ('2', '%w'), ('10', '%I')] + replacement_pairs.extend([(tz, "%Z") for tz_values in self.timezone + for tz in tz_values]) + for offset, directive in ((0, '%c'), (1, '%x'), (2, '%X')): + current_format = date_time[offset] + for old, new in replacement_pairs: + # Must deal with possible lack of locale info + # manifesting itself as the empty string (e.g., Swedish's + # lack of AM/PM info) or a platform returning a tuple of empty + # strings (e.g., MacOS 9 having timezone as ('','')). + if old: + current_format = current_format.replace(old, new) + # If %W is used, then Sunday, 2005-01-03 will fall on week 0 since + # 2005-01-03 occurs before the first Monday of the year. Otherwise + # %U is used. + time_tuple = time.struct_time((1999, 1, 3, 1, 1, 1, 6, 3, 0)) + if '00' in time.strftime(directive, time_tuple): + U_W = '%W' + else: + U_W = '%U' + date_time[offset] = current_format.replace('11', U_W) + self.LC_date_time = date_time[0] + self.LC_date = date_time[1] + self.LC_time = date_time[2] + + def __calc_timezone(self): + # Set self.timezone by using time.tzname. 
+ # Do not worry about possibility of time.tzname[0] == timetzname[1] + # and time.daylight; handle that in strptime . + try: + time.tzset() + except AttributeError: + pass + no_saving = frozenset(["utc", "gmt", time.tzname[0].lower()]) + if time.daylight: + has_saving = frozenset([time.tzname[1].lower()]) + else: + has_saving = frozenset() + self.timezone = (no_saving, has_saving) + + +class TimeRE(dict): + """ + Handle conversion from format directives to regexes. + + Creates regexes for pattern matching a string of text containing + time information + """ + + def __init__(self, locale_time=None): + """Create keys/values. + + Order of execution is important for dependency reasons. + + """ + if locale_time: + self.locale_time = locale_time + else: + self.locale_time = LocaleTime() + base = super(TimeRE, self) + base.__init__({ + # The " \d" part of the regex is to make %c from ANSI C work + 'd': r"(?P3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])", + 'f': r"(?P[0-9]{1,9})", + 'H': r"(?P2[0-3]|[0-1]\d|\d)", + 'I': r"(?P1[0-2]|0[1-9]|[1-9])", + 'j': (r"(?P36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|" + r"[1-9]\d|0[1-9]|[1-9])"), + 'm': r"(?P1[0-2]|0[1-9]|[1-9])", + 'M': r"(?P[0-5]\d|\d)", + 'S': r"(?P6[0-1]|[0-5]\d|\d)", + 'U': r"(?P5[0-3]|[0-4]\d|\d)", + 'w': r"(?P[0-6])", + # W is set below by using 'U' + 'y': r"(?P\d\d)", + #XXX: Does 'Y' need to worry about having less or more than + # 4 digits? + 'Y': r"(?P\d\d\d\d)", + 'A': self.__seqToRE(self.locale_time.f_weekday, 'A'), + 'a': self.__seqToRE(self.locale_time.a_weekday, 'a'), + 'B': self.__seqToRE(self.locale_time.f_month[1:], 'B'), + 'b': self.__seqToRE(self.locale_time.a_month[1:], 'b'), + 'p': self.__seqToRE(self.locale_time.am_pm, 'p'), + 'Z': self.__seqToRE([tz for tz_names in self.locale_time.timezone + for tz in tz_names], + 'Z'), + '%': '%'}) + base.__setitem__('W', base.__getitem__('U').replace('U', 'W')) + base.__setitem__('c', self.pattern(self.locale_time.LC_date_time)) + base.__setitem__('x', self.pattern(self.locale_time.LC_date)) + base.__setitem__('X', self.pattern(self.locale_time.LC_time)) + + def __seqToRE(self, to_convert, directive): + """Convert a list to a regex string for matching a directive. + + Want possible matching values to be from longest to shortest. This + prevents the possibility of a match occuring for a value that also + a substring of a larger value that should have matched (e.g., 'abc' + matching when 'abcdef' should have been the match). + + """ + to_convert = sorted(to_convert, key=len, reverse=True) + for value in to_convert: + if value != '': + break + else: + return '' + regex = '|'.join([re.escape(stuff) for stuff in to_convert]) + regex = '(?P<%s>%s' % (directive, regex) + return '%s)' % regex + + def pattern(self, format): + """Return regex pattern for the format string. + + Need to make sure that any characters that might be interpreted as + regex syntax are escaped. + + """ + processed_format = '' + # The sub() call escapes all characters that might be misconstrued + # as regex syntax. Cannot use re.escape since we have to deal with + # format directives (%m, etc.). 
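(The escaping code itself follows below.) The net effect of ``pattern()`` can be seen via the stdlib's private ``_strptime`` module, which carries the same vendored logic (for illustration only; output abridged):

    import _strptime

    time_re = _strptime.TimeRE()
    print(time_re.pattern('%Y-%m-%d'))
    # roughly: (?P<Y>\d\d\d\d)-(?P<m>1[0-2]|0[1-9]|[1-9])-(?P<d>3[0-1]|...)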
+ regex_chars = re.compile(r"([\\.^$*+?\(\){}\[\]|])") + format = regex_chars.sub(r"\\\1", format) + whitespace_replacement = re.compile(r'\s+') + format = whitespace_replacement.sub(r'\\s+', format) + while '%' in format: + directive_index = format.index('%') +1 + processed_format = "%s%s%s" % (processed_format, + format[:directive_index -1], + self[format[directive_index]]) + format = format[directive_index +1:] + return "%s%s" % (processed_format, format) + + def compile(self, format): + """Return a compiled re object for the format string.""" + return re.compile(self.pattern(format), re.IGNORECASE) + + +_cache_lock = _thread_allocate_lock() +# DO NOT modify _TimeRE_cache or _regex_cache without acquiring the cache lock +# first! +_TimeRE_cache = TimeRE() +_CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache +_regex_cache = {} + + +cdef _calc_julian_from_U_or_W(int year, int week_of_year, + int day_of_week, int week_starts_Mon): + """Calculate the Julian day based on the year, week of the year, and day of + the week, with week_start_day representing whether the week of the year + assumes the week starts on Sunday or Monday (6 or 0).""" + + cdef: + int first_weekday, week_0_length, days_to_week + + first_weekday = datetime_date(year, 1, 1).weekday() + # If we are dealing with the %U directive (week starts on Sunday), it's + # easier to just shift the view to Sunday being the first day of the + # week. + if not week_starts_Mon: + first_weekday = (first_weekday + 1) % 7 + day_of_week = (day_of_week + 1) % 7 + + # Need to watch out for a week 0 (when the first day of the year is not + # the same as that specified by %U or %W). + week_0_length = (7 - first_weekday) % 7 + if week_of_year == 0: + return 1 + day_of_week - first_weekday + else: + days_to_week = week_0_length + (7 * (week_of_year - 1)) + return 1 + days_to_week + day_of_week diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 95fe3ab83c2abf..bf89509fd17467 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -3,6 +3,7 @@ from collections import MutableMapping from pandas._libs import lib, tslib +from pandas._libs.tslibs.strptime import array_strptime from pandas._libs.tslibs.timezones import get_timezone from pandas.core.dtypes.common import ( @@ -416,8 +417,8 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): # fallback if result is None: try: - result = tslib.array_strptime(arg, format, exact=exact, - errors=errors) + result = array_strptime(arg, format, exact=exact, + errors=errors) except tslib.OutOfBoundsDatetime: if errors == 'raise': raise diff --git a/setup.py b/setup.py index 555cf9dc4a9b39..25a4924dad0bc8 100755 --- a/setup.py +++ b/setup.py @@ -471,7 +471,6 @@ def pxd(name): 'pandas/_libs/src/datetime/np_datetime_strings.h', 'pandas/_libs/src/datetime.pxd'] - # some linux distros require it libraries = ['m'] if not is_platform_windows() else [] @@ -483,6 +482,10 @@ def pxd(name): 'pxdfiles': ['_libs/hashtable'], 'depends': (['pandas/_libs/src/klib/khash_python.h'] + _pxi_dep['hashtable'])}, + '_libs.tslibs.strptime': {'pyxfile': '_libs/tslibs/strptime', + 'depends': tseries_depends, + 'sources': ['pandas/_libs/src/datetime/np_datetime.c', + 'pandas/_libs/src/datetime/np_datetime_strings.c']}, '_libs.tslib': {'pyxfile': '_libs/tslib', 'pxdfiles': ['_libs/src/util', '_libs/lib'], 'depends': tseries_depends, From 4c9e98d1ded9660812e62c4a4ecadeaf9d3f0e6b Mon Sep 17 00:00:00 2001 From: cbertinato Date: Mon, 25 Sep 2017 06:10:05 -0400 
Subject: [PATCH 147/188] CLN: replace %s syntax with .format in io (#17660) Progress toward issue #16130. Converted old string formatting to new string formatting in io/html.py, io/excel.py, msgpack/_packer.pyx, msgpack/_unpacker.pyx, clipboard/exceptions.py, json/json.py, json/normalize.py, sas/sas.pyx --- pandas/io/clipboard/exceptions.py | 2 +- pandas/io/excel.py | 72 ++++++++++++++++++------------- pandas/io/html.py | 46 +++++++++++--------- pandas/io/json/json.py | 23 +++++----- pandas/io/json/normalize.py | 7 +-- pandas/io/msgpack/_packer.pyx | 2 +- pandas/io/msgpack/_unpacker.pyx | 7 +-- pandas/io/sas/sas.pyx | 16 ++++--- 8 files changed, 101 insertions(+), 74 deletions(-) diff --git a/pandas/io/clipboard/exceptions.py b/pandas/io/clipboard/exceptions.py index 413518e53660af..d948ad414327ca 100644 --- a/pandas/io/clipboard/exceptions.py +++ b/pandas/io/clipboard/exceptions.py @@ -8,5 +8,5 @@ class PyperclipException(RuntimeError): class PyperclipWindowsException(PyperclipException): def __init__(self, message): - message += " (%s)" % ctypes.WinError() + message += " ({err})".format(err=ctypes.WinError()) super(PyperclipWindowsException, self).__init__(message) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index faafdba435ff21..afecd76c498efa 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -165,7 +165,7 @@ def register_writer(klass): if ext.startswith('.'): ext = ext[1:] if ext not in _writer_extensions: - config.register_option("io.excel.%s.writer" % ext, + config.register_option("io.excel.{ext}.writer".format(ext=ext), engine_name, validator=str) _writer_extensions.append(ext) @@ -190,7 +190,8 @@ def get_writer(engine_name): try: return _writers[engine_name] except KeyError: - raise ValueError("No Excel writer '%s'" % engine_name) + raise ValueError("No Excel writer '{engine}'" + .format(engine=engine_name)) @Appender(_read_excel_doc) @@ -259,7 +260,7 @@ def __init__(self, io, **kwds): engine = kwds.pop('engine', None) if engine is not None and engine != 'xlrd': - raise ValueError("Unknown engine: %s" % engine) + raise ValueError("Unknown engine: {engine}".format(engine=engine)) # If io is a url, want to keep the data as bytes so can't pass # to get_filepath_or_buffer() @@ -445,7 +446,7 @@ def _parse_cell(cell_contents, cell_typ): for asheetname in sheets: if verbose: - print("Reading sheet %s" % asheetname) + print("Reading sheet {sheet}".format(sheet=asheetname)) if isinstance(asheetname, compat.string_types): sheet = self.book.sheet_by_name(asheetname) @@ -634,7 +635,7 @@ def _conv_value(val): elif is_bool(val): val = bool(val) elif isinstance(val, Period): - val = "%s" % val + val = "{val}".format(val=val) elif is_list_like(val): val = str(val) @@ -697,9 +698,11 @@ def __new__(cls, path, engine=None, **kwargs): ext = 'xlsx' try: - engine = config.get_option('io.excel.%s.writer' % ext) + engine = config.get_option('io.excel.{ext}.writer' + .format(ext=ext)) except KeyError: - error = ValueError("No engine for filetype: '%s'" % ext) + error = ValueError("No engine for filetype: '{ext}'" + .format(ext=ext)) raise error cls = get_writer(engine) @@ -787,8 +790,9 @@ def check_extension(cls, ext): if ext.startswith('.'): ext = ext[1:] if not any(ext in extension for extension in cls.supported_extensions): - msg = (u("Invalid extension for engine '%s': '%s'") % - (pprint_thing(cls.engine), pprint_thing(ext))) + msg = (u("Invalid extension for engine '{engine}': '{ext}'") + .format(engine=pprint_thing(cls.engine), + ext=pprint_thing(ext))) raise ValueError(msg) else: 
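+            # the extension matched one of this engine's supported
+            # extensions, so the writer can handle this file type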
return True @@ -813,8 +817,8 @@ class _Openpyxl1Writer(ExcelWriter): def __init__(self, path, engine=None, **engine_kwargs): if not openpyxl_compat.is_compat(major_ver=self.openpyxl_majorver): raise ValueError('Installed openpyxl is not supported at this ' - 'time. Use {0}.x.y.' - .format(self.openpyxl_majorver)) + 'time. Use {majorver}.x.y.' + .format(majorver=self.openpyxl_majorver)) # Use the openpyxl module as the Excel writer. from openpyxl.workbook import Workbook @@ -854,7 +858,8 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, for cell in cells: colletter = get_column_letter(startcol + cell.col + 1) - xcell = wks.cell("%s%s" % (colletter, startrow + cell.row + 1)) + xcell = wks.cell("{col}{row}".format(col=colletter, + row=startrow + cell.row + 1)) if (isinstance(cell.val, compat.string_types) and xcell.data_type_for_value(cell.val) != xcell.TYPE_STRING): xcell.set_value_explicit(cell.val) @@ -876,10 +881,12 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, cletterstart = get_column_letter(startcol + cell.col + 1) cletterend = get_column_letter(startcol + cell.mergeend + 1) - wks.merge_cells('%s%s:%s%s' % (cletterstart, - startrow + cell.row + 1, - cletterend, - startrow + cell.mergestart + 1)) + wks.merge_cells('{start}{row}:{end}{mergestart}' + .format(start=cletterstart, + row=startrow + cell.row + 1, + end=cletterend, + mergestart=startrow + + cell.mergestart + 1)) # Excel requires that the format of the first cell in a merged # range is repeated in the rest of the merged range. @@ -895,7 +902,8 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, # Ignore first cell. It is already handled. continue colletter = get_column_letter(col) - xcell = wks.cell("%s%s" % (colletter, row)) + xcell = wks.cell("{col}{row}" + .format(col=colletter, row=row)) for field in style.__fields__: xcell.style.__setattr__( field, style.__getattribute__(field)) @@ -955,7 +963,8 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, for cell in cells: colletter = get_column_letter(startcol + cell.col + 1) - xcell = wks["%s%s" % (colletter, startrow + cell.row + 1)] + xcell = wks["{col}{row}" + .format(col=colletter, row=startrow + cell.row + 1)] xcell.value = _conv_value(cell.val) style_kwargs = {} @@ -977,10 +986,12 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, cletterstart = get_column_letter(startcol + cell.col + 1) cletterend = get_column_letter(startcol + cell.mergeend + 1) - wks.merge_cells('%s%s:%s%s' % (cletterstart, - startrow + cell.row + 1, - cletterend, - startrow + cell.mergestart + 1)) + wks.merge_cells('{start}{row}:{end}{mergestart}' + .format(start=cletterstart, + row=startrow + cell.row + 1, + end=cletterend, + mergestart=startrow + + cell.mergestart + 1)) # Excel requires that the format of the first cell in a merged # range is repeated in the rest of the merged range. @@ -996,7 +1007,8 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, # Ignore first cell. It is already handled. 
continue colletter = get_column_letter(col) - xcell = wks["%s%s" % (colletter, row)] + xcell = wks["{col}{row}" + .format(col=colletter, row=row)] xcell.style = xcell.style.copy(**style_kwargs) @classmethod @@ -1030,7 +1042,7 @@ def _convert_to_style_kwargs(cls, style_dict): for k, v in style_dict.items(): if k in _style_key_map: k = _style_key_map[k] - _conv_to_x = getattr(cls, '_convert_to_{0}'.format(k), + _conv_to_x = getattr(cls, '_convert_to_{k}'.format(k=k), lambda x: None) new_v = _conv_to_x(v) if new_v: @@ -1505,17 +1517,19 @@ def _style_to_xlwt(cls, item, firstlevel=True, field_sep=',', """ if hasattr(item, 'items'): if firstlevel: - it = ["%s: %s" % (key, cls._style_to_xlwt(value, False)) + it = ["{key}: {val}" + .format(key=key, val=cls._style_to_xlwt(value, False)) for key, value in item.items()] - out = "%s " % (line_sep).join(it) + out = "{sep} ".format(sep=(line_sep).join(it)) return out else: - it = ["%s %s" % (key, cls._style_to_xlwt(value, False)) + it = ["{key} {val}" + .format(key=key, val=cls._style_to_xlwt(value, False)) for key, value in item.items()] - out = "%s " % (field_sep).join(it) + out = "{sep} ".format(sep=(field_sep).join(it)) return out else: - item = "%s" % item + item = "{item}".format(item=item) item = item.replace("True", "on") item = item.replace("False", "off") return item diff --git a/pandas/io/html.py b/pandas/io/html.py index a4acb26af52590..b5aaffcf710c29 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -439,14 +439,15 @@ def _parse_tables(self, doc, match, attrs): unique_tables.add(table) if not result: - raise ValueError("No tables found matching pattern %r" % - match.pattern) + raise ValueError("No tables found matching pattern {patt!r}" + .format(patt=match.pattern)) return result def _setup_build_doc(self): raw_text = _read(self.io) if not raw_text: - raise ValueError('No text parsed from document: %s' % self.io) + raise ValueError('No text parsed from document: {doc}' + .format(doc=self.io)) return raw_text def _build_doc(self): @@ -473,8 +474,8 @@ def _build_xpath_expr(attrs): if 'class_' in attrs: attrs['class'] = attrs.pop('class_') - s = [u("@%s=%r") % (k, v) for k, v in iteritems(attrs)] - return u('[%s]') % ' and '.join(s) + s = [u("@{key}={val!r}").format(key=k, val=v) for k, v in iteritems(attrs)] + return u('[{expr}]').format(expr=' and '.join(s)) _re_namespace = {'re': 'http://exslt.org/regular-expressions'} @@ -517,8 +518,8 @@ def _parse_tables(self, doc, match, kwargs): # 1. check all descendants for the given pattern and only search tables # 2. 
go up the tree until we find a table - query = '//table//*[re:test(text(), %r)]/ancestor::table' - xpath_expr = u(query) % pattern + query = '//table//*[re:test(text(), {patt!r})]/ancestor::table' + xpath_expr = u(query).format(patt=pattern) # if any table attributes were given build an xpath expression to # search for them @@ -528,7 +529,8 @@ def _parse_tables(self, doc, match, kwargs): tables = doc.xpath(xpath_expr, namespaces=_re_namespace) if not tables: - raise ValueError("No tables found matching regex %r" % pattern) + raise ValueError("No tables found matching regex {patt!r}" + .format(patt=pattern)) return tables def _build_doc(self): @@ -574,8 +576,9 @@ def _build_doc(self): scheme = parse_url(self.io).scheme if scheme not in _valid_schemes: # lxml can't parse it - msg = ('%r is not a valid url scheme, valid schemes are ' - '%s') % (scheme, _valid_schemes) + msg = (('{invalid!r} is not a valid url scheme, valid ' + 'schemes are {valid}') + .format(invalid=scheme, valid=_valid_schemes)) raise ValueError(msg) else: # something else happened: maybe a faulty connection @@ -670,8 +673,9 @@ def _parser_dispatch(flavor): """ valid_parsers = list(_valid_parsers.keys()) if flavor not in valid_parsers: - raise ValueError('%r is not a valid flavor, valid flavors are %s' % - (flavor, valid_parsers)) + raise ValueError('{invalid!r} is not a valid flavor, valid flavors ' + 'are {valid}' + .format(invalid=flavor, valid=valid_parsers)) if flavor in ('bs4', 'html5lib'): if not _HAS_HTML5LIB: @@ -695,7 +699,7 @@ def _parser_dispatch(flavor): def _print_as_set(s): - return '{%s}' % ', '.join([pprint_thing(el) for el in s]) + return '{{arg}}'.format(arg=', '.join([pprint_thing(el) for el in s])) def _validate_flavor(flavor): @@ -705,21 +709,23 @@ def _validate_flavor(flavor): flavor = flavor, elif isinstance(flavor, collections.Iterable): if not all(isinstance(flav, string_types) for flav in flavor): - raise TypeError('Object of type %r is not an iterable of strings' % - type(flavor).__name__) + raise TypeError('Object of type {typ!r} is not an iterable of ' + 'strings' + .format(typ=type(flavor).__name__)) else: - fmt = '{0!r}' if isinstance(flavor, string_types) else '{0}' + fmt = '{flavor!r}' if isinstance(flavor, string_types) else '{flavor}' fmt += ' is not a valid flavor' - raise ValueError(fmt.format(flavor)) + raise ValueError(fmt.format(flavor=flavor)) flavor = tuple(flavor) valid_flavors = set(_valid_parsers) flavor_set = set(flavor) if not flavor_set & valid_flavors: - raise ValueError('%s is not a valid set of flavors, valid flavors are ' - '%s' % (_print_as_set(flavor_set), - _print_as_set(valid_flavors))) + raise ValueError('{invalid} is not a valid set of flavors, valid ' + 'flavors are {valid}' + .format(invalid=_print_as_set(flavor_set), + valid=_print_as_set(valid_flavors))) return flavor diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index a1d48719ba9c0f..5dae6099446d0f 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -99,7 +99,7 @@ class SeriesWriter(Writer): def _format_axes(self): if not self.obj.index.is_unique and self.orient == 'index': raise ValueError("Series index must be unique for orient=" - "'%s'" % self.orient) + "'{orient}'".format(orient=self.orient)) class FrameWriter(Writer): @@ -110,11 +110,11 @@ def _format_axes(self): if not self.obj.index.is_unique and self.orient in ( 'index', 'columns'): raise ValueError("DataFrame index must be unique for orient=" - "'%s'." 
% self.orient) + "'{orient}'.".format(orient=self.orient)) if not self.obj.columns.is_unique and self.orient in ( 'index', 'columns', 'records'): raise ValueError("DataFrame columns must be unique for orient=" - "'%s'." % self.orient) + "'{orient}'.".format(orient=self.orient)) class JSONTableWriter(FrameWriter): @@ -134,8 +134,9 @@ def __init__(self, obj, orient, date_format, double_precision, if date_format != 'iso': msg = ("Trying to write with `orient='table'` and " - "`date_format='%s'`. Table Schema requires dates " - "to be formatted with `date_format='iso'`" % date_format) + "`date_format='{fmt}'`. Table Schema requires dates " + "to be formatted with `date_format='iso'`" + .format(fmt=date_format)) raise ValueError(msg) self.schema = build_table_schema(obj) @@ -166,8 +167,8 @@ def __init__(self, obj, orient, date_format, double_precision, def write(self): data = super(JSONTableWriter, self).write() - serialized = '{{"schema": {}, "data": {}}}'.format( - dumps(self.schema), data) + serialized = '{{"schema": {schema}, "data": {data}}}'.format( + schema=dumps(self.schema), data=data) return serialized @@ -391,8 +392,8 @@ def __init__(self, json, orient, dtype=True, convert_axes=True, if date_unit is not None: date_unit = date_unit.lower() if date_unit not in self._STAMP_UNITS: - raise ValueError('date_unit must be one of %s' % - (self._STAMP_UNITS,)) + raise ValueError('date_unit must be one of {units}' + .format(units=self._STAMP_UNITS)) self.min_stamp = self._MIN_STAMPS[date_unit] else: self.min_stamp = self._MIN_STAMPS['s'] @@ -410,8 +411,8 @@ def check_keys_split(self, decoded): bad_keys = set(decoded.keys()).difference(set(self._split_keys)) if bad_keys: bad_keys = ", ".join(bad_keys) - raise ValueError(u("JSON data had unexpected key(s): %s") % - pprint_thing(bad_keys)) + raise ValueError(u("JSON data had unexpected key(s): {bad_keys}") + .format(bad_keys=pprint_thing(bad_keys))) def parse(self): diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index 72776ed01de15e..e811dd1eab1420 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -249,7 +249,8 @@ def _recursive_extract(data, path, seen_meta, level=0): raise \ KeyError("Try running with " "errors='ignore' as key " - "%s is not always present", e) + "{err} is not always present" + .format(err=e)) meta_vals[key].append(meta_val) records.extend(recs) @@ -267,8 +268,8 @@ def _recursive_extract(data, path, seen_meta, level=0): k = meta_prefix + k if k in result: - raise ValueError('Conflicting metadata name %s, ' - 'need distinguishing prefix ' % k) + raise ValueError('Conflicting metadata name {name}, ' + 'need distinguishing prefix '.format(name=k)) result[k] = np.array(v).repeat(lengths) diff --git a/pandas/io/msgpack/_packer.pyx b/pandas/io/msgpack/_packer.pyx index ad7ce1fb2531ae..fd3f4612fb4322 100644 --- a/pandas/io/msgpack/_packer.pyx +++ b/pandas/io/msgpack/_packer.pyx @@ -224,7 +224,7 @@ cdef class Packer(object): default_used = 1 continue else: - raise TypeError("can't serialize %r" % (o,)) + raise TypeError("can't serialize {thing!r}".format(thing=o)) return ret cpdef pack(self, object obj): diff --git a/pandas/io/msgpack/_unpacker.pyx b/pandas/io/msgpack/_unpacker.pyx index 504bfed48df3ca..22401d7514f653 100644 --- a/pandas/io/msgpack/_unpacker.pyx +++ b/pandas/io/msgpack/_unpacker.pyx @@ -94,7 +94,7 @@ cdef inline init_ctx(unpack_context *ctx, def default_read_extended_type(typecode, data): raise NotImplementedError("Cannot decode extended type " - "with typecode=%d" % 
typecode) + "with typecode={code}".format(code=typecode)) def unpackb(object packed, object object_hook=None, object list_hook=None, @@ -144,7 +144,7 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, buf + off, buf_len - off)) return obj else: - raise UnpackValueError("Unpack failed: error = %d" % (ret,)) + raise UnpackValueError("Unpack failed: error = {ret}".format(ret=ret)) def unpack(object stream, object object_hook=None, object list_hook=None, @@ -411,7 +411,8 @@ cdef class Unpacker(object): else: raise OutOfData("No more data to unpack.") else: - raise ValueError("Unpack failed: error = %d" % (ret,)) + raise ValueError("Unpack failed: error = {ret}" + .format(ret=ret)) def read_bytes(self, Py_ssize_t nbytes): """Read a specified number of raw bytes from the stream""" diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 4396180da44cbf..41c03cb2799a3f 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -101,10 +101,12 @@ cdef np.ndarray[uint8_t, ndim=1] rle_decompress( result[rpos] = 0x00 rpos += 1 else: - raise ValueError("unknown control byte: %v", control_byte) + raise ValueError("unknown control byte: {byte}" + .format(byte=control_byte)) if len(result) != result_length: - raise ValueError("RLE: %v != %v", (len(result), result_length)) + raise ValueError("RLE: {got} != {expect}".format(got=len(result), + expect=result_length)) return np.asarray(result) @@ -185,7 +187,8 @@ cdef np.ndarray[uint8_t, ndim=1] rdc_decompress( raise ValueError("unknown RDC command") if len(outbuff) != result_length: - raise ValueError("RDC: %v != %v\n", len(outbuff), result_length) + raise ValueError("RDC: {got} != {expect}\n" + .format(got=len(outbuff), expect=result_length)) return np.asarray(outbuff) @@ -258,7 +261,8 @@ cdef class Parser(object): self.column_types[j] = column_type_string else: raise ValueError("unknown column type: " - "%s" % self.parser.columns[j].ctype) + "{typ}" + .format(typ=self.parser.columns[j].ctype)) # compression if parser.compression == const.rle_compression: @@ -378,8 +382,8 @@ cdef class Parser(object): return True return False else: - raise ValueError("unknown page type: %s", - self.current_page_type) + raise ValueError("unknown page type: {typ}" + .format(typ=self.current_page_type)) cdef void process_byte_array_with_data(self, int offset, int length): From 868389d3dbdde71df4244e53cdd79a94201db093 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 25 Sep 2017 06:14:47 -0400 Subject: [PATCH 148/188] DEPR: deprecate .as_blocks() (#17656) closes #17302 --- doc/source/10min.rst | 12 +---- doc/source/whatsnew/v0.21.0.txt | 3 +- pandas/core/computation/expressions.py | 2 +- pandas/core/generic.py | 51 ++++++++++++---------- pandas/core/internals.py | 25 +++++++++++ pandas/core/window.py | 2 +- pandas/tests/frame/test_block_internals.py | 12 ++++- pandas/tests/frame/test_constructors.py | 5 ++- pandas/tests/internals/test_internals.py | 7 +-- pandas/tests/sparse/test_frame.py | 5 ++- pandas/util/testing.py | 4 +- 11 files changed, 79 insertions(+), 49 deletions(-) diff --git a/doc/source/10min.rst b/doc/source/10min.rst index 0a23f490e66283..49142311ff0576 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -95,17 +95,7 @@ will be completed: df2.append df2.combine_first df2.apply df2.compound df2.applymap df2.consolidate - df2.as_blocks df2.convert_objects - df2.asfreq df2.copy - df2.as_matrix df2.corr - df2.astype df2.corrwith - df2.at df2.count - df2.at_time df2.cov - df2.axes df2.cummax - df2.B df2.cummin 
- df2.between_time df2.cumprod - df2.bfill df2.cumsum - df2.blocks df2.D + df2.D As you can see, the columns ``A``, ``B``, ``C``, and ``D`` are automatically tab completed. ``E`` is there as well; the rest of the attributes have been diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 1365901c2ce5e3..07cc00b3724e42 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -488,10 +488,9 @@ Other API Changes Deprecations ~~~~~~~~~~~~ - :func:`read_excel()` has deprecated ``sheetname`` in favor of ``sheet_name`` for consistency with ``.to_excel()`` (:issue:`10559`). - - ``pd.options.html.border`` has been deprecated in favor of ``pd.options.display.html.border`` (:issue:`15793`). - - :func:`SeriesGroupBy.nth` has deprecated ``True`` in favor of ``'all'`` for its kwarg ``dropna`` (:issue:`11038`). +- :func:`DataFrame.as_blocks` is deprecated, as this is exposing the internal implementation (:issue:`17302`) .. _whatsnew_0210.prior_deprecations: diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index af068bd1f32b34..8ddc625887a511 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -165,7 +165,7 @@ def _has_bool_dtype(x): return x.dtype == bool except AttributeError: try: - return 'bool' in x.blocks + return 'bool' in x.dtypes except AttributeError: return isinstance(x, (bool, np.bool_)) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3d55e07df6eacb..b49eeed6db85f0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1650,7 +1650,7 @@ def to_xarray(self): coords=coords, ) - _shared_docs['to_latex'] = """ + _shared_docs['to_latex'] = r""" Render an object to a tabular environment table. You can splice this into a LaTeX document. Requires \\usepackage{booktabs}. @@ -3271,7 +3271,7 @@ def sample(self, n=None, frac=None, replace=False, weights=None, locs = rs.choice(axis_length, size=n, replace=replace, p=weights) return self.take(locs, axis=axis, is_copy=False) - _shared_docs['pipe'] = (""" + _shared_docs['pipe'] = (r""" Apply func(self, \*args, \*\*kwargs) Parameters @@ -3692,6 +3692,8 @@ def as_blocks(self, copy=True): Convert the frame to a dict of dtype -> Constructor Types that each has a homogeneous dtype. + .. deprecated:: 0.21.0 + NOTE: the dtypes of the blocks WILL BE PRESERVED HERE (unlike in as_matrix) @@ -3699,32 +3701,34 @@ def as_blocks(self, copy=True): ---------- copy : boolean, default True - .. versionadded: 0.16.1 - Returns ------- values : a dict of dtype -> Constructor Types """ - self._consolidate_inplace() - - bd = {} - for b in self._data.blocks: - bd.setdefault(str(b.dtype), []).append(b) - - result = {} - for dtype, blocks in bd.items(): - # Must combine even after consolidation, because there may be - # sparse items which are never consolidated into one block. - combined = self._data.combine(blocks, copy=copy) - result[dtype] = self._constructor(combined).__finalize__(self) - - return result + warnings.warn("as_blocks is deprecated and will " + "be removed in a future version", + FutureWarning, stacklevel=2) + return self._to_dict_of_blocks(copy=copy) @property def blocks(self): - """Internal property, property synonym for as_blocks()""" + """ + Internal property, property synonym for as_blocks() + + .. deprecated:: 0.21.0 + """ return self.as_blocks() + def _to_dict_of_blocks(self, copy=True): + """ + Return a dict of dtype -> Constructor Types that + each is a homogeneous dtype. 
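+
+        A minimal sketch of the result's shape (illustrative only)::
+
+            >>> df = pd.DataFrame({'a': [1.0], 'b': [2]})
+            >>> sorted(df._to_dict_of_blocks())
+            ['float64', 'int64']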
+ + Internal ONLY + """ + return {k: self._constructor(v).__finalize__(self) + for k, v, in self._data.to_dict(copy=copy).items()} + @deprecate_kwarg(old_arg_name='raise_on_error', new_arg_name='errors', mapping={True: 'raise', False: 'ignore'}) def astype(self, dtype, copy=True, errors='raise', **kwargs): @@ -3931,13 +3935,12 @@ def convert_objects(self, convert_dates=True, convert_numeric=False, ------- converted : same as input object """ - from warnings import warn msg = ("convert_objects is deprecated. To re-infer data dtypes for " "object columns, use {klass}.infer_objects()\nFor all " "other conversions use the data-type specific converters " "pd.to_datetime, pd.to_timedelta and pd.to_numeric." ).format(klass=self.__class__.__name__) - warn(msg, FutureWarning, stacklevel=2) + warnings.warn(msg, FutureWarning, stacklevel=2) return self._constructor( self._data.convert(convert_dates=convert_dates, @@ -4310,9 +4313,9 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, raise AssertionError("'to_replace' must be 'None' if 'regex' is " "not a bool") if axis is not None: - from warnings import warn - warn('the "axis" argument is deprecated and will be removed in' - 'v0.13; this argument has no effect') + warnings.warn('the "axis" argument is deprecated ' + 'and will be removed in' + 'v0.13; this argument has no effect') self._consolidate_inplace() diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 2046bae759b9ab..e6f61a22e31373 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3583,6 +3583,31 @@ def _interleave(self): return result + def to_dict(self, copy=True): + """ + Return a dict of str(dtype) -> BlockManager + + Parameters + ---------- + copy : boolean, default True + + Returns + ------- + values : a dict of dtype -> BlockManager + + Notes + ----- + This consolidates based on str(dtype) + """ + self._consolidate_inplace() + + bd = {} + for b in self.blocks: + bd.setdefault(str(b.dtype), []).append(b) + + return {dtype: self.combine(blocks, copy=copy) + for dtype, blocks in bd.items()} + def xs(self, key, axis=1, copy=True, takeable=False): if axis < 1: raise AssertionError('Can only take xs across axis >= 1, got %d' % diff --git a/pandas/core/window.py b/pandas/core/window.py index 4bd959f52673c9..869296503225d0 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -141,7 +141,7 @@ def _create_blocks(self, how): if obj.ndim == 2: obj = obj.reindex(columns=obj.columns.difference([self.on]), copy=False) - blocks = obj.as_blocks(copy=False).values() + blocks = obj._to_dict_of_blocks(copy=False).values() return blocks, obj, index diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index afa3c4f25789ae..3ca185cf158a7c 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -320,7 +320,11 @@ def test_copy_blocks(self): column = df.columns[0] # use the default copy=True, change a column - blocks = df.as_blocks() + + # deprecated 0.21.0 + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + blocks = df.as_blocks() for dtype, _df in blocks.items(): if column in _df: _df.loc[:, column] = _df[column] + 1 @@ -334,7 +338,11 @@ def test_no_copy_blocks(self): column = df.columns[0] # use the copy=False, change a column - blocks = df.as_blocks(copy=False) + + # deprecated 0.21.0 + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + blocks = df.as_blocks(copy=False) for dtype, 
_df in blocks.items(): if column in _df: _df.loc[:, column] = _df[column] + 1 diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index d942330ecd8a6b..d0cd1899a0a3c5 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1766,7 +1766,7 @@ def test_from_records_sequencelike(self): # this is actually tricky to create the recordlike arrays and # have the dtypes be intact - blocks = df.blocks + blocks = df._to_dict_of_blocks() tuples = [] columns = [] dtypes = [] @@ -1841,8 +1841,9 @@ def test_from_records_dictlike(self): # columns is in a different order here than the actual items iterated # from the dict + blocks = df._to_dict_of_blocks() columns = [] - for dtype, b in compat.iteritems(df.blocks): + for dtype, b in compat.iteritems(blocks): columns.extend(b.columns) asdict = dict((x, y) for x, y in compat.iteritems(df)) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 0900d21b250ede..f40fc151676da1 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -469,10 +469,11 @@ def test_set_change_dtype_slice(self): # GH8850 df = DataFrame([[1.0, 2, 3], [4.0, 5, 6]], columns=cols) df['2nd'] = df['2nd'] * 2.0 - assert sorted(df.blocks.keys()) == ['float64', 'int64'] - assert_frame_equal(df.blocks['float64'], DataFrame( + blocks = df._to_dict_of_blocks() + assert sorted(blocks.keys()) == ['float64', 'int64'] + assert_frame_equal(blocks['float64'], DataFrame( [[1.0, 4.0], [4.0, 10.0]], columns=cols[:2])) - assert_frame_equal(df.blocks['int64'], DataFrame( + assert_frame_equal(blocks['int64'], DataFrame( [[3], [6]], columns=cols[2:])) def test_copy(self, mgr): diff --git a/pandas/tests/sparse/test_frame.py b/pandas/tests/sparse/test_frame.py index 004af5066fe835..ed4a3a9e5f75f8 100644 --- a/pandas/tests/sparse/test_frame.py +++ b/pandas/tests/sparse/test_frame.py @@ -1099,7 +1099,10 @@ def test_as_blocks(self): df = SparseDataFrame({'A': [1.1, 3.3], 'B': [nan, -3.9]}, dtype='float64') - df_blocks = df.blocks + # deprecated 0.21.0 + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + df_blocks = df.blocks assert list(df_blocks.keys()) == ['float64'] tm.assert_frame_equal(df_blocks['float64'], df) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 5adbd1498bb6aa..c5f73ca0e885bb 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1385,8 +1385,8 @@ def assert_frame_equal(left, right, check_dtype=True, # compare by blocks if by_blocks: - rblocks = right.blocks - lblocks = left.blocks + rblocks = right._to_dict_of_blocks() + lblocks = left._to_dict_of_blocks() for dtype in list(set(list(lblocks.keys()) + list(rblocks.keys()))): assert dtype in lblocks assert dtype in rblocks From d2b166885496ebf5f25cdedd27dce69379878aaa Mon Sep 17 00:00:00 2001 From: Sam Foo Date: Mon, 25 Sep 2017 06:20:50 -0400 Subject: [PATCH 149/188] TST: Use fixtures in indexes common tests (#17622) --- pandas/tests/indexes/common.py | 332 +++++++++++++-------------- pandas/tests/indexes/conftest.py | 24 ++ pandas/tests/indexes/datetimelike.py | 4 +- pandas/tests/indexes/test_base.py | 4 +- pandas/tests/indexes/test_numeric.py | 4 +- pandas/tests/indexes/test_range.py | 4 +- 6 files changed, 186 insertions(+), 186 deletions(-) create mode 100644 pandas/tests/indexes/conftest.py diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 90618cd6e235f6..970dd7b63225ab 
100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -30,9 +30,9 @@ def setup_indices(self): for name, idx in self.indices.items(): setattr(self, name, idx) - def verify_pickle(self, index): - unpickled = tm.round_trip_pickle(index) - assert index.equals(unpickled) + def verify_pickle(self, indices): + unpickled = tm.round_trip_pickle(indices) + assert indices.equals(unpickled) def test_pickle_compat_construction(self): # this is testing for pickle compat @@ -97,7 +97,7 @@ def test_numeric_compat(self): lambda: 1 * idx) div_err = "cannot perform __truediv__" if PY3 \ - else "cannot perform __div__" + else "cannot perform __div__" tm.assert_raises_regex(TypeError, div_err, lambda: idx / 1) tm.assert_raises_regex(TypeError, div_err, lambda: 1 / idx) tm.assert_raises_regex(TypeError, "cannot perform __floordiv__", @@ -178,11 +178,10 @@ def test_str(self): assert "'foo'" in str(idx) assert idx.__class__.__name__ in str(idx) - def test_dtype_str(self): - for idx in self.indices.values(): - dtype = idx.dtype_str - assert isinstance(dtype, compat.string_types) - assert dtype == str(idx.dtype) + def test_dtype_str(self, indices): + dtype = indices.dtype_str + assert isinstance(dtype, compat.string_types) + assert dtype == str(indices.dtype) def test_repr_max_seq_item_setting(self): # GH10182 @@ -192,48 +191,43 @@ def test_repr_max_seq_item_setting(self): repr(idx) assert '...' not in str(idx) - def test_wrong_number_names(self): + def test_wrong_number_names(self, indices): def testit(ind): ind.names = ["apple", "banana", "carrot"] + tm.assert_raises_regex(ValueError, "^Length", testit, indices) - for ind in self.indices.values(): - tm.assert_raises_regex(ValueError, "^Length", testit, ind) - - def test_set_name_methods(self): + def test_set_name_methods(self, indices): new_name = "This is the new name for this index" - for ind in self.indices.values(): - - # don't tests a MultiIndex here (as its tested separated) - if isinstance(ind, MultiIndex): - continue - original_name = ind.name - new_ind = ind.set_names([new_name]) - assert new_ind.name == new_name - assert ind.name == original_name - res = ind.rename(new_name, inplace=True) - - # should return None - assert res is None - assert ind.name == new_name - assert ind.names == [new_name] - # with tm.assert_raises_regex(TypeError, "list-like"): - # # should still fail even if it would be the right length - # ind.set_names("a") - with tm.assert_raises_regex(ValueError, "Level must be None"): - ind.set_names("a", level=0) - - # rename in place just leaves tuples and other containers alone - name = ('A', 'B') - ind.rename(name, inplace=True) - assert ind.name == name - assert ind.names == [name] - - def test_hash_error(self): - for ind in self.indices.values(): - with tm.assert_raises_regex(TypeError, "unhashable type: %r" % - type(ind).__name__): - hash(ind) + # don't tests a MultiIndex here (as its tested separated) + if isinstance(indices, MultiIndex): + return + original_name = indices.name + new_ind = indices.set_names([new_name]) + assert new_ind.name == new_name + assert indices.name == original_name + res = indices.rename(new_name, inplace=True) + + # should return None + assert res is None + assert indices.name == new_name + assert indices.names == [new_name] + # with tm.assert_raises_regex(TypeError, "list-like"): + # # should still fail even if it would be the right length + # ind.set_names("a") + with tm.assert_raises_regex(ValueError, "Level must be None"): + indices.set_names("a", level=0) + + # rename in 
place just leaves tuples and other containers alone + name = ('A', 'B') + indices.rename(name, inplace=True) + assert indices.name == name + assert indices.names == [name] + + def test_hash_error(self, indices): + index = indices + tm.assert_raises_regex(TypeError, "unhashable type: %r" % + type(index).__name__, hash, indices) def test_copy_name(self): # gh-12309: Check that the "name" argument @@ -298,106 +292,87 @@ def test_ensure_copied_data(self): tm.assert_numpy_array_equal(index._values, result._values, check_same='same') - def test_copy_and_deepcopy(self): + def test_copy_and_deepcopy(self, indices): from copy import copy, deepcopy - for ind in self.indices.values(): + if isinstance(indices, MultiIndex): + return + for func in (copy, deepcopy): + idx_copy = func(indices) + assert idx_copy is not indices + assert idx_copy.equals(indices) - # don't tests a MultiIndex here (as its tested separated) - if isinstance(ind, MultiIndex): - continue + new_copy = indices.copy(deep=True, name="banana") + assert new_copy.name == "banana" - for func in (copy, deepcopy): - idx_copy = func(ind) - assert idx_copy is not ind - assert idx_copy.equals(ind) + def test_duplicates(self, indices): + if type(indices) is not self._holder: + return + if not len(indices) or isinstance(indices, MultiIndex): + return + idx = self._holder([indices[0]] * 5) + assert not idx.is_unique + assert idx.has_duplicates - new_copy = ind.copy(deep=True, name="banana") - assert new_copy.name == "banana" + def test_get_unique_index(self, indices): + # MultiIndex tested separately + if not len(indices) or isinstance(indices, MultiIndex): + return - def test_duplicates(self): - for ind in self.indices.values(): + idx = indices[[0] * 5] + idx_unique = indices[[0]] - if not len(ind): - continue - if isinstance(ind, MultiIndex): - continue - idx = self._holder([ind[0]] * 5) - assert not idx.is_unique - assert idx.has_duplicates - - # GH 10115 - # preserve names - idx.name = 'foo' - result = idx.drop_duplicates() - assert result.name == 'foo' - tm.assert_index_equal(result, Index([ind[0]], name='foo')) - - def test_get_unique_index(self): - for ind in self.indices.values(): - - # MultiIndex tested separately - if not len(ind) or isinstance(ind, MultiIndex): - continue + # We test against `idx_unique`, so first we make sure it's unique + # and doesn't contain nans. + assert idx_unique.is_unique + try: + assert not idx_unique.hasnans + except NotImplementedError: + pass - idx = ind[[0] * 5] - idx_unique = ind[[0]] + for dropna in [False, True]: + result = idx._get_unique_index(dropna=dropna) + tm.assert_index_equal(result, idx_unique) - # We test against `idx_unique`, so first we make sure it's unique - # and doesn't contain nans. 
- assert idx_unique.is_unique - try: - assert not idx_unique.hasnans - except NotImplementedError: - pass + # nans: + if not indices._can_hold_na: + return - for dropna in [False, True]: - result = idx._get_unique_index(dropna=dropna) - tm.assert_index_equal(result, idx_unique) + if needs_i8_conversion(indices): + vals = indices.asi8[[0] * 5] + vals[0] = iNaT + else: + vals = indices.values[[0] * 5] + vals[0] = np.nan - # nans: - if not ind._can_hold_na: - continue + vals_unique = vals[:2] + idx_nan = indices._shallow_copy(vals) + idx_unique_nan = indices._shallow_copy(vals_unique) + assert idx_unique_nan.is_unique - if needs_i8_conversion(ind): - vals = ind.asi8[[0] * 5] - vals[0] = iNaT - else: - vals = ind.values[[0] * 5] - vals[0] = np.nan - - vals_unique = vals[:2] - idx_nan = ind._shallow_copy(vals) - idx_unique_nan = ind._shallow_copy(vals_unique) - assert idx_unique_nan.is_unique - - assert idx_nan.dtype == ind.dtype - assert idx_unique_nan.dtype == ind.dtype - - for dropna, expected in zip([False, True], - [idx_unique_nan, idx_unique]): - for i in [idx_nan, idx_unique_nan]: - result = i._get_unique_index(dropna=dropna) - tm.assert_index_equal(result, expected) - - def test_sort(self): - for ind in self.indices.values(): - pytest.raises(TypeError, ind.sort) - - def test_mutability(self): - for ind in self.indices.values(): - if not len(ind): - continue - pytest.raises(TypeError, ind.__setitem__, 0, ind[0]) + assert idx_nan.dtype == indices.dtype + assert idx_unique_nan.dtype == indices.dtype - def test_view(self): - for ind in self.indices.values(): - i_view = ind.view() - assert i_view.name == ind.name + for dropna, expected in zip([False, True], + [idx_unique_nan, + idx_unique]): + for i in [idx_nan, idx_unique_nan]: + result = i._get_unique_index(dropna=dropna) + tm.assert_index_equal(result, expected) - def test_compat(self): - for ind in self.indices.values(): - assert ind.tolist() == list(ind) + def test_sort(self, indices): + pytest.raises(TypeError, indices.sort) + + def test_mutability(self, indices): + if not len(indices): + return + pytest.raises(TypeError, indices.__setitem__, 0, indices[0]) + + def test_view(self, indices): + assert indices.view().name == indices.name + + def test_compat(self, indices): + assert indices.tolist() == list(indices) def test_memory_usage(self): for name, index in compat.iteritems(self.indices): @@ -457,11 +432,11 @@ def test_numpy_argsort(self): tm.assert_raises_regex(ValueError, msg, np.argsort, ind, order=('a', 'b')) - def test_pickle(self): - for ind in self.indices.values(): - self.verify_pickle(ind) - ind.name = 'foo' - self.verify_pickle(ind) + def test_pickle(self, indices): + self.verify_pickle(indices) + original_name, indices.name = indices.name, 'foo' + self.verify_pickle(indices) + indices.name = original_name def test_take(self): indexer = [4, 3, 0, 2] @@ -962,46 +937,47 @@ def test_join_self_unique(self, how): joined = index.join(index, how=how) assert (index == joined).all() - def test_searchsorted_monotonic(self): + def test_searchsorted_monotonic(self, indices): # GH17271 - for index in self.indices.values(): - # not implemented for tuple searches in MultiIndex - # or Intervals searches in IntervalIndex - if isinstance(index, (MultiIndex, IntervalIndex)): - continue + # not implemented for tuple searches in MultiIndex + # or Intervals searches in IntervalIndex + if isinstance(indices, (MultiIndex, IntervalIndex)): + return - # nothing to test if the index is empty - if index.empty: - continue - value = index[0] - - # 
determine the expected results (handle dupes for 'right') - expected_left, expected_right = 0, (index == value).argmin() - if expected_right == 0: - # all values are the same, expected_right should be length - expected_right = len(index) - - # test _searchsorted_monotonic in all cases - # test searchsorted only for increasing - if index.is_monotonic_increasing: - ssm_left = index._searchsorted_monotonic(value, side='left') - assert expected_left == ssm_left - - ssm_right = index._searchsorted_monotonic(value, side='right') - assert expected_right == ssm_right - - ss_left = index.searchsorted(value, side='left') - assert expected_left == ss_left - - ss_right = index.searchsorted(value, side='right') - assert expected_right == ss_right - elif index.is_monotonic_decreasing: - ssm_left = index._searchsorted_monotonic(value, side='left') - assert expected_left == ssm_left - - ssm_right = index._searchsorted_monotonic(value, side='right') - assert expected_right == ssm_right - else: - # non-monotonic should raise. - with pytest.raises(ValueError): - index._searchsorted_monotonic(value, side='left') + # nothing to test if the index is empty + if indices.empty: + return + value = indices[0] + + # determine the expected results (handle dupes for 'right') + expected_left, expected_right = 0, (indices == value).argmin() + if expected_right == 0: + # all values are the same, expected_right should be length + expected_right = len(indices) + + # test _searchsorted_monotonic in all cases + # test searchsorted only for increasing + if indices.is_monotonic_increasing: + ssm_left = indices._searchsorted_monotonic(value, side='left') + assert expected_left == ssm_left + + ssm_right = indices._searchsorted_monotonic(value, side='right') + assert expected_right == ssm_right + + ss_left = indices.searchsorted(value, side='left') + assert expected_left == ss_left + + ss_right = indices.searchsorted(value, side='right') + assert expected_right == ss_right + + elif indices.is_monotonic_decreasing: + ssm_left = indices._searchsorted_monotonic(value, side='left') + assert expected_left == ssm_left + + ssm_right = indices._searchsorted_monotonic(value, side='right') + assert expected_right == ssm_right + + else: + # non-monotonic should raise. 
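+            # (without a monotonic ordering there is no well-defined
+            # insertion point to search for)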
+ with pytest.raises(ValueError): + indices._searchsorted_monotonic(value, side='left') diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py new file mode 100644 index 00000000000000..a0ee3e511ef378 --- /dev/null +++ b/pandas/tests/indexes/conftest.py @@ -0,0 +1,24 @@ +import pytest + +import pandas.util.testing as tm +from pandas.core.indexes.api import Index, MultiIndex +from pandas.compat import lzip + + +@pytest.fixture(params=[tm.makeUnicodeIndex(100), + tm.makeStringIndex(100), + tm.makeDateIndex(100), + tm.makePeriodIndex(100), + tm.makeTimedeltaIndex(100), + tm.makeIntIndex(100), + tm.makeUIntIndex(100), + tm.makeFloatIndex(100), + Index([True, False]), + tm.makeCategoricalIndex(100), + Index([]), + MultiIndex.from_tuples(lzip( + ['foo', 'bar', 'baz'], [1, 2, 3])), + Index([0, 0, 1, 1, 2, 2])], + ids=lambda x: type(x).__name__) +def indices(request): + return request.param diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index 114940009377c7..12b509d4aef3fe 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -26,8 +26,8 @@ def test_str(self): if hasattr(idx, 'freq'): assert "freq='%s'" % idx.freqstr in str(idx) - def test_view(self): - super(DatetimeLike, self).test_view() + def test_view(self, indices): + super(DatetimeLike, self).test_view(indices) i = self.create_index() diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index fa73c9fc7b7225..0bd2861e060eda 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -58,8 +58,8 @@ def test_new_axis(self): assert new_index.ndim == 2 assert isinstance(new_index, np.ndarray) - def test_copy_and_deepcopy(self): - super(TestIndex, self).test_copy_and_deepcopy() + def test_copy_and_deepcopy(self, indices): + super(TestIndex, self).test_copy_and_deepcopy(indices) new_copy2 = self.intIndex.copy(dtype=int) assert new_copy2.dtype.kind == 'i' diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 7e7e10e4aeabee..dc38b0a2b1fb7f 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -459,8 +459,8 @@ def test_take_fill_value(self): class NumericInt(Numeric): - def test_view(self): - super(NumericInt, self).test_view() + def test_view(self, indices): + super(NumericInt, self).test_view(indices) i = self._holder([], name='Foo') i_view = i.view() diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 8dc5a40ced4bfd..9fe10885186de0 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -312,8 +312,8 @@ def test_delete(self): # either depending on numpy version result = idx.delete(len(idx)) - def test_view(self): - super(TestRangeIndex, self).test_view() + def test_view(self, indices): + super(TestRangeIndex, self).test_view(indices) i = RangeIndex(0, name='Foo') i_view = i.view() From 9d0db60f75783ba5a1a036aac0485f8b760d61dc Mon Sep 17 00:00:00 2001 From: dkamm Date: Mon, 25 Sep 2017 07:12:27 -0400 Subject: [PATCH 150/188] BUG: wrap all supported inplace methods to avoid making a copy (#12962) (#17589) --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/ops.py | 15 ++++++++++++--- pandas/tests/frame/test_operators.py | 27 +++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 07cc00b3724e42..36551fa30c3adc 100644 --- 
a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -646,4 +646,5 @@ PyPy Other ^^^^^ +- Bug where some inplace operators were not being wrapped and produced a copy when invoked (:issue:`12962`) - Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 221f6ff8b92c68..d37acf48ed9c28 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -186,8 +186,10 @@ def add_special_arithmetic_methods(cls, arith_method=None, arith_method : function (optional) factory for special arithmetic methods, with op string: f(op, name, str_rep, default_axis=None, fill_zeros=None, **eval_kwargs) - comp_method : function, optional, + comp_method : function (optional) factory for rich comparison - signature: f(op, name, str_rep) + bool_method : function (optional) + factory for boolean methods - signature: f(op, name, str_rep) use_numexpr : bool, default True whether to accelerate with numexpr, defaults to True force : bool, default False @@ -234,9 +236,16 @@ def f(self, other): __isub__=_wrap_inplace_method(new_methods["__sub__"]), __imul__=_wrap_inplace_method(new_methods["__mul__"]), __itruediv__=_wrap_inplace_method(new_methods["__truediv__"]), - __ipow__=_wrap_inplace_method(new_methods["__pow__"]), )) + __ifloordiv__=_wrap_inplace_method(new_methods["__floordiv__"]), + __imod__=_wrap_inplace_method(new_methods["__mod__"]), + __ipow__=_wrap_inplace_method(new_methods["__pow__"]))) if not compat.PY3: - new_methods["__idiv__"] = new_methods["__div__"] + new_methods["__idiv__"] = _wrap_inplace_method(new_methods["__div__"]) + if bool_method: + new_methods.update( + dict(__iand__=_wrap_inplace_method(new_methods["__and__"]), + __ior__=_wrap_inplace_method(new_methods["__or__"]), + __ixor__=_wrap_inplace_method(new_methods["__xor__"]))) add_methods(cls, new_methods=new_methods, force=force, select=select, exclude=exclude) diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 309c0f0244d7c8..10a9853b8a5b4f 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -1167,6 +1167,33 @@ def test_inplace_ops_identity(self): assert_frame_equal(df2, expected) assert df._data is df2._data + @pytest.mark.parametrize('op', ['add', 'and', 'div', 'floordiv', 'mod', + 'mul', 'or', 'pow', 'sub', 'truediv', + 'xor']) + def test_inplace_ops_identity2(self, op): + + if compat.PY3 and op == 'div': + return + + df = DataFrame({'a': [1., 2., 3.], + 'b': [1, 2, 3]}) + + operand = 2 + if op in ('and', 'or', 'xor'): + # cannot use floats for boolean ops + df['a'] = [True, False, True] + + df_copy = df.copy() + iop = '__i{}__'.format(op) + op = '__{}__'.format(op) + + # no id change and value is correct + getattr(df, iop)(operand) + expected = getattr(df_copy, op)(operand) + assert_frame_equal(df, expected) + expected = id(df) + assert id(df) == expected + def test_alignment_non_pandas(self): index = ['A', 'B', 'C'] columns = ['X', 'Y', 'Z'] From 83c9205971bd173692286187024ae30aae9ffc39 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Mon, 25 Sep 2017 13:54:02 +0200 Subject: [PATCH 151/188] Correct wrong doc string for MultiIndex.get_loc_level + added examples (#17663) --- pandas/core/indexes/multi.py | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 8c6b26c9070a9c..35f738b347a3eb 100644 --- 
a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2083,16 +2083,42 @@ def _maybe_str_to_time_stamp(key, lev): def get_loc_level(self, key, level=0, drop_level=True): """ - Get integer location slice for requested label or tuple + Get both the location for the requested label(s) and the + resulting sliced index. Parameters ---------- - key : label or tuple - level : int/level name or list thereof + key : label or sequence of labels + level : int/level name or list thereof, optional + drop_level : bool, default True + if ``False``, the resulting index will not drop any level. Returns ------- - loc : int or slice object + loc : A 2-tuple where the elements are: + Element 0: int, slice object or boolean array + Element 1: The resulting sliced multiindex/index. If the key + contains all levels, this will be ``None``. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')], + ... names=['A', 'B']) + + >>> mi.get_loc_level('b') + (slice(1, 3, None), Index(['e', 'f'], dtype='object', name='B')) + + >>> mi.get_loc_level('e', level='B') + (array([False, True, False], dtype=bool), + Index(['b'], dtype='object', name='A')) + + >>> mi.get_loc_level(['b', 'e']) + (1, None) + + See Also + --------- + MultiIndex.get_loc : Get integer location, slice or boolean mask for + requested label or tuple. """ def maybe_droplevels(indexer, levels, drop_level): From e0fe5cc60b1dc0d777223bba64b8abfc0e0e02ab Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 25 Sep 2017 10:12:45 -0400 Subject: [PATCH 152/188] COMPAT: skip 32-bit test on int repr (#17664) closes #17121 --- pandas/tests/frame/test_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index b3209da6449d6a..230a5806ccb2e6 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -213,8 +213,8 @@ def test_itertuples(self): assert (list(dfaa.itertuples()) == [(0, 1, 1), (1, 2, 2), (2, 3, 3)]) - # repr with be int/long on windows - if not compat.is_platform_windows(): + # repr with be int/long on 32-bit/windows + if not (compat.is_platform_windows() or compat.is_platform_32bit()): assert (repr(list(df.itertuples(name=None))) == '[(0, 1, 4), (1, 2, 5), (2, 3, 6)]') From 45a795e03c985aa3d456916879e3728b90276a7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Jonasson?= Date: Mon, 25 Sep 2017 23:50:18 +0200 Subject: [PATCH 153/188] ERR: get_indexer returns the correct indexer when Index is numeric and target is boolean (#16877) (#17343) --- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/core/indexes/base.py | 7 ++++++- pandas/tests/indexes/test_base.py | 7 +++++++ pandas/tests/series/test_indexing.py | 5 +++++ 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 36551fa30c3adc..b6bd86bd79a1f2 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -481,7 +481,7 @@ Other API Changes - :class:`Period` is now immutable, and will now raise an ``AttributeError`` when a user tries to assign a new value to the ``ordinal`` or ``freq`` attributes (:issue:`17116`). 
- :func:`to_datetime` when passed a tz-aware ``origin=`` kwarg will now raise a more informative ``ValueError`` rather than a ``TypeError`` (:issue:`16842`) - Renamed non-functional ``index`` to ``index_col`` in :func:`read_stata` to improve API consistency (:issue:`16342`) - +- Bug in :func:`DataFrame.drop` caused boolean labels ``False`` and ``True`` to be treated as labels 0 and 1 respectively when dropping indices from a numeric index. This will now raise a ValueError (:issue:`16877`) .. _whatsnew_0210.deprecations: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f28ff9697e517f..be26720adb0bda 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2609,6 +2609,12 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): if tolerance is not None: tolerance = self._convert_tolerance(tolerance) + # Treat boolean labels passed to a numeric index as not found. Without + # this fix False and True would be treated as 0 and 1 respectively. + # (GH #16877) + if target.is_boolean() and self.is_numeric(): + return _ensure_platform_int(np.repeat(-1, target.size)) + pself, ptarget = self._maybe_promote(target) if pself is not self or ptarget is not target: return pself.get_indexer(ptarget, method=method, limit=limit, @@ -2637,7 +2643,6 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): 'backfill or nearest reindexing') indexer = self._engine.get_indexer(target._values) - return _ensure_platform_int(indexer) def _convert_tolerance(self, tolerance): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 0bd2861e060eda..81f113d58d680a 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1141,6 +1141,13 @@ def test_get_indexer_strings(self): with pytest.raises(TypeError): idx.get_indexer(['a', 'b', 'c', 'd'], method='pad', tolerance=2) + def test_get_indexer_numeric_index_boolean_target(self): + # GH 16877 + numeric_idx = pd.Index(range(4)) + result = numeric_idx.get_indexer([True, False, True]) + expected = np.array([-1, -1, -1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + def test_get_loc(self): idx = pd.Index([0, 1, 2]) all_methods = [None, 'pad', 'backfill', 'nearest'] diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index 91187b709463aa..2182e3fbfc2129 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -1783,6 +1783,11 @@ def test_drop(self): expected = Series([3], index=[False]) assert_series_equal(result, expected) + # GH 16877 + s = Series([2, 3], index=[0, 1]) + with tm.assert_raises_regex(ValueError, 'not contained in axis'): + s.drop([False, True]) + def test_align(self): def _check_align(a, b, how='left', fill=None): aa, ab = a.align(b, join=how, fill_value=fill) From 5279a172a86ac22250c5a382708e23917df79744 Mon Sep 17 00:00:00 2001 From: JennaVergeynst Date: Tue, 26 Sep 2017 01:19:21 +0200 Subject: [PATCH 154/188] DOC: improve docstring of function where (#17665) --- pandas/core/generic.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b49eeed6db85f0..a7be145f210833 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5825,13 +5825,15 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, _shared_docs['where'] = (""" Return an object of same shape as self and whose corresponding - 
entries are from self where cond is %(cond)s and otherwise are from - other. + entries are from self where `cond` is %(cond)s and otherwise are from + `other`. Parameters ---------- cond : boolean %(klass)s, array-like, or callable - If cond is callable, it is computed on the %(klass)s and + Where `cond` is %(cond)s, keep the original value. Where + %(cond_rev)s, replace with corresponding value from `other`. + If `cond` is callable, it is computed on the %(klass)s and should return boolean %(klass)s or array. The callable must not change input %(klass)s (though pandas doesn't check it). @@ -5839,6 +5841,8 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, A callable can be used as cond. other : scalar, %(klass)s, or callable + Entries where `cond` is %(cond_rev)s are replaced with + corresponding value from `other`. If other is callable, it is computed on the %(klass)s and should return scalar or %(klass)s. The callable must not change input %(klass)s (though pandas doesn't check it). @@ -5884,6 +5888,20 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, 3 3.0 4 4.0 + >>> s.mask(s > 0) + 0 0.0 + 1 NaN + 2 NaN + 3 NaN + 4 NaN + + >>> s.where(s > 1, 10) + 0 10.0 + 1 10.0 + 2 2.0 + 3 3.0 + 4 4.0 + >>> df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B']) >>> m = df %% 3 == 0 >>> df.where(m, -df) @@ -5914,7 +5932,8 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, """) @Appender(_shared_docs['where'] % dict(_shared_doc_kwargs, cond="True", - name='where', name_other='mask')) + cond_rev="False", name='where', + name_other='mask')) def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, try_cast=False, raise_on_error=True): @@ -5923,7 +5942,8 @@ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, raise_on_error) @Appender(_shared_docs['where'] % dict(_shared_doc_kwargs, cond="False", - name='mask', name_other='where')) + cond_rev="True", name='mask', + name_other='where')) def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None, try_cast=False, raise_on_error=True): From 0d239d9ec0a38d8208269fc688a49f0c3c6a9b2a Mon Sep 17 00:00:00 2001 From: Gabe F Date: Tue, 26 Sep 2017 06:34:33 -0400 Subject: [PATCH 155/188] DOC: correct grammar in unicode section (#17678) --- doc/source/options.rst | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/doc/source/options.rst b/doc/source/options.rst index f042e4d3f51204..2da55a5a658a47 100644 --- a/doc/source/options.rst +++ b/doc/source/options.rst @@ -474,10 +474,10 @@ Unicode Formatting Enabling this option will affect the performance for printing of DataFrame and Series (about 2 times slower). Use only when it is actually required. -Some East Asian countries use Unicode characters its width is corresponding to 2 alphabets. -If DataFrame or Series contains these characters, default output cannot be aligned properly. +Some East Asian countries use Unicode characters whose width corresponds to two Latin characters. +If a DataFrame or Series contains these characters, the default output mode may not align them properly. -.. note:: Screen captures are attached for each outputs to show the actual results. +.. note:: Screen captures are attached for each output to show the actual results. .. ipython:: python @@ -486,8 +486,9 @@ If DataFrame or Series contains these characters, default output cannot be align .. 
image:: _static/option_unicode01.png -Enable ``display.unicode.east_asian_width`` allows pandas to check each character's "East Asian Width" property. -These characters can be aligned properly by checking this property, but it takes longer time than standard ``len`` function. +Enabling ``display.unicode.east_asian_width`` allows pandas to check each character's "East Asian Width" property. +These characters can be aligned properly by setting this option to ``True``. However, this will result in longer render +times than the standard ``len`` function. .. ipython:: python @@ -496,9 +497,10 @@ These characters can be aligned properly by checking this property, but it takes .. image:: _static/option_unicode02.png -In addition, Unicode contains characters which width is "Ambiguous". These character's width should be either 1 or 2 depending on terminal setting or encoding. Because this cannot be distinguished from Python, ``display.unicode.ambiguous_as_wide`` option is added to handle this. +In addition, Unicode characters whose width is "Ambiguous" can either be 1 or 2 characters wide depending on the +terminal setting or encoding. The option ``display.unicode.ambiguous_as_wide`` can be used to handle the ambiguity. -By default, "Ambiguous" character's width, "¡" (inverted exclamation) in below example, is regarded as 1. +By default, an "Ambiguous" character's width, such as "¡" (inverted exclamation) in the example below, is taken to be 1. .. ipython:: python @@ -507,7 +509,10 @@ By default, "Ambiguous" character's width, "¡" (inverted exclamation) in below .. image:: _static/option_unicode03.png -Enabling ``display.unicode.ambiguous_as_wide`` lets pandas to figure these character's width as 2. Note that this option will be effective only when ``display.unicode.east_asian_width`` is enabled. Confirm starting position has been changed, but is not aligned properly because the setting is mismatched with this environment. +Enabling ``display.unicode.ambiguous_as_wide`` makes pandas interpret these characters' widths to be 2. +(Note that this option will only be effective when ``display.unicode.east_asian_width`` is enabled.) + +However, setting this option incorrectly for your terminal will cause these characters to be aligned incorrectly: .. 
ipython:: python From 7e87385e20682184a3f5d188c8e783d63c703b83 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 26 Sep 2017 06:29:19 -0700 Subject: [PATCH 156/188] Separate parsing functions out from tslib (#17363) --- pandas/_libs/period.pyx | 4 +- pandas/_libs/src/inference.pyx | 159 ----- pandas/_libs/tslib.pyx | 274 +------- pandas/_libs/tslibs/parsing.pyx | 681 +++++++++++++++++++ pandas/core/indexes/base.py | 3 +- pandas/core/tools/datetimes.py | 204 +----- pandas/io/date_converters.py | 11 +- pandas/io/parsers.py | 10 +- pandas/tests/indexes/datetimes/test_tools.py | 9 +- pandas/tests/io/parser/parse_dates.py | 5 +- setup.py | 3 + 11 files changed, 721 insertions(+), 642 deletions(-) create mode 100644 pandas/_libs/tslibs/parsing.pyx diff --git a/pandas/_libs/period.pyx b/pandas/_libs/period.pyx index 943f925ec5b04a..725da22104efcc 100644 --- a/pandas/_libs/period.pyx +++ b/pandas/_libs/period.pyx @@ -37,10 +37,10 @@ from tslibs.timezones cimport ( is_utc, is_tzlocal, get_utcoffset, get_dst_info, maybe_get_tz) from tslib cimport _nat_scalar_rules +from tslibs.parsing import parse_time_string, NAT_SENTINEL from tslibs.frequencies cimport get_freq_code from pandas.tseries import offsets -from pandas.core.tools.datetimes import parse_time_string from pandas.tseries import frequencies cdef int64_t NPY_NAT = util.get_nat() @@ -1197,6 +1197,8 @@ class Period(_Period): value = str(value) value = value.upper() dt, _, reso = parse_time_string(value, freq) + if dt is NAT_SENTINEL: + ordinal = iNaT if freq is None: try: diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index a2764e87eec556..ed883bf5db5bcc 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -1384,165 +1384,6 @@ def convert_sql_column(x): return maybe_convert_objects(x, try_float=1) -def try_parse_dates(ndarray[object] values, parser=None, - dayfirst=False, default=None): - cdef: - Py_ssize_t i, n - ndarray[object] result - - n = len(values) - result = np.empty(n, dtype='O') - - if parser is None: - if default is None: # GH2618 - date=datetime.now() - default=datetime(date.year, date.month, 1) - - try: - from dateutil.parser import parse - parse_date = lambda x: parse(x, dayfirst=dayfirst, default=default) - except ImportError: # pragma: no cover - def parse_date(s): - try: - return datetime.strptime(s, '%m/%d/%Y') - except Exception: - return s - # EAFP here - try: - for i from 0 <= i < n: - if values[i] == '': - result[i] = np.nan - else: - result[i] = parse_date(values[i]) - except Exception: - # failed - return values - else: - parse_date = parser - - try: - for i from 0 <= i < n: - if values[i] == '': - result[i] = np.nan - else: - result[i] = parse_date(values[i]) - except Exception: - # raise if passed parser and it failed - raise - - return result - - -def try_parse_date_and_time(ndarray[object] dates, ndarray[object] times, - date_parser=None, time_parser=None, - dayfirst=False, default=None): - cdef: - Py_ssize_t i, n - ndarray[object] result - - from datetime import date, time, datetime, timedelta - - n = len(dates) - if len(times) != n: - raise ValueError('Length of dates and times must be equal') - result = np.empty(n, dtype='O') - - if date_parser is None: - if default is None: # GH2618 - date=datetime.now() - default=datetime(date.year, date.month, 1) - - try: - from dateutil.parser import parse - parse_date = lambda x: parse(x, dayfirst=dayfirst, default=default) - except ImportError: # pragma: no cover - def parse_date(s): - try: - return 
date.strptime(s, '%m/%d/%Y') - except Exception: - return s - else: - parse_date = date_parser - - if time_parser is None: - try: - from dateutil.parser import parse - parse_time = lambda x: parse(x) - except ImportError: # pragma: no cover - def parse_time(s): - try: - return time.strptime(s, '%H:%M:%S') - except Exception: - return s - - else: - parse_time = time_parser - - for i from 0 <= i < n: - d = parse_date(str(dates[i])) - t = parse_time(str(times[i])) - result[i] = datetime(d.year, d.month, d.day, - t.hour, t.minute, t.second) - - return result - - -def try_parse_year_month_day(ndarray[object] years, ndarray[object] months, - ndarray[object] days): - cdef: - Py_ssize_t i, n - ndarray[object] result - - from datetime import datetime - - n = len(years) - if len(months) != n or len(days) != n: - raise ValueError('Length of years/months/days must all be equal') - result = np.empty(n, dtype='O') - - for i from 0 <= i < n: - result[i] = datetime(int(years[i]), int(months[i]), int(days[i])) - - return result - - -def try_parse_datetime_components(ndarray[object] years, - ndarray[object] months, - ndarray[object] days, - ndarray[object] hours, - ndarray[object] minutes, - ndarray[object] seconds): - - cdef: - Py_ssize_t i, n - ndarray[object] result - int secs - double float_secs - double micros - - from datetime import datetime - - n = len(years) - if (len(months) != n or len(days) != n or len(hours) != n or - len(minutes) != n or len(seconds) != n): - raise ValueError('Length of all datetime components must be equal') - result = np.empty(n, dtype='O') - - for i from 0 <= i < n: - float_secs = float(seconds[i]) - secs = int(float_secs) - - micros = float_secs - secs - if micros > 0: - micros = micros * 1000000 - - result[i] = datetime(int(years[i]), int(months[i]), int(days[i]), - int(hours[i]), int(minutes[i]), secs, - int(micros)) - - return result - - def sanitize_objects(ndarray[object] values, set na_values, convert_empty=True): cdef: diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index d4ca5af09367eb..4c34d0fcb1e5f6 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -67,6 +67,9 @@ from khash cimport ( kh_init_int64, kh_int64_t, kh_resize_int64, kh_get_int64) +from .tslibs.parsing import parse_datetime_string +from .tslibs.parsing import DateParseError # noqa + cimport cython import re @@ -1737,26 +1740,6 @@ def datetime_to_datetime64(ndarray[object] values): return result, inferred_tz -cdef: - set _not_datelike_strings = set(['a', 'A', 'm', 'M', 'p', 'P', 't', 'T']) - -cpdef bint _does_string_look_like_datetime(object date_string): - if date_string.startswith('0'): - # Strings starting with 0 are more consistent with a - # date-like string than a number - return True - - try: - if float(date_string) < 1000: - return False - except ValueError: - pass - - if date_string in _not_datelike_strings: - return False - - return True - def format_array_from_datetime(ndarray[int64_t] values, object tz=None, object format=None, object na_rep=None): @@ -1841,257 +1824,6 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None, return result -class DateParseError(ValueError): - pass - - -cdef object _TIMEPAT = re.compile(r'^([01]?[0-9]|2[0-3]):([0-5][0-9])') - - -def parse_datetime_string(object date_string, object freq=None, - dayfirst=False, yearfirst=False, **kwargs): - """parse datetime string, only returns datetime. - Also cares special handling matching time patterns. 
- - Returns - ------- - datetime - """ - - cdef: - object dt - - if not _does_string_look_like_datetime(date_string): - raise ValueError('Given date string not likely a datetime.') - - if _TIMEPAT.match(date_string): - # use current datetime as default, not pass _DEFAULT_DATETIME - dt = parse_date(date_string, dayfirst=dayfirst, - yearfirst=yearfirst, **kwargs) - return dt - try: - dt, _, _ = _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq) - return dt - except DateParseError: - raise - except ValueError: - pass - - try: - dt = parse_date(date_string, default=_DEFAULT_DATETIME, - dayfirst=dayfirst, yearfirst=yearfirst, **kwargs) - except TypeError: - # following may be raised from dateutil - # TypeError: 'NoneType' object is not iterable - raise ValueError('Given date string not likely a datetime.') - - return dt - - -def parse_datetime_string_with_reso(object date_string, object freq=None, - dayfirst=False, yearfirst=False, **kwargs): - """parse datetime string, only returns datetime - - Returns - ------- - datetime - """ - - cdef: - object parsed, reso - - if not _does_string_look_like_datetime(date_string): - raise ValueError('Given date string not likely a datetime.') - - try: - return _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq) - except DateParseError: - raise - except ValueError: - pass - - try: - parsed, reso = dateutil_parse(date_string, _DEFAULT_DATETIME, - dayfirst=dayfirst, yearfirst=yearfirst) - except Exception as e: - # TODO: allow raise of errors within instead - raise DateParseError(e) - if parsed is None: - raise DateParseError("Could not parse %s" % date_string) - return parsed, parsed, reso - - -cdef inline object _parse_dateabbr_string(object date_string, object default, - object freq): - cdef: - object ret - int year, quarter = -1, month, mnum, date_len - - # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1 - assert util.is_string_object(date_string) - - # len(date_string) == 0 - # should be NaT??? 
- - if date_string in _nat_strings: - return NaT, NaT, '' - - date_string = date_string.upper() - date_len = len(date_string) - - if date_len == 4: - # parse year only like 2000 - try: - ret = default.replace(year=int(date_string)) - return ret, ret, 'year' - except ValueError: - pass - - try: - if 4 <= date_len <= 7: - i = date_string.index('Q', 1, 6) - if i == 1: - quarter = int(date_string[0]) - if date_len == 4 or (date_len == 5 - and date_string[i + 1] == '-'): - # r'(\d)Q-?(\d\d)') - year = 2000 + int(date_string[-2:]) - elif date_len == 6 or (date_len == 7 - and date_string[i + 1] == '-'): - # r'(\d)Q-?(\d\d\d\d)') - year = int(date_string[-4:]) - else: - raise ValueError - elif i == 2 or i == 3: - # r'(\d\d)-?Q(\d)' - if date_len == 4 or (date_len == 5 - and date_string[i - 1] == '-'): - quarter = int(date_string[-1]) - year = 2000 + int(date_string[:2]) - else: - raise ValueError - elif i == 4 or i == 5: - if date_len == 6 or (date_len == 7 - and date_string[i - 1] == '-'): - # r'(\d\d\d\d)-?Q(\d)' - quarter = int(date_string[-1]) - year = int(date_string[:4]) - else: - raise ValueError - - if not (1 <= quarter <= 4): - msg = ('Incorrect quarterly string is given, quarter must be ' - 'between 1 and 4: {0}') - raise DateParseError(msg.format(date_string)) - - if freq is not None: - # hack attack, #1228 - try: - mnum = _MONTH_NUMBERS[_get_rule_month(freq)] + 1 - except (KeyError, ValueError): - msg = ('Unable to retrieve month information from given ' - 'freq: {0}').format(freq) - raise DateParseError(msg) - - month = (mnum + (quarter - 1) * 3) % 12 + 1 - if month > mnum: - year -= 1 - else: - month = (quarter - 1) * 3 + 1 - - ret = default.replace(year=year, month=month) - return ret, ret, 'quarter' - - except DateParseError: - raise - except ValueError: - pass - - if date_len == 6 and (freq == 'M' or getattr( - freq, 'rule_code', None) == 'M'): - year = int(date_string[:4]) - month = int(date_string[4:6]) - try: - ret = default.replace(year=year, month=month) - return ret, ret, 'month' - except ValueError: - pass - - for pat in ['%Y-%m', '%m-%Y', '%b %Y', '%b-%Y']: - try: - ret = datetime.strptime(date_string, pat) - return ret, ret, 'month' - except ValueError: - pass - - raise ValueError('Unable to parse {0}'.format(date_string)) - - -def dateutil_parse(object timestr, object default, ignoretz=False, - tzinfos=None, **kwargs): - """ lifted from dateutil to get resolution""" - - cdef: - object fobj, res, attr, ret, tzdata - object reso = None - dict repl = {} - - fobj = StringIO(str(timestr)) - res = DEFAULTPARSER._parse(fobj, **kwargs) - - # dateutil 2.2 compat - if isinstance(res, tuple): - res, _ = res - - if res is None: - msg = "Unknown datetime string format, unable to parse: {0}" - raise ValueError(msg.format(timestr)) - - for attr in ["year", "month", "day", "hour", - "minute", "second", "microsecond"]: - value = getattr(res, attr) - if value is not None: - repl[attr] = value - reso = attr - - if reso is None: - msg = "Unable to parse datetime string: {0}" - raise ValueError(msg.format(timestr)) - - if reso == 'microsecond': - if repl['microsecond'] == 0: - reso = 'second' - elif repl['microsecond'] % 1000 == 0: - reso = 'millisecond' - - ret = default.replace(**repl) - if res.weekday is not None and not res.day: - ret = ret + relativedelta.relativedelta(weekday=res.weekday) - if not ignoretz: - if callable(tzinfos) or tzinfos and res.tzname in tzinfos: - if callable(tzinfos): - tzdata = tzinfos(res.tzname, res.tzoffset) - else: - tzdata = tzinfos.get(res.tzname) - if 
isinstance(tzdata, datetime.tzinfo):
-                    tzinfo = tzdata
-                elif isinstance(tzdata, string_types):
-                    tzinfo = _dateutil_tzstr(tzdata)
-                elif isinstance(tzdata, int):
-                    tzinfo = tzoffset(res.tzname, tzdata)
-                else:
-                    raise ValueError("offset must be tzinfo subclass, "
-                                     "tz string, or int offset")
-                ret = ret.replace(tzinfo=tzinfo)
-            elif res.tzname and res.tzname in time.tzname:
-                ret = ret.replace(tzinfo=_dateutil_tzlocal())
-            elif res.tzoffset == 0:
-                ret = ret.replace(tzinfo=_dateutil_tzutc())
-            elif res.tzoffset:
-                ret = ret.replace(tzinfo=tzoffset(res.tzname, res.tzoffset))
-        return ret, reso
-
-
 # const for parsers
 _DEFAULT_DATETIME = datetime(1, 1, 1).replace(
diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx
new file mode 100644
index 00000000000000..845d1b8dcabba3
--- /dev/null
+++ b/pandas/_libs/tslibs/parsing.pyx
@@ -0,0 +1,681 @@
+# -*- coding: utf-8 -*-
+# cython: profile=False
+# cython: linetrace=False
+# distutils: define_macros=CYTHON_TRACE=0
+# distutils: define_macros=CYTHON_TRACE_NOGIL=0
+"""
+Parsing functions for datetime and datetime-like strings.
+"""
+import sys
+import re
+
+from cpython cimport PyString_Check, PyUnicode_Check
+
+from libc.stdlib cimport free
+
+cimport cython
+from cython cimport Py_ssize_t
+
+
+from datetime import datetime
+import time
+
+import numpy as np
+cimport numpy as np
+from numpy cimport int64_t, ndarray
+np.import_array()
+
+# Avoid import from outside _libs
+if sys.version_info.major == 2:
+    string_types = basestring
+    from StringIO import StringIO
+else:
+    string_types = str
+    from io import StringIO
+
+
+# dateutil compat
+from dateutil.tz import (tzoffset,
+                         tzlocal as _dateutil_tzlocal,
+                         tzfile as _dateutil_tzfile,
+                         tzutc as _dateutil_tzutc,
+                         tzstr as _dateutil_tzstr)
+from dateutil.relativedelta import relativedelta
+from dateutil.parser import DEFAULTPARSER
+from dateutil.parser import parse as du_parse
+
+
+class DateParseError(ValueError):
+    pass
+
+_nat_strings = set(['NaT', 'nat', 'NAT', 'nan', 'NaN', 'NAN'])
+
+_DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0,
+                                              second=0, microsecond=0)
+_MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL',
+           'AUG', 'SEP', 'OCT', 'NOV', 'DEC']
+_MONTH_NUMBERS = {k: i for i, k in enumerate(_MONTHS)}
+_MONTH_ALIASES = {(k + 1): v for k, v in enumerate(_MONTHS)}
+
+cdef object _TIMEPAT = re.compile(r'^([01]?[0-9]|2[0-3]):([0-5][0-9])')
+
+cdef set _not_datelike_strings = set(['a', 'A', 'm', 'M', 'p', 'P', 't', 'T'])
+
+NAT_SENTINEL = object()
+# This allows us to reference NaT without having to import it
+
+
+def parse_datetime_string(date_string, freq=None, dayfirst=False,
+                          yearfirst=False, **kwargs):
+    """Parse a datetime string; only a datetime object is returned.
+    Strings matching recognized time patterns also get special handling.
+
+    Returns
+    -------
+    datetime
+    """
+
+    cdef:
+        object dt
+
+    if not _does_string_look_like_datetime(date_string):
+        raise ValueError('Given date string not likely a datetime.')
+
+    if _TIMEPAT.match(date_string):
+        # use current datetime as default, not pass _DEFAULT_DATETIME
+        dt = du_parse(date_string, dayfirst=dayfirst,
+                      yearfirst=yearfirst, **kwargs)
+        return dt
+
+    try:
+        dt, _, _ = _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq)
+        return dt
+    except DateParseError:
+        raise
+    except ValueError:
+        pass
+
+    try:
+        dt = du_parse(date_string, default=_DEFAULT_DATETIME,
+                      dayfirst=dayfirst, yearfirst=yearfirst, **kwargs)
+    except TypeError:
+        # following may be raised from dateutil
+        # TypeError: 'NoneType' object is not iterable
+        raise ValueError('Given date string not likely a datetime.')
+
+    return dt
+
+
+def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None):
+    """
+    Try hard to parse datetime string, leveraging dateutil plus some extra
+    goodies like quarter recognition.
+
+    Parameters
+    ----------
+    arg : compat.string_types
+    freq : str or DateOffset, default None
+        Helps with interpreting time string if supplied
+    dayfirst : bool, default None
+        If None uses default from print_config
+    yearfirst : bool, default None
+        If None uses default from print_config
+
+    Returns
+    -------
+    datetime, datetime/dateutil.parser._result, str
+    """
+    if not isinstance(arg, string_types):
+        return arg
+
+    if getattr(freq, "_typ", None) == "dateoffset":
+        freq = freq.rule_code
+
+    if dayfirst is None:
+        from pandas.core.config import get_option
+        dayfirst = get_option("display.date_dayfirst")
+    if yearfirst is None:
+        from pandas.core.config import get_option
+        yearfirst = get_option("display.date_yearfirst")
+
+    res = parse_datetime_string_with_reso(arg, freq=freq,
+                                          dayfirst=dayfirst,
+                                          yearfirst=yearfirst)
+    if res[0] is NAT_SENTINEL:
+        from pandas._libs.tslib import NaT
+        res = (NaT,) + res[1:]
+    return res
+
+
+def parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False,
+                                    yearfirst=False, **kwargs):
+    """Parse a datetime string and infer its resolution.
+
+    Returns
+    -------
+    datetime, datetime, str (resolution)
+    """
+
+    cdef:
+        object parsed, reso
+
+    if not _does_string_look_like_datetime(date_string):
+        raise ValueError('Given date string not likely a datetime.')
+
+    try:
+        return _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq)
+    except DateParseError:
+        raise
+    except ValueError:
+        pass
+
+    try:
+        parsed, reso = dateutil_parse(date_string, _DEFAULT_DATETIME,
+                                      dayfirst=dayfirst, yearfirst=yearfirst)
+    except Exception as e:
+        # TODO: allow raise of errors within instead
+        raise DateParseError(e)
+    if parsed is None:
+        raise DateParseError("Could not parse %s" % date_string)
+    return parsed, parsed, reso
+
+
+cpdef bint _does_string_look_like_datetime(object date_string):
+    if date_string.startswith('0'):
+        # Strings starting with 0 are more consistent with a
+        # date-like string than a number
+        return True
+
+    try:
+        if float(date_string) < 1000:
+            return False
+    except ValueError:
+        pass
+
+    if date_string in _not_datelike_strings:
+        return False
+
+    return True
+
+
+cdef inline object _parse_dateabbr_string(object date_string, object default,
+                                          object freq):
+    cdef:
+        object ret
+        int year, quarter = -1, month, mnum, date_len
+
+    # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1
+    assert isinstance(date_string, string_types)
+
+    # len(date_string) == 0
+    # should be NaT???
+ + if date_string in _nat_strings: + return NAT_SENTINEL, NAT_SENTINEL, '' + + date_string = date_string.upper() + date_len = len(date_string) + + if date_len == 4: + # parse year only like 2000 + try: + ret = default.replace(year=int(date_string)) + return ret, ret, 'year' + except ValueError: + pass + + try: + if 4 <= date_len <= 7: + i = date_string.index('Q', 1, 6) + if i == 1: + quarter = int(date_string[0]) + if date_len == 4 or (date_len == 5 + and date_string[i + 1] == '-'): + # r'(\d)Q-?(\d\d)') + year = 2000 + int(date_string[-2:]) + elif date_len == 6 or (date_len == 7 + and date_string[i + 1] == '-'): + # r'(\d)Q-?(\d\d\d\d)') + year = int(date_string[-4:]) + else: + raise ValueError + elif i == 2 or i == 3: + # r'(\d\d)-?Q(\d)' + if date_len == 4 or (date_len == 5 + and date_string[i - 1] == '-'): + quarter = int(date_string[-1]) + year = 2000 + int(date_string[:2]) + else: + raise ValueError + elif i == 4 or i == 5: + if date_len == 6 or (date_len == 7 + and date_string[i - 1] == '-'): + # r'(\d\d\d\d)-?Q(\d)' + quarter = int(date_string[-1]) + year = int(date_string[:4]) + else: + raise ValueError + + if not (1 <= quarter <= 4): + msg = ('Incorrect quarterly string is given, quarter must be ' + 'between 1 and 4: {0}') + raise DateParseError(msg.format(date_string)) + + if freq is not None: + # hack attack, #1228 + try: + mnum = _MONTH_NUMBERS[_get_rule_month(freq)] + 1 + except (KeyError, ValueError): + msg = ('Unable to retrieve month information from given ' + 'freq: {0}').format(freq) + raise DateParseError(msg) + + month = (mnum + (quarter - 1) * 3) % 12 + 1 + if month > mnum: + year -= 1 + else: + month = (quarter - 1) * 3 + 1 + + ret = default.replace(year=year, month=month) + return ret, ret, 'quarter' + + except DateParseError: + raise + except ValueError: + pass + + if date_len == 6 and (freq == 'M' or + getattr(freq, 'rule_code', None) == 'M'): + year = int(date_string[:4]) + month = int(date_string[4:6]) + try: + ret = default.replace(year=year, month=month) + return ret, ret, 'month' + except ValueError: + pass + + for pat in ['%Y-%m', '%m-%Y', '%b %Y', '%b-%Y']: + try: + ret = datetime.strptime(date_string, pat) + return ret, ret, 'month' + except ValueError: + pass + + raise ValueError('Unable to parse {0}'.format(date_string)) + + +def dateutil_parse(object timestr, object default, ignoretz=False, + tzinfos=None, **kwargs): + """ lifted from dateutil to get resolution""" + + cdef: + object fobj, res, attr, ret, tzdata + object reso = None + dict repl = {} + + fobj = StringIO(str(timestr)) + res = DEFAULTPARSER._parse(fobj, **kwargs) + + # dateutil 2.2 compat + if isinstance(res, tuple): # PyTuple_Check + res, _ = res + + if res is None: + msg = "Unknown datetime string format, unable to parse: {0}" + raise ValueError(msg.format(timestr)) + + for attr in ["year", "month", "day", "hour", + "minute", "second", "microsecond"]: + value = getattr(res, attr) + if value is not None: + repl[attr] = value + reso = attr + + if reso is None: + msg = "Unable to parse datetime string: {0}" + raise ValueError(msg.format(timestr)) + + if reso == 'microsecond': + if repl['microsecond'] == 0: + reso = 'second' + elif repl['microsecond'] % 1000 == 0: + reso = 'millisecond' + + ret = default.replace(**repl) + if res.weekday is not None and not res.day: + ret = ret + relativedelta.relativedelta(weekday=res.weekday) + if not ignoretz: + if callable(tzinfos) or tzinfos and res.tzname in tzinfos: + if callable(tzinfos): + tzdata = tzinfos(res.tzname, res.tzoffset) + else: + tzdata = 
tzinfos.get(res.tzname) + if isinstance(tzdata, datetime.tzinfo): + tzinfo = tzdata + elif isinstance(tzdata, string_types): + tzinfo = _dateutil_tzstr(tzdata) + elif isinstance(tzdata, int): + tzinfo = tzoffset(res.tzname, tzdata) + else: + raise ValueError("offset must be tzinfo subclass, " + "tz string, or int offset") + ret = ret.replace(tzinfo=tzinfo) + elif res.tzname and res.tzname in time.tzname: + ret = ret.replace(tzinfo=_dateutil_tzlocal()) + elif res.tzoffset == 0: + ret = ret.replace(tzinfo=_dateutil_tzutc()) + elif res.tzoffset: + ret = ret.replace(tzinfo=tzoffset(res.tzname, res.tzoffset)) + return ret, reso + + +cpdef object _get_rule_month(object source, object default='DEC'): + """ + Return starting month of given freq, default is December. + + Example + ------- + >>> _get_rule_month('D') + 'DEC' + + >>> _get_rule_month('A-JAN') + 'JAN' + """ + if hasattr(source, 'freqstr'): + source = source.freqstr + source = source.upper() + if '-' not in source: + return default + else: + return source.split('-')[1] + + +#---------------------------------------------------------------------- +# Parsing for type-inference + + +def try_parse_dates(ndarray[object] values, parser=None, + dayfirst=False, default=None): + cdef: + Py_ssize_t i, n + ndarray[object] result + + n = len(values) + result = np.empty(n, dtype='O') + + if parser is None: + if default is None: # GH2618 + date = datetime.now() + default = datetime(date.year, date.month, 1) + + parse_date = lambda x: du_parse(x, dayfirst=dayfirst, default=default) + + # EAFP here + try: + for i from 0 <= i < n: + if values[i] == '': + result[i] = np.nan + else: + result[i] = parse_date(values[i]) + except Exception: + # failed + return values + else: + parse_date = parser + + try: + for i from 0 <= i < n: + if values[i] == '': + result[i] = np.nan + else: + result[i] = parse_date(values[i]) + except Exception: + # raise if passed parser and it failed + raise + + return result + + +def try_parse_date_and_time(ndarray[object] dates, ndarray[object] times, + date_parser=None, time_parser=None, + dayfirst=False, default=None): + cdef: + Py_ssize_t i, n + ndarray[object] result + + n = len(dates) + if len(times) != n: + raise ValueError('Length of dates and times must be equal') + result = np.empty(n, dtype='O') + + if date_parser is None: + if default is None: # GH2618 + date = datetime.now() + default = datetime(date.year, date.month, 1) + + parse_date = lambda x: du_parse(x, dayfirst=dayfirst, default=default) + + else: + parse_date = date_parser + + if time_parser is None: + parse_time = lambda x: du_parse(x) + + else: + parse_time = time_parser + + for i from 0 <= i < n: + d = parse_date(str(dates[i])) + t = parse_time(str(times[i])) + result[i] = datetime(d.year, d.month, d.day, + t.hour, t.minute, t.second) + + return result + + +def try_parse_year_month_day(ndarray[object] years, ndarray[object] months, + ndarray[object] days): + cdef: + Py_ssize_t i, n + ndarray[object] result + + n = len(years) + if len(months) != n or len(days) != n: + raise ValueError('Length of years/months/days must all be equal') + result = np.empty(n, dtype='O') + + for i from 0 <= i < n: + result[i] = datetime(int(years[i]), int(months[i]), int(days[i])) + + return result + + +def try_parse_datetime_components(ndarray[object] years, + ndarray[object] months, + ndarray[object] days, + ndarray[object] hours, + ndarray[object] minutes, + ndarray[object] seconds): + + cdef: + Py_ssize_t i, n + ndarray[object] result + int secs + double float_secs + double 
micros
+
+    n = len(years)
+    if (len(months) != n or len(days) != n or len(hours) != n or
+            len(minutes) != n or len(seconds) != n):
+        raise ValueError('Length of all datetime components must be equal')
+    result = np.empty(n, dtype='O')
+
+    for i from 0 <= i < n:
+        float_secs = float(seconds[i])
+        secs = int(float_secs)
+
+        micros = float_secs - secs
+        if micros > 0:
+            micros = micros * 1000000
+
+        result[i] = datetime(int(years[i]), int(months[i]), int(days[i]),
+                             int(hours[i]), int(minutes[i]), secs,
+                             int(micros))
+
+    return result
+
+
+#----------------------------------------------------------------------
+# Miscellaneous
+
+_DATEUTIL_LEXER_SPLIT = None
+try:
+    # Since these are private methods from dateutil, it is safely imported
+    # here so in case this interface changes, pandas will just fallback
+    # to not using the functionality
+    from dateutil.parser import _timelex
+
+    if hasattr(_timelex, 'split'):
+        def _lexer_split_from_str(dt_str):
+            # The StringIO(str(_)) is for dateutil 2.2 compatibility
+            return _timelex.split(StringIO(str(dt_str)))
+
+        _DATEUTIL_LEXER_SPLIT = _lexer_split_from_str
+except (ImportError, AttributeError):
+    pass
+
+
+def _format_is_iso(f):
+    """
+    Does format match the iso8601 set that can be handled by the C parser?
+    Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different
+    but must be consistent. Leading 0s in dates and times are optional.
+    """
+    iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S.%f'.format
+    excluded_formats = ['%Y%m%d', '%Y%m', '%Y']
+
+    for date_sep in [' ', '/', '\\', '-', '.', '']:
+        for time_sep in [' ', 'T']:
+            if (iso_template(date_sep=date_sep,
+                             time_sep=time_sep
+                             ).startswith(f) and f not in excluded_formats):
+                return True
+    return False
+
+
+def _guess_datetime_format(dt_str, dayfirst=False, dt_str_parse=du_parse,
+                           dt_str_split=_DATEUTIL_LEXER_SPLIT):
+    """
+    Guess the datetime format of a given datetime string.
+
+    Parameters
+    ----------
+    dt_str : string, datetime string to guess the format of
+    dayfirst : boolean, default False
+        If True parses dates with the day first, eg 20/01/2005
+        Warning: dayfirst=True is not strict, but will prefer to parse
+        with day first (this is a known bug).
+    dt_str_parse : function, defaults to `du_parse` (dateutil)
+        This function should take in a datetime string and return
+        a `datetime.datetime` guess that the datetime string represents
+    dt_str_split : function, defaults to `_DATEUTIL_LEXER_SPLIT` (dateutil)
+        This function should take in a datetime string and return
+        a list of strings, the guess of the various specific parts
+        e.g.
'2011/12/30' -> ['2011', '/', '12', '/', '30']
+
+    Returns
+    -------
+    ret : datetime format string (for `strftime` or `strptime`)
+    """
+    if dt_str_parse is None or dt_str_split is None:
+        return None
+
+    if not isinstance(dt_str, string_types):
+        return None
+
+    day_attribute_and_format = (('day',), '%d', 2)
+
+    # attr name, format, padding (if any)
+    datetime_attrs_to_format = [
+        (('year', 'month', 'day'), '%Y%m%d', 0),
+        (('year',), '%Y', 0),
+        (('month',), '%B', 0),
+        (('month',), '%b', 0),
+        (('month',), '%m', 2),
+        day_attribute_and_format,
+        (('hour',), '%H', 2),
+        (('minute',), '%M', 2),
+        (('second',), '%S', 2),
+        (('microsecond',), '%f', 6),
+        (('second', 'microsecond'), '%S.%f', 0),
+    ]
+
+    if dayfirst:
+        datetime_attrs_to_format.remove(day_attribute_and_format)
+        datetime_attrs_to_format.insert(0, day_attribute_and_format)
+
+    try:
+        parsed_datetime = dt_str_parse(dt_str, dayfirst=dayfirst)
+    except:
+        # In case the datetime can't be parsed, its format cannot be guessed
+        return None
+
+    if parsed_datetime is None:
+        return None
+
+    try:
+        tokens = dt_str_split(dt_str)
+    except:
+        # In case the datetime string can't be split, its format cannot
+        # be guessed
+        return None
+
+    format_guess = [None] * len(tokens)
+    found_attrs = set()
+
+    for attrs, attr_format, padding in datetime_attrs_to_format:
+        # If a given attribute has been placed in the format string, skip
+        # over other formats for that same underlying attribute (i.e., month
+        # can be represented in multiple different ways)
+        if set(attrs) & found_attrs:
+            continue
+
+        if all(getattr(parsed_datetime, attr) is not None for attr in attrs):
+            for i, token_format in enumerate(format_guess):
+                token_filled = tokens[i].zfill(padding)
+                if (token_format is None and
+                        token_filled == parsed_datetime.strftime(attr_format)):
+                    format_guess[i] = attr_format
+                    tokens[i] = token_filled
+                    found_attrs.update(attrs)
+                    break
+
+    # Only consider it a valid guess if we have a year, month and day
+    if len(set(['year', 'month', 'day']) & found_attrs) != 3:
+        return None
+
+    output_format = []
+    for i, guess in enumerate(format_guess):
+        if guess is not None:
+            # Either fill in the format placeholder (like %Y)
+            output_format.append(guess)
+        else:
+            # Or just the token separator (i.e., the dashes in "01-01-2013")
+            try:
+                # If the token is numeric, then we likely didn't parse it
+                # properly, so our guess is wrong
+                float(tokens[i])
+                return None
+            except ValueError:
+                pass
+
+            output_format.append(tokens[i])
+
+    guessed_format = ''.join(output_format)
+
+    # rebuild string, capturing any inferred padding
+    dt_str = ''.join(tokens)
+    if parsed_datetime.strftime(guessed_format) == dt_str:
+        return guessed_format
+    else:
+        return None
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index be26720adb0bda..dba616c2d15e62 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -7,6 +7,7 @@
     algos as libalgos, join as libjoin, Timestamp, Timedelta,
 )
 from pandas._libs.lib import is_datetime_array
+from pandas._libs.tslibs import parsing

 from pandas.compat import range, u
 from pandas.compat.numpy import function as nv
@@ -1037,7 +1038,7 @@ def to_datetime(self, dayfirst=False):
         if self.inferred_type == 'string':
             from dateutil.parser import parse
             parser = lambda x: parse(x, dayfirst=dayfirst)
-            parsed = lib.try_parse_dates(self.values, parser=parser)
+            parsed = parsing.try_parse_dates(self.values, parser=parser)
             return DatetimeIndex(parsed)
         else:
             return DatetimeIndex(self.values)
diff --git
a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index bf89509fd17467..97ac8445faf4c2 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -2,9 +2,14 @@ import numpy as np from collections import MutableMapping -from pandas._libs import lib, tslib +from pandas._libs import tslib from pandas._libs.tslibs.strptime import array_strptime from pandas._libs.tslibs.timezones import get_timezone +from pandas._libs.tslibs import parsing +from pandas._libs.tslibs.parsing import ( # noqa + parse_time_string, + _format_is_iso, + _guess_datetime_format) from pandas.core.dtypes.common import ( _ensure_object, @@ -19,28 +24,10 @@ is_numeric_dtype) from pandas.core.dtypes.generic import ( ABCIndexClass, ABCSeries, - ABCDataFrame, ABCDateOffset) + ABCDataFrame) from pandas.core.dtypes.missing import notna from pandas.core import algorithms -import pandas.compat as compat - -_DATEUTIL_LEXER_SPLIT = None -try: - # Since these are private methods from dateutil, it is safely imported - # here so in case this interface changes, pandas will just fallback - # to not using the functionality - from dateutil.parser import _timelex - - if hasattr(_timelex, 'split'): - def _lexer_split_from_str(dt_str): - # The StringIO(str(_)) is for dateutil 2.2 compatibility - return _timelex.split(compat.StringIO(str(dt_str))) - - _DATEUTIL_LEXER_SPLIT = _lexer_split_from_str -except (ImportError, AttributeError): - pass - def _infer_tzinfo(start, end): def _infer(a, b): @@ -60,123 +47,6 @@ def _infer(a, b): return tz -def _guess_datetime_format(dt_str, dayfirst=False, - dt_str_parse=compat.parse_date, - dt_str_split=_DATEUTIL_LEXER_SPLIT): - """ - Guess the datetime format of a given datetime string. - - Parameters - ---------- - dt_str : string, datetime string to guess the format of - dayfirst : boolean, default False - If True parses dates with the day first, eg 20/01/2005 - Warning: dayfirst=True is not strict, but will prefer to parse - with day first (this is a known bug). - dt_str_parse : function, defaults to `compat.parse_date` (dateutil) - This function should take in a datetime string and return - a `datetime.datetime` guess that the datetime string represents - dt_str_split : function, defaults to `_DATEUTIL_LEXER_SPLIT` (dateutil) - This function should take in a datetime string and return - a list of strings, the guess of the various specific parts - e.g. 
'2011/12/30' -> ['2011', '/', '12', '/', '30'] - - Returns - ------- - ret : datetime format string (for `strftime` or `strptime`) - """ - if dt_str_parse is None or dt_str_split is None: - return None - - if not isinstance(dt_str, compat.string_types): - return None - - day_attribute_and_format = (('day',), '%d', 2) - - # attr name, format, padding (if any) - datetime_attrs_to_format = [ - (('year', 'month', 'day'), '%Y%m%d', 0), - (('year',), '%Y', 0), - (('month',), '%B', 0), - (('month',), '%b', 0), - (('month',), '%m', 2), - day_attribute_and_format, - (('hour',), '%H', 2), - (('minute',), '%M', 2), - (('second',), '%S', 2), - (('microsecond',), '%f', 6), - (('second', 'microsecond'), '%S.%f', 0), - ] - - if dayfirst: - datetime_attrs_to_format.remove(day_attribute_and_format) - datetime_attrs_to_format.insert(0, day_attribute_and_format) - - try: - parsed_datetime = dt_str_parse(dt_str, dayfirst=dayfirst) - except: - # In case the datetime can't be parsed, its format cannot be guessed - return None - - if parsed_datetime is None: - return None - - try: - tokens = dt_str_split(dt_str) - except: - # In case the datetime string can't be split, its format cannot - # be guessed - return None - - format_guess = [None] * len(tokens) - found_attrs = set() - - for attrs, attr_format, padding in datetime_attrs_to_format: - # If a given attribute has been placed in the format string, skip - # over other formats for that same underlying attribute (IE, month - # can be represented in multiple different ways) - if set(attrs) & found_attrs: - continue - - if all(getattr(parsed_datetime, attr) is not None for attr in attrs): - for i, token_format in enumerate(format_guess): - token_filled = tokens[i].zfill(padding) - if (token_format is None and - token_filled == parsed_datetime.strftime(attr_format)): - format_guess[i] = attr_format - tokens[i] = token_filled - found_attrs.update(attrs) - break - - # Only consider it a valid guess if we have a year, month and day - if len(set(['year', 'month', 'day']) & found_attrs) != 3: - return None - - output_format = [] - for i, guess in enumerate(format_guess): - if guess is not None: - # Either fill in the format placeholder (like %Y) - output_format.append(guess) - else: - # Or just the token separate (IE, the dashes in "01-01-2013") - try: - # If the token is numeric, then we likely didn't parse it - # properly, so our guess is wrong - float(tokens[i]) - return None - except ValueError: - pass - - output_format.append(tokens[i]) - - guessed_format = ''.join(output_format) - - # rebuild string, capturing any inferred padding - dt_str = ''.join(tokens) - if parsed_datetime.strftime(guessed_format) == dt_str: - return guessed_format - - def _guess_datetime_format_for_array(arr, **kwargs): # Try to guess the format based on the first non-NaN element non_nan_elements = notna(arr).nonzero()[0] @@ -655,9 +525,9 @@ def _attempt_YYYYMMDD(arg, errors): def calc(carg): # calculate the actual result carg = carg.astype(object) - parsed = lib.try_parse_year_month_day(carg / 10000, - carg / 100 % 100, - carg % 100) + parsed = parsing.try_parse_year_month_day(carg / 10000, + carg / 100 % 100, + carg % 100) return tslib.array_to_datetime(parsed, errors=errors) def calc_with_mask(carg, mask): @@ -691,60 +561,6 @@ def calc_with_mask(carg, mask): return None -def _format_is_iso(f): - """ - Does format match the iso8601 set that can be handled by the C parser? - Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different - but must be consistent. 
Leading 0s in dates and times are optional. - """ - iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S.%f'.format - excluded_formats = ['%Y%m%d', '%Y%m', '%Y'] - - for date_sep in [' ', '/', '\\', '-', '.', '']: - for time_sep in [' ', 'T']: - if (iso_template(date_sep=date_sep, - time_sep=time_sep - ).startswith(f) and f not in excluded_formats): - return True - return False - - -def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): - """ - Try hard to parse datetime string, leveraging dateutil plus some extra - goodies like quarter recognition. - - Parameters - ---------- - arg : compat.string_types - freq : str or DateOffset, default None - Helps with interpreting time string if supplied - dayfirst : bool, default None - If None uses default from print_config - yearfirst : bool, default None - If None uses default from print_config - - Returns - ------- - datetime, datetime/dateutil.parser._result, str - """ - from pandas.core.config import get_option - if not isinstance(arg, compat.string_types): - return arg - - if isinstance(freq, ABCDateOffset): - freq = freq.rule_code - - if dayfirst is None: - dayfirst = get_option("display.date_dayfirst") - if yearfirst is None: - yearfirst = get_option("display.date_yearfirst") - - return tslib.parse_datetime_string_with_reso(arg, freq=freq, - dayfirst=dayfirst, - yearfirst=yearfirst) - - DateParseError = tslib.DateParseError normalize_date = tslib.normalize_date diff --git a/pandas/io/date_converters.py b/pandas/io/date_converters.py index 080d6c3e273a3d..377373f8a01356 100644 --- a/pandas/io/date_converters.py +++ b/pandas/io/date_converters.py @@ -1,20 +1,20 @@ """This module is designed for community supported date conversion functions""" from pandas.compat import range, map import numpy as np -import pandas._libs.lib as lib +from pandas._libs.tslibs import parsing def parse_date_time(date_col, time_col): date_col = _maybe_cast(date_col) time_col = _maybe_cast(time_col) - return lib.try_parse_date_and_time(date_col, time_col) + return parsing.try_parse_date_and_time(date_col, time_col) def parse_date_fields(year_col, month_col, day_col): year_col = _maybe_cast(year_col) month_col = _maybe_cast(month_col) day_col = _maybe_cast(day_col) - return lib.try_parse_year_month_day(year_col, month_col, day_col) + return parsing.try_parse_year_month_day(year_col, month_col, day_col) def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, @@ -25,8 +25,9 @@ def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, hour_col = _maybe_cast(hour_col) minute_col = _maybe_cast(minute_col) second_col = _maybe_cast(second_col) - return lib.try_parse_datetime_components(year_col, month_col, day_col, - hour_col, minute_col, second_col) + return parsing.try_parse_datetime_components(year_col, month_col, day_col, + hour_col, minute_col, + second_col) def generic_parser(parse_func, *cols): diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index ed15d4295d6881..eeb79552477e12 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -42,7 +42,7 @@ import pandas._libs.lib as lib import pandas._libs.parsers as parsers - +from pandas._libs.tslibs import parsing # BOM character (byte order mark) # This exists at the beginning of a file to indicate endianness @@ -2981,7 +2981,7 @@ def converter(*date_cols): ) except: return tools.to_datetime( - lib.try_parse_dates(strs, dayfirst=dayfirst)) + parsing.try_parse_dates(strs, dayfirst=dayfirst)) else: try: result = tools.to_datetime( @@ -2992,9 +2992,9 @@ 
def converter(*date_cols): except Exception: try: return tools.to_datetime( - lib.try_parse_dates(_concat_date_cols(date_cols), - parser=date_parser, - dayfirst=dayfirst), + parsing.try_parse_dates(_concat_date_cols(date_cols), + parser=date_parser, + dayfirst=dayfirst), errors='ignore') except Exception: return generic_parser(date_parser, *date_cols) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index e0ccedb834adf9..bdfe6b5b09e45a 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -12,7 +12,8 @@ from distutils.version import LooseVersion import pandas as pd -from pandas._libs import tslib, lib +from pandas._libs import tslib +from pandas._libs.tslibs import parsing from pandas.core.tools import datetimes as tools from pandas.core.tools.datetimes import normalize_date from pandas.compat import lmap @@ -1063,7 +1064,7 @@ def test_does_not_convert_mixed_integer(self): bad_date_strings = ('-50000', '999', '123.1234', 'm', 'T') for bad_date_string in bad_date_strings: - assert not tslib._does_string_look_like_datetime(bad_date_string) + assert not parsing._does_string_look_like_datetime(bad_date_string) good_date_strings = ('2012-01-01', '01/01/2012', @@ -1073,7 +1074,7 @@ def test_does_not_convert_mixed_integer(self): '1-1', ) for good_date_string in good_date_strings: - assert tslib._does_string_look_like_datetime(good_date_string) + assert parsing._does_string_look_like_datetime(good_date_string) def test_parsers(self): @@ -1412,7 +1413,7 @@ class TestArrayToDatetime(object): def test_try_parse_dates(self): arr = np.array(['5/1/2000', '6/1/2000', '7/1/2000'], dtype=object) - result = lib.try_parse_dates(arr, dayfirst=True) + result = parsing.try_parse_dates(arr, dayfirst=True) expected = [parse(d, dayfirst=True) for d in arr] assert np.array_equal(result, expected) diff --git a/pandas/tests/io/parser/parse_dates.py b/pandas/tests/io/parser/parse_dates.py index e1ae1b577ea296..90103e7bf26b02 100644 --- a/pandas/tests/io/parser/parse_dates.py +++ b/pandas/tests/io/parser/parse_dates.py @@ -10,7 +10,7 @@ import pytest import numpy as np -import pandas._libs.lib as lib +from pandas._libs.tslibs import parsing from pandas._libs.lib import Timestamp import pandas as pd @@ -53,7 +53,8 @@ def test_multiple_date_col(self): """ def func(*date_cols): - return lib.try_parse_dates(parsers._concat_date_cols(date_cols)) + res = parsing.try_parse_dates(parsers._concat_date_cols(date_cols)) + return res df = self.read_csv(StringIO(data), header=None, date_parser=func, diff --git a/setup.py b/setup.py index 25a4924dad0bc8..d25ae4a5fb45ce 100755 --- a/setup.py +++ b/setup.py @@ -343,6 +343,7 @@ class CheckSDist(sdist_class): 'pandas/_libs/parsers.pyx', 'pandas/_libs/tslibs/timezones.pyx', 'pandas/_libs/tslibs/frequencies.pyx', + 'pandas/_libs/tslibs/parsing.pyx', 'pandas/io/sas/sas.pyx'] def initialize_options(self): @@ -498,6 +499,8 @@ def pxd(name): 'sources': ['pandas/_libs/src/datetime/np_datetime.c', 'pandas/_libs/src/datetime/np_datetime_strings.c', 'pandas/_libs/src/period_helper.c']}, + '_libs.tslibs.parsing': {'pyxfile': '_libs/tslibs/parsing', + 'pxdfiles': ['_libs/src/util']}, '_libs.tslibs.frequencies': {'pyxfile': '_libs/tslibs/frequencies', 'pxdfiles': ['_libs/src/util']}, '_libs.index': {'pyxfile': '_libs/index', From b5842bb7c8196666aa22d819652d1db7a5aeb582 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 26 Sep 2017 16:30:39 +0200 Subject: [PATCH 157/188] 
DOC: fix no autosummary for numerical index api pages (#17642) --- doc/sphinxext/numpydoc/numpydoc.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/sphinxext/numpydoc/numpydoc.py b/doc/sphinxext/numpydoc/numpydoc.py index 710c3cc9842c45..f06915997c6162 100755 --- a/doc/sphinxext/numpydoc/numpydoc.py +++ b/doc/sphinxext/numpydoc/numpydoc.py @@ -43,9 +43,10 @@ def mangle_docstrings(app, what, name, obj, options, lines, ) # PANDAS HACK (to remove the list of methods/attributes for Categorical) - if what == "class" and (name.endswith(".Categorical") or - name.endswith("CategoricalIndex") or - name.endswith("IntervalIndex")): + no_autosummary = [".Categorical", "CategoricalIndex", "IntervalIndex", + "RangeIndex", "Int64Index", "UInt64Index", + "Float64Index"] + if what == "class" and any(name.endswith(n) for n in no_autosummary): cfg['class_members_list'] = False if what == 'module': From 66f4cc11725fd244c029058e9ac4743c4f36e2b4 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 26 Sep 2017 12:25:07 -0400 Subject: [PATCH 158/188] BUG: remove tab completion for deprecated functions (#17683) closes #17674 --- pandas/core/accessor.py | 3 ++- pandas/core/categorical.py | 8 +------- pandas/core/frame.py | 1 + pandas/core/generic.py | 2 ++ pandas/core/series.py | 2 ++ pandas/tests/frame/test_api.py | 11 +++++++++++ pandas/tests/series/test_api.py | 11 +++++++++++ 7 files changed, 30 insertions(+), 8 deletions(-) diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index c8476841bfce47..7a2da9655cc4a0 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -10,10 +10,11 @@ class DirNamesMixin(object): _accessors = frozenset([]) + _deprecations = frozenset([]) def _dir_deletions(self): """ delete unwanted __dir__ for this object """ - return self._accessors + return self._accessors | self._deprecations def _dir_additions(self): """ add addtional __dir__ for this object """ diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 8b055e9ae59c3a..011aa746322965 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -231,6 +231,7 @@ class Categorical(PandasObject): # ops, which raise __array_priority__ = 1000 _dtype = CategoricalDtype() + _deprecations = frozenset(['labels']) _typ = 'categorical' def __init__(self, values, categories=None, ordered=None, dtype=None, @@ -412,13 +413,6 @@ def dtype(self): """The :ref:`~pandas.api.types.CategoricalDtype` for this instance""" return self._dtype - def __dir__(self): - # Avoid IPython warnings for deprecated properties - # https://github.com/pandas-dev/pandas/issues/16409 - rv = set(dir(type(self))) - rv.discard("labels") - return sorted(rv) - @property def _constructor(self): return Categorical diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 912dbdb9de7059..579d9f10d5875b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -299,6 +299,7 @@ def _constructor(self): return DataFrame _constructor_sliced = Series + _deprecations = NDFrame._deprecations | frozenset(['sortlevel']) @property def _constructor_expanddim(self): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a7be145f210833..2fb0e348c01c0b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -112,6 +112,8 @@ class NDFrame(PandasObject, SelectionMixin): '__array_interface__'] _internal_names_set = set(_internal_names) _accessors = frozenset([]) + _deprecations = frozenset(['as_blocks', 'blocks', + 'consolidate', 'convert_objects']) _metadata = 
[] is_copy = None diff --git a/pandas/core/series.py b/pandas/core/series.py index db8ee2529ef577..89add1ef4c5907 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -145,6 +145,8 @@ class Series(base.IndexOpsMixin, generic.NDFrame): """ _metadata = ['name'] _accessors = frozenset(['dt', 'cat', 'str']) + _deprecations = generic.NDFrame._deprecations | frozenset( + ['sortlevel', 'reshape']) _allow_index_ops = True def __init__(self, data=None, index=None, dtype=None, name=None, diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 230a5806ccb2e6..5ea8230ced41b9 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -438,3 +438,14 @@ def _check_f(base, f): # rename f = lambda x: x.rename({1: 'foo'}, inplace=True) _check_f(d.copy(), f) + + def test_tab_complete_warning(self, ip): + # https://github.com/pandas-dev/pandas/issues/16409 + pytest.importorskip('IPython', minversion="6.0.0") + from IPython.core.completer import provisionalcompleter + + code = "import pandas as pd; df = pd.DataFrame()" + ip.run_code(code) + with tm.assert_produces_warning(None): + with provisionalcompleter('ignore'): + list(ip.Completer.completions('df.', 1)) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index d0805e2bb54d25..56b8a90ec0c9f1 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -407,3 +407,14 @@ def test_empty_method(self): for full_series in [pd.Series([1]), pd.Series(index=[1])]: assert not full_series.empty + + def test_tab_complete_warning(self, ip): + # https://github.com/pandas-dev/pandas/issues/16409 + pytest.importorskip('IPython', minversion="6.0.0") + from IPython.core.completer import provisionalcompleter + + code = "import pandas as pd; s = pd.Series()" + ip.run_code(code) + with tm.assert_produces_warning(None): + with provisionalcompleter('ignore'): + list(ip.Completer.completions('s.', 1)) From d3be81ad595c5338781bed9963c729a9702e6611 Mon Sep 17 00:00:00 2001 From: kernc Date: Tue, 26 Sep 2017 21:01:39 +0200 Subject: [PATCH 159/188] BUG: Fix/test SparseSeries/SparseDataFrame stack/unstack (#16616) --- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/core/categorical.py | 10 ++- pandas/core/internals.py | 97 ++++++++++++++++++++++++++++- pandas/core/reshape/reshape.py | 49 +++++---------- pandas/tests/sparse/test_reshape.py | 38 +++++++++++ 5 files changed, 159 insertions(+), 37 deletions(-) create mode 100644 pandas/tests/sparse/test_reshape.py diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index b6bd86bd79a1f2..06f19782682b03 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -607,7 +607,7 @@ Sparse - Bug in ``SparseSeries`` raises ``AttributeError`` when a dictionary is passed in as data (:issue:`16905`) - Bug in :func:`SparseDataFrame.fillna` not filling all NaNs when frame was instantiated from SciPy sparse matrix (:issue:`16112`) - +- Bug in :func:`SparseSeries.unstack` and :func:`SparseDataFrame.stack` (:issue:`16614`, :issue:`15045`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 011aa746322965..d79937829cf3fb 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -125,10 +125,16 @@ def f(self, other): return f -def maybe_to_categorical(array): - """ coerce to a categorical if a series is given """ +def _maybe_to_categorical(array): + """ + Coerce to a categorical if a series is given. 
+ + Internal use ONLY. + """ if isinstance(array, (ABCSeries, ABCCategoricalIndex)): return array._values + elif isinstance(array, np.ndarray): + return Categorical(array) return array diff --git a/pandas/core/internals.py b/pandas/core/internals.py index e6f61a22e31373..9e348819ce5a3c 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -56,7 +56,7 @@ from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import maybe_convert_indices, length_of_indexer -from pandas.core.categorical import Categorical, maybe_to_categorical +from pandas.core.categorical import Categorical, _maybe_to_categorical from pandas.core.indexes.datetimes import DatetimeIndex from pandas.io.formats.printing import pprint_thing @@ -1484,6 +1484,35 @@ def equals(self, other): return False return array_equivalent(self.values, other.values) + def _unstack(self, unstacker_func, new_columns): + """Return a list of unstacked blocks of self + + Parameters + ---------- + unstacker_func : callable + Partially applied unstacker. + new_columns : Index + All columns of the unstacked BlockManager. + + Returns + ------- + blocks : list of Block + New blocks of unstacked values. + mask : array_like of bool + The mask of columns of `blocks` we should keep. + """ + unstacker = unstacker_func(self.values.T) + new_items = unstacker.get_new_columns() + new_placement = new_columns.get_indexer(new_items) + new_values, mask = unstacker.get_new_values() + + mask = mask.any(0) + new_values = new_values.T[mask] + new_placement = new_placement[mask] + + blocks = [make_block(new_values, placement=new_placement)] + return blocks, mask + def quantile(self, qs, interpolation='linear', axis=0, mgr=None): """ compute the quantiles of the @@ -1712,6 +1741,38 @@ def _slice(self, slicer): def _try_cast_result(self, result, dtype=None): return result + def _unstack(self, unstacker_func, new_columns): + """Return a list of unstacked blocks of self + + Parameters + ---------- + unstacker_func : callable + Partially applied unstacker. + new_columns : Index + All columns of the unstacked BlockManager. + + Returns + ------- + blocks : list of Block + New blocks of unstacked values. + mask : array_like of bool + The mask of columns of `blocks` we should keep. + """ + # NonConsolidatable blocks can have a single item only, so we return + # one block per item + unstacker = unstacker_func(self.values.T) + new_items = unstacker.get_new_columns() + new_placement = new_columns.get_indexer(new_items) + new_values, mask = unstacker.get_new_values() + + mask = mask.any(0) + new_values = new_values.T[mask] + new_placement = new_placement[mask] + + blocks = [self.make_block_same_class(vals, [place]) + for vals, place in zip(new_values, new_placement)] + return blocks, mask + class NumericBlock(Block): __slots__ = () @@ -2227,7 +2288,7 @@ class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock): def __init__(self, values, placement, fastpath=False, **kwargs): # coerce to categorical if we can - super(CategoricalBlock, self).__init__(maybe_to_categorical(values), + super(CategoricalBlock, self).__init__(_maybe_to_categorical(values), fastpath=True, placement=placement, **kwargs) @@ -4192,6 +4253,38 @@ def canonicalize(block): return all(block.equals(oblock) for block, oblock in zip(self_blocks, other_blocks)) + def unstack(self, unstacker_func): + """Return a blockmanager with all blocks unstacked. + + Parameters + ---------- + unstacker_func : callable + A (partially-applied) ``pd.core.reshape._Unstacker`` class. 
+ + Returns + ------- + unstacked : BlockManager + """ + dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items) + new_columns = dummy.get_new_columns() + new_index = dummy.get_new_index() + new_blocks = [] + columns_mask = [] + + for blk in self.blocks: + blocks, mask = blk._unstack( + partial(unstacker_func, + value_columns=self.items[blk.mgr_locs.indexer]), + new_columns) + + new_blocks.extend(blocks) + columns_mask.extend(mask) + + new_columns = new_columns[columns_mask] + + bm = BlockManager(new_blocks, [new_columns, new_index]) + return bm + class SingleBlockManager(BlockManager): """ manage a single block with """ diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index bff09be6149f32..d280c4f3f73d7b 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -2,6 +2,7 @@ # pylint: disable=W0703,W0622,W0613,W0201 from pandas.compat import range, text_type, zip from pandas import compat +from functools import partial import itertools import re @@ -10,7 +11,7 @@ from pandas.core.dtypes.common import ( _ensure_platform_int, is_list_like, is_bool_dtype, - needs_i8_conversion) + needs_i8_conversion, is_sparse) from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.missing import notna import pandas.core.dtypes.concat as _concat @@ -75,10 +76,15 @@ def __init__(self, values, index, level=-1, value_columns=None, fill_value=None): self.is_categorical = None + self.is_sparse = is_sparse(values) if values.ndim == 1: if isinstance(values, Categorical): self.is_categorical = values values = np.array(values) + elif self.is_sparse: + # XXX: Makes SparseArray *dense*, but it's supposedly + # a single column at a time, so it's "doable" + values = values.values values = values[:, np.newaxis] self.values = values self.value_columns = value_columns @@ -177,7 +183,8 @@ def get_result(self): ordered=ordered) for i in range(values.shape[-1])] - return DataFrame(values, index=index, columns=columns) + klass = SparseDataFrame if self.is_sparse else DataFrame + return klass(values, index=index, columns=columns) def get_new_values(self): values = self.values @@ -469,36 +476,12 @@ def unstack(obj, level, fill_value=None): def _unstack_frame(obj, level, fill_value=None): - from pandas.core.internals import BlockManager, make_block - if obj._is_mixed_type: - unstacker = _Unstacker(np.empty(obj.shape, dtype=bool), # dummy - obj.index, level=level, - value_columns=obj.columns) - new_columns = unstacker.get_new_columns() - new_index = unstacker.get_new_index() - new_axes = [new_columns, new_index] - - new_blocks = [] - mask_blocks = [] - for blk in obj._data.blocks: - blk_items = obj._data.items[blk.mgr_locs.indexer] - bunstacker = _Unstacker(blk.values.T, obj.index, level=level, - value_columns=blk_items, - fill_value=fill_value) - new_items = bunstacker.get_new_columns() - new_placement = new_columns.get_indexer(new_items) - new_values, mask = bunstacker.get_new_values() - - mblk = make_block(mask.T, placement=new_placement) - mask_blocks.append(mblk) - - newb = make_block(new_values.T, placement=new_placement) - new_blocks.append(newb) - - result = DataFrame(BlockManager(new_blocks, new_axes)) - mask_frame = DataFrame(BlockManager(mask_blocks, new_axes)) - return result.loc[:, mask_frame.sum(0) > 0] + unstacker = partial(_Unstacker, index=obj.index, + level=level, fill_value=fill_value) + blocks = obj._data.unstack(unstacker) + klass = type(obj) + return klass(blocks) else: unstacker = _Unstacker(obj.values, obj.index, level=level, 
value_columns=obj.columns, @@ -559,7 +542,9 @@ def factorize(index): mask = notna(new_values) new_values = new_values[mask] new_index = new_index[mask] - return Series(new_values, index=new_index) + + klass = type(frame)._constructor_sliced + return klass(new_values, index=new_index) def stack_multiple(frame, level, dropna=True): diff --git a/pandas/tests/sparse/test_reshape.py b/pandas/tests/sparse/test_reshape.py new file mode 100644 index 00000000000000..b492c47375bcf5 --- /dev/null +++ b/pandas/tests/sparse/test_reshape.py @@ -0,0 +1,38 @@ +import pytest +import numpy as np + +import pandas as pd +import pandas.util.testing as tm + + +@pytest.fixture +def sparse_df(): + return pd.SparseDataFrame({0: {0: 1}, 1: {1: 1}, 2: {2: 1}}) # eye + + +@pytest.fixture +def multi_index3(): + return pd.MultiIndex.from_tuples([(0, 0), (1, 1), (2, 2)]) + + +def test_sparse_frame_stack(sparse_df, multi_index3): + ss = sparse_df.stack() + expected = pd.SparseSeries(np.ones(3), index=multi_index3) + tm.assert_sp_series_equal(ss, expected) + + +def test_sparse_frame_unstack(sparse_df): + mi = pd.MultiIndex.from_tuples([(0, 0), (1, 0), (1, 2)]) + sparse_df.index = mi + arr = np.array([[1, np.nan, np.nan], + [np.nan, 1, np.nan], + [np.nan, np.nan, 1]]) + unstacked_df = pd.DataFrame(arr, index=mi).unstack() + unstacked_sdf = sparse_df.unstack() + + tm.assert_numpy_array_equal(unstacked_df.values, unstacked_sdf.values) + + +def test_sparse_series_unstack(sparse_df, multi_index3): + frame = pd.SparseSeries(np.ones(3), index=multi_index3).unstack() + tm.assert_sp_frame_equal(frame, sparse_df) From 44747c8e91cee3599f96a4f154cb0323269b4ef1 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 27 Sep 2017 03:20:43 -0700 Subject: [PATCH 160/188] typo fix evalute_compare-->evaluate_compare (#17688) --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index dba616c2d15e62..c4e1398d0178fe 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3742,7 +3742,7 @@ def _evaluate_with_timedelta_like(self, other, op, opstr): def _evaluate_with_datetime_like(self, other, op, opstr): raise TypeError("can only perform ops with datetime like values") - def _evalute_compare(self, op): + def _evaluate_compare(self, op): raise base.AbstractMethodError(self) @classmethod From f9d88cd6b3543bbb678378fc4fe736f13497d21e Mon Sep 17 00:00:00 2001 From: Lucas Kushner Date: Wed, 27 Sep 2017 10:07:47 -0500 Subject: [PATCH 161/188] Deprecating Series.argmin and Series.argmax (#16830) (#16955) * Deprecating Series.argmin and Series.argmax (#16830) Added statements about correcting behavior in future commit Add reference to github ticket Fixing placement of github comment Made test code more explicit Fixing unrelated tests that are also throwing warnings Updating whatsnew to give more detail about deprecation Fixing whatsnew and breaking out tests to catch warnings Additional comments and more concise whatsnew Updating deprecate decorator to support custom message DOC: Update docstrings, depr message, and whatsnew * Added debug prints * Try splitting the filters * Reword whatsnew * Change sparse series test * Skip on py2 * Change to idxmin * Remove py2 skips * Catch more warnings * Final switch to idxmax * Consistent tests, refactor to_string * Fixed tests --- doc/source/whatsnew/v0.21.0.txt | 22 ++++++++++ pandas/core/series.py | 29 +++++++++---- pandas/io/formats/format.py | 4 +- pandas/tests/series/test_analytics.py 
| 60 ++++++++++++++++++++------- pandas/tests/series/test_api.py | 2 +- pandas/tests/series/test_operators.py | 28 ++++++------- pandas/tests/sparse/test_series.py | 16 ++++++- pandas/util/_decorators.py | 8 ++-- 8 files changed, 124 insertions(+), 45 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 06f19782682b03..ae55b4a0aa4691 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -487,11 +487,33 @@ Other API Changes Deprecations ~~~~~~~~~~~~ + - :func:`read_excel()` has deprecated ``sheetname`` in favor of ``sheet_name`` for consistency with ``.to_excel()`` (:issue:`10559`). - ``pd.options.html.border`` has been deprecated in favor of ``pd.options.display.html.border`` (:issue:`15793`). - :func:`SeriesGroupBy.nth` has deprecated ``True`` in favor of ``'all'`` for its kwarg ``dropna`` (:issue:`11038`). - :func:`DataFrame.as_blocks` is deprecated, as this is exposing the internal implementation (:issue:`17302`) +.. _whatsnew_0210.deprecations.argmin_min + +Series.argmax and Series.argmin +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- The behavior of :func:`Series.argmax` has been deprecated in favor of :func:`Series.idxmax` (:issue:`16830`) +- The behavior of :func:`Series.argmin` has been deprecated in favor of :func:`Series.idxmin` (:issue:`16830`) + +For compatibility with NumPy arrays, ``pd.Series`` implements ``argmax`` and +``argmin``. Since pandas 0.13.0, ``argmax`` has been an alias for +:meth:`pandas.Series.idxmax`, and ``argmin`` has been an alias for +:meth:`pandas.Series.idxmin`. They return the *label* of the maximum or minimum, +rather than the *position*. + +We've deprecated the current behavior of ``Series.argmax`` and +``Series.argmin``. Using either of these will emit a ``FutureWarning``. Use +:meth:`Series.idxmax` if you want the label of the maximum. Use +``Series.values.argmax()`` if you want the position of the maximum. Likewise for +the minimum. In a future release ``Series.argmax`` and ``Series.argmin`` will +return the position of the maximum or minimum. + .. _whatsnew_0210.prior_deprecations: Removal of prior version deprecations/changes diff --git a/pandas/core/series.py b/pandas/core/series.py index 89add1ef4c5907..a05324142b223a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -69,7 +69,8 @@ import pandas.core.common as com import pandas.core.nanops as nanops import pandas.io.formats.format as fmt -from pandas.util._decorators import Appender, deprecate_kwarg, Substitution +from pandas.util._decorators import ( + Appender, deprecate, deprecate_kwarg, Substitution) from pandas.util._validators import validate_bool_kwarg from pandas._libs import index as libindex, tslib as libts, lib, iNaT @@ -1274,7 +1275,7 @@ def duplicated(self, keep='first'): def idxmin(self, axis=None, skipna=True, *args, **kwargs): """ - Index of first occurrence of minimum of values. + Index *label* of the first occurrence of minimum of values. Parameters ---------- @@ -1287,7 +1288,9 @@ def idxmin(self, axis=None, skipna=True, *args, **kwargs): Notes ----- - This method is the Series version of ``ndarray.argmin``. + This method is the Series version of ``ndarray.argmin``. This method + returns the label of the minimum, while ``ndarray.argmin`` returns + the position. To get the position, use ``series.values.argmin()``. 
 
         See Also
         --------
@@ -1302,7 +1305,7 @@ def idxmin(self, axis=None, skipna=True, *args, **kwargs):
 
     def idxmax(self, axis=None, skipna=True, *args, **kwargs):
         """
-        Index of first occurrence of maximum of values.
+        Index *label* of the first occurrence of maximum of values.
 
         Parameters
         ----------
@@ -1315,7 +1318,9 @@ def idxmax(self, axis=None, skipna=True, *args, **kwargs):
 
         Notes
         -----
-        This method is the Series version of ``ndarray.argmax``.
+        This method is the Series version of ``ndarray.argmax``. This method
+        returns the label of the maximum, while ``ndarray.argmax`` returns
+        the position. To get the position, use ``series.values.argmax()``.
 
         See Also
         --------
@@ -1329,8 +1334,18 @@ def idxmax(self, axis=None, skipna=True, *args, **kwargs):
         return self.index[i]
 
     # ndarray compat
-    argmin = idxmin
-    argmax = idxmax
+    argmin = deprecate('argmin', idxmin,
+                       msg="'argmin' is deprecated. Use 'idxmin' instead. "
+                           "The behavior of 'argmin' will be corrected to "
+                           "return the positional minimum in the future. "
+                           "Use 'series.values.argmin' to get the position of "
+                           "the minimum now.")
+    argmax = deprecate('argmax', idxmax,
+                       msg="'argmax' is deprecated. Use 'idxmax' instead. "
+                           "The behavior of 'argmax' will be corrected to "
+                           "return the positional maximum in the future. "
+                           "Use 'series.values.argmax' to get the position of "
+                           "the maximum now.")
 
     def round(self, decimals=0, *args, **kwargs):
         """
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index 547b9676717c99..386d9c3ffe30df 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -598,9 +598,7 @@ def to_string(self):
             text = self._join_multiline(*strcols)
         else:  # max_cols == 0. Try to fit frame to terminal
             text = self.adj.adjoin(1, *strcols).split('\n')
-            row_lens = Series(text).apply(len)
-            max_len_col_ix = np.argmax(row_lens)
-            max_len = row_lens[max_len_col_ix]
+            max_len = Series(text).str.len().max()
             headers = [ele[0] for ele in strcols]
             # Size of last col determines dot col size. See
             # `self._to_str_columns
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
index 914181dc941549..9f5e4f2ac4b6e6 100644
--- a/pandas/tests/series/test_analytics.py
+++ b/pandas/tests/series/test_analytics.py
@@ -1242,16 +1242,31 @@ def test_idxmin(self):
         result = s.idxmin()
         assert result == 1
 
-    def test_numpy_argmin(self):
-        # argmin is aliased to idxmin
-        data = np.random.randint(0, 11, size=10)
-        result = np.argmin(Series(data))
-        assert result == np.argmin(data)
+    def test_numpy_argmin_deprecated(self):
+        # See gh-16830
+        data = np.arange(1, 11)
+
+        s = Series(data, index=data)
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            # The deprecation of Series.argmin also causes a deprecation
+            # warning when calling np.argmin. This behavior is temporary
+            # until the implementation of Series.argmin is corrected. 
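+            # (np.argmin delegates to the Series.argmin method, which is
+            # why the warning is raised here as well)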
+            result = np.argmin(s)
+
+        assert result == 1
+
+        with tm.assert_produces_warning(FutureWarning):
+            # argmin is aliased to idxmin
+            result = s.argmin()
+
+        assert result == 1
 
         if not _np_version_under1p10:
-            msg = "the 'out' parameter is not supported"
-            tm.assert_raises_regex(ValueError, msg, np.argmin,
-                                   Series(data), out=data)
+            with tm.assert_produces_warning(FutureWarning,
+                                            check_stacklevel=False):
+                msg = "the 'out' parameter is not supported"
+                tm.assert_raises_regex(ValueError, msg, np.argmin,
+                                       s, out=data)
 
     def test_idxmax(self):
         # test idxmax
@@ -1297,17 +1312,30 @@ def test_idxmax(self):
         result = s.idxmin()
         assert result == 1.1
 
-    def test_numpy_argmax(self):
+    def test_numpy_argmax_deprecated(self):
+        # See gh-16830
+        data = np.arange(1, 11)
+
+        s = Series(data, index=data)
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            # The deprecation of Series.argmax also causes a deprecation
+            # warning when calling np.argmax. This behavior is temporary
+            # until the implementation of Series.argmax is corrected.
+            result = np.argmax(s)
+        assert result == 10
+
+        with tm.assert_produces_warning(FutureWarning):
+            # argmax is aliased to idxmax
+            result = s.argmax()
 
-        # argmax is aliased to idxmax
-        data = np.random.randint(0, 11, size=10)
-        result = np.argmax(Series(data))
-        assert result == np.argmax(data)
+        assert result == 10
 
         if not _np_version_under1p10:
-            msg = "the 'out' parameter is not supported"
-            tm.assert_raises_regex(ValueError, msg, np.argmax,
-                                   Series(data), out=data)
+            with tm.assert_produces_warning(FutureWarning,
+                                            check_stacklevel=False):
+                msg = "the 'out' parameter is not supported"
+                tm.assert_raises_regex(ValueError, msg, np.argmax,
+                                       s, out=data)
 
     def test_ptp(self):
         N = 1000
diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py
index 56b8a90ec0c9f1..6b950be15ca465 100644
--- a/pandas/tests/series/test_api.py
+++ b/pandas/tests/series/test_api.py
@@ -345,7 +345,7 @@ def test_ndarray_compat(self):
                          index=date_range('1/1/2000', periods=1000))
 
         def f(x):
-            return x[x.argmax()]
+            return x[x.idxmax()]
 
         result = tsdf.apply(f)
         expected = tsdf.max()
diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py
index 114a055de81953..c8cc80b1cf4b1b 100644
--- a/pandas/tests/series/test_operators.py
+++ b/pandas/tests/series/test_operators.py
@@ -1872,33 +1872,33 @@ def test_op_duplicate_index(self):
             ),
         ]
     )
-    def test_assert_argminmax_raises(self, test_input, error_type):
+    def test_assert_idxminmax_raises(self, test_input, error_type):
         """
         Cases where ``Series.argmax`` and related should raise an exception
         """
         with pytest.raises(error_type):
-            test_input.argmin()
+            test_input.idxmin()
         with pytest.raises(error_type):
-            test_input.argmin(skipna=False)
+            test_input.idxmin(skipna=False)
         with pytest.raises(error_type):
-            test_input.argmax()
+            test_input.idxmax()
         with pytest.raises(error_type):
-            test_input.argmax(skipna=False)
+            test_input.idxmax(skipna=False)
 
-    def test_argminmax_with_inf(self):
+    def test_idxminmax_with_inf(self):
         # For numeric data with NA and Inf (GH #13595)
         s = pd.Series([0, -np.inf, np.inf, np.nan])
 
-        assert s.argmin() == 1
-        assert np.isnan(s.argmin(skipna=False))
+        assert s.idxmin() == 1
+        assert np.isnan(s.idxmin(skipna=False))
 
-        assert s.argmax() == 2
-        assert np.isnan(s.argmax(skipna=False))
+        assert s.idxmax() == 2
+        assert np.isnan(s.idxmax(skipna=False))
 
         # Using old-style behavior that treats floating point nan, -inf, and
         # +inf as missing
        with 
pd.option_context('mode.use_inf_as_na', True): - assert s.argmin() == 0 - assert np.isnan(s.argmin(skipna=False)) - assert s.argmax() == 0 - np.isnan(s.argmax(skipna=False)) + assert s.idxmin() == 0 + assert np.isnan(s.idxmin(skipna=False)) + assert s.idxmax() == 0 + np.isnan(s.idxmax(skipna=False)) diff --git a/pandas/tests/sparse/test_series.py b/pandas/tests/sparse/test_series.py index b44314d4e733be..451f3695933470 100644 --- a/pandas/tests/sparse/test_series.py +++ b/pandas/tests/sparse/test_series.py @@ -1379,11 +1379,25 @@ def test_numpy_func_call(self): # numpy passes in 'axis=None' or `axis=-1' funcs = ['sum', 'cumsum', 'var', 'mean', 'prod', 'cumprod', 'std', 'argsort', - 'argmin', 'argmax', 'min', 'max'] + 'min', 'max'] for func in funcs: for series in ('bseries', 'zbseries'): getattr(np, func)(getattr(self, series)) + def test_deprecated_numpy_func_call(self): + # NOTE: These should be add to the 'test_numpy_func_call' test above + # once the behavior of argmin/argmax is corrected. + funcs = ['argmin', 'argmax'] + for func in funcs: + for series in ('bseries', 'zbseries'): + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + getattr(np, func)(getattr(self, series)) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + getattr(getattr(self, series), func)() + @pytest.mark.parametrize( 'datetime_type', (np.datetime64, diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index 3733e4311aa732..9e4e5515a292bc 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -7,7 +7,7 @@ def deprecate(name, alternative, alt_name=None, klass=None, - stacklevel=2): + stacklevel=2, msg=None): """ Return a new function that emits a deprecation warning on use. @@ -21,14 +21,16 @@ def deprecate(name, alternative, alt_name=None, klass=None, Name to use in preference of alternative.__name__ klass : Warning, default FutureWarning stacklevel : int, default 2 + msg : str + The message to display in the warning. + Default is '{name} is deprecated. Use {alt_name} instead.' """ alt_name = alt_name or alternative.__name__ klass = klass or FutureWarning + msg = msg or "{} is deprecated. Use {} instead".format(name, alt_name) def wrapper(*args, **kwargs): - msg = "{name} is deprecated. 
Use {alt_name} instead".format( - name=name, alt_name=alt_name) warnings.warn(msg, klass, stacklevel=stacklevel) return alternative(*args, **kwargs) return wrapper From eaa5081e381e111bd7a8b7c277c12527d7ae52e4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 28 Sep 2017 04:59:25 -0500 Subject: [PATCH 162/188] CI: Pin miniconda version (#17700) --- appveyor.yml | 2 +- ci/install.ps1 | 4 ++-- ci/install_circle.sh | 6 ++++-- ci/install_travis.sh | 10 +++++++--- ci/requirements-2.7_SLOW.run | 2 +- ci/requirements-2.7_WIN.run | 2 +- ci/requirements_all.txt | 2 +- pandas/tests/io/test_pytables.py | 4 ++++ 8 files changed, 21 insertions(+), 11 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index a1f8886f6d068f..f1259f271ee395 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -59,7 +59,7 @@ install: # install our build environment - cmd: conda config --set show_channel_urls true --set always_yes true --set changeps1 false - - cmd: conda update -q conda + # - cmd: conda update -q conda - cmd: conda config --set ssl_verify false # add the pandas channel *before* defaults to have defaults take priority diff --git a/ci/install.ps1 b/ci/install.ps1 index 64ec7f81884cd1..b784b4ebf5e6ac 100644 --- a/ci/install.ps1 +++ b/ci/install.ps1 @@ -7,7 +7,7 @@ $MINICONDA_URL = "http://repo.continuum.io/miniconda/" function DownloadMiniconda ($python_version, $platform_suffix) { $webclient = New-Object System.Net.WebClient - $filename = "Miniconda3-latest-Windows-" + $platform_suffix + ".exe" + $filename = "Miniconda3-4.3.21-Windows-" + $platform_suffix + ".exe" $url = $MINICONDA_URL + $filename $basedir = $pwd.Path + "\" @@ -85,7 +85,7 @@ function UpdateConda ($python_home) { function main () { InstallMiniconda "3.5" $env:PYTHON_ARCH $env:CONDA_ROOT - UpdateConda $env:CONDA_ROOT + # UpdateConda $env:CONDA_ROOT InstallCondaPackages $env:CONDA_ROOT "conda-build jinja2 anaconda-client" } diff --git a/ci/install_circle.sh b/ci/install_circle.sh index fd79f907625e9d..eba98be561397d 100755 --- a/ci/install_circle.sh +++ b/ci/install_circle.sh @@ -10,7 +10,9 @@ echo "[Using clean Miniconda install]" rm -rf "$MINICONDA_DIR" # install miniconda -wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -q -O miniconda.sh || exit 1 +# wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -q -O miniconda.sh || exit 1 +# Pin miniconda +wget https://repo.continuum.io/miniconda/Miniconda2-4.3.21-Linux-x86_64.sh -q -O miniconda.sh || exit 1 bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1 export PATH="$MINICONDA_DIR/bin:$PATH" @@ -18,7 +20,7 @@ export PATH="$MINICONDA_DIR/bin:$PATH" echo "[update conda]" conda config --set ssl_verify false || exit 1 conda config --set always_yes true --set changeps1 false || exit 1 -conda update -q conda +# conda update -q conda # add the pandas channel to take priority # to add extra packages diff --git a/ci/install_travis.sh b/ci/install_travis.sh index b85263daa1eaca..faf404ddcd2931 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -34,9 +34,13 @@ fi # install miniconda if [ "${TRAVIS_OS_NAME}" == "osx" ]; then - time wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh || exit 1 + # temporarily pin miniconda + # time wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh || exit 1 + time wget https://repo.continuum.io/miniconda/Miniconda2-4.3.21-MacOSX-x86_64.sh -O miniconda.sh || exit 1 else - time wget 
http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh || exit 1 + # temporarily pin miniconda + # time wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh || exit 1 + time wget https://repo.continuum.io/miniconda/Miniconda2-4.3.21-Linux-x86_64.sh -O miniconda.sh || exit 1 fi time bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1 @@ -48,7 +52,7 @@ echo echo "[update conda]" conda config --set ssl_verify false || exit 1 conda config --set quiet true --set always_yes true --set changeps1 false || exit 1 -conda update -q conda +# conda update -q conda echo echo "[add channels]" diff --git a/ci/requirements-2.7_SLOW.run b/ci/requirements-2.7_SLOW.run index f7708283ad04a0..db95a6ccb23140 100644 --- a/ci/requirements-2.7_SLOW.run +++ b/ci/requirements-2.7_SLOW.run @@ -16,4 +16,4 @@ s3fs psycopg2 pymysql html5lib -beautiful-soup +beautifulsoup4 diff --git a/ci/requirements-2.7_WIN.run b/ci/requirements-2.7_WIN.run index f953682f52d45a..a81542ee5006c7 100644 --- a/ci/requirements-2.7_WIN.run +++ b/ci/requirements-2.7_WIN.run @@ -14,5 +14,5 @@ xlsxwriter s3fs bottleneck html5lib -beautiful-soup +beautifulsoup4 jinja2=2.8 diff --git a/ci/requirements_all.txt b/ci/requirements_all.txt index b153b6989df868..e13afd619f1054 100644 --- a/ci/requirements_all.txt +++ b/ci/requirements_all.txt @@ -13,7 +13,7 @@ xlrd xlwt html5lib patsy -beautiful-soup +beautifulsoup4 numpy cython scipy diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index ff21afc11d2205..c5729d421758e3 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -799,6 +799,10 @@ def test_complibs(self): # Remove lzo if its not available on this platform if not tables.which_lib_version('lzo'): all_complibs.remove('lzo') + # Remove bzip2 if its not available on this platform + if not tables.which_lib_version("bzip2"): + all_complibs.remove("bzip2") + all_levels = range(0, 10) all_tests = [(lib, lvl) for lib in all_complibs for lvl in all_levels] From db1206aaf00cf0024a2ff28d828c0a78d6cbe7df Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 28 Sep 2017 07:50:32 -0400 Subject: [PATCH 163/188] DEPR: deprecate pd.TimeGrouper (#17703) closes #16747 --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/api.py | 17 ++++++++++++++--- pandas/tests/api/test_api.py | 9 +++++++-- pandas/tests/groupby/test_groupby.py | 12 ++++++------ pandas/tests/groupby/test_timegrouper.py | 20 +++++++++++--------- pandas/tests/groupby/test_transform.py | 2 +- pandas/tests/test_resample.py | 12 ++++++++---- 7 files changed, 48 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index ae55b4a0aa4691..dae93feb48b02f 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -492,6 +492,7 @@ Deprecations - ``pd.options.html.border`` has been deprecated in favor of ``pd.options.display.html.border`` (:issue:`15793`). - :func:`SeriesGroupBy.nth` has deprecated ``True`` in favor of ``'all'`` for its kwarg ``dropna`` (:issue:`11038`). - :func:`DataFrame.as_blocks` is deprecated, as this is exposing the internal implementation (:issue:`17302`) +- ``pd.TimeGrouper`` is deprecated in favor of :class:`pandas.Grouper` (:issue:`16747`) .. 
_whatsnew_0210.deprecations.argmin_min diff --git a/pandas/core/api.py b/pandas/core/api.py index 6a32d3763ffb19..a012ccce839653 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -33,7 +33,6 @@ from pandas.tseries.offsets import DateOffset from pandas.core.tools.datetimes import to_datetime from pandas.core.tools.timedeltas import to_timedelta -from pandas.core.resample import TimeGrouper # see gh-14094. from pandas.util._depr_module import _DeprecatedModule @@ -52,8 +51,8 @@ # deprecation, xref #13790 def match(*args, **kwargs): - import warnings + import warnings warnings.warn("pd.match() is deprecated and will be removed " "in a future version", FutureWarning, stacklevel=2) @@ -64,8 +63,20 @@ def match(*args, **kwargs): def groupby(*args, **kwargs): import warnings - warnings.warn("pd.groupby() is deprecated and will be removed " + warnings.warn("pd.groupby() is deprecated and will be removed; " "Please use the Series.groupby() or " "DataFrame.groupby() methods", FutureWarning, stacklevel=2) return args[0].groupby(*args[1:], **kwargs) + + +# deprecation, xref +class TimeGrouper(object): + + def __new__(cls, *args, **kwargs): + from pandas.core.resample import TimeGrouper + import warnings + warnings.warn("pd.TimeGrouper is deprecated and will be removed; " + "Please use pd.Grouper(freq=...)", + FutureWarning, stacklevel=2) + return TimeGrouper(*args, **kwargs) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index cbc73615811a2c..c593290410b961 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -47,11 +47,11 @@ class TestPDApi(Base): 'Grouper', 'HDFStore', 'Index', 'Int64Index', 'MultiIndex', 'Period', 'PeriodIndex', 'RangeIndex', 'UInt64Index', 'Series', 'SparseArray', 'SparseDataFrame', - 'SparseSeries', 'TimeGrouper', 'Timedelta', + 'SparseSeries', 'Timedelta', 'TimedeltaIndex', 'Timestamp', 'Interval', 'IntervalIndex'] # these are already deprecated; awaiting removal - deprecated_classes = ['WidePanel', 'Panel4D', + deprecated_classes = ['WidePanel', 'Panel4D', 'TimeGrouper', 'SparseList', 'Expr', 'Term'] # these should be deprecated in the future @@ -184,6 +184,11 @@ def test_groupby(self): check_stacklevel=False): pd.groupby(pd.Series([1, 2, 3]), [1, 1, 1]) + def test_TimeGrouper(self): + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + pd.TimeGrouper(freq='D') + # GH 15940 def test_get_store(self): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 8957beacab376d..d91cff436dee2e 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3335,7 +3335,7 @@ def test_groupby_with_empty(self): index = pd.DatetimeIndex(()) data = () series = pd.Series(data, index) - grouper = pd.core.resample.TimeGrouper('D') + grouper = pd.Grouper(freq='D') grouped = series.groupby(grouper) assert next(iter(grouped), None) is None @@ -3354,7 +3354,7 @@ def test_groupby_with_small_elem(self): df = pd.DataFrame({'event': ['start', 'start'], 'change': [1234, 5678]}, index=pd.DatetimeIndex(['2014-09-10', '2013-10-10'])) - grouped = df.groupby([pd.TimeGrouper(freq='M'), 'event']) + grouped = df.groupby([pd.Grouper(freq='M'), 'event']) assert len(grouped.groups) == 2 assert grouped.ngroups == 2 assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups @@ -3369,7 +3369,7 @@ def test_groupby_with_small_elem(self): 'change': [1234, 5678, 9123]}, index=pd.DatetimeIndex(['2014-09-10', '2013-10-10', '2014-09-15'])) - grouped = 
df.groupby([pd.TimeGrouper(freq='M'), 'event']) + grouped = df.groupby([pd.Grouper(freq='M'), 'event']) assert len(grouped.groups) == 2 assert grouped.ngroups == 2 assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups @@ -3385,7 +3385,7 @@ def test_groupby_with_small_elem(self): 'change': [1234, 5678, 9123]}, index=pd.DatetimeIndex(['2014-09-10', '2013-10-10', '2014-08-05'])) - grouped = df.groupby([pd.TimeGrouper(freq='M'), 'event']) + grouped = df.groupby([pd.Grouper(freq='M'), 'event']) assert len(grouped.groups) == 3 assert grouped.ngroups == 3 assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups @@ -3682,9 +3682,9 @@ def test_nunique_with_timegrouper(self): Timestamp('2016-06-28 16:09:30'), Timestamp('2016-06-28 16:46:28')], 'data': ['1', '2', '3']}).set_index('time') - result = test.groupby(pd.TimeGrouper(freq='h'))['data'].nunique() + result = test.groupby(pd.Grouper(freq='h'))['data'].nunique() expected = test.groupby( - pd.TimeGrouper(freq='h') + pd.Grouper(freq='h') )['data'].apply(pd.Series.nunique) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index f83a3fcd0668d9..fafcbf947e3df7 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -52,10 +52,10 @@ def test_groupby_with_timegrouper(self): assert_frame_equal(result1, expected) df_sorted = df.sort_index() - result2 = df_sorted.groupby(pd.TimeGrouper(freq='5D')).sum() + result2 = df_sorted.groupby(pd.Grouper(freq='5D')).sum() assert_frame_equal(result2, expected) - result3 = df.groupby(pd.TimeGrouper(freq='5D')).sum() + result3 = df.groupby(pd.Grouper(freq='5D')).sum() assert_frame_equal(result3, expected) def test_groupby_with_timegrouper_methods(self): @@ -80,7 +80,7 @@ def test_groupby_with_timegrouper_methods(self): for df in [df_original, df_sorted]: df = df.set_index('Date', drop=False) - g = df.groupby(pd.TimeGrouper('6M')) + g = df.groupby(pd.Grouper(freq='6M')) assert g.group_keys assert isinstance(g.grouper, pd.core.groupby.BinGrouper) groups = g.groups @@ -265,11 +265,11 @@ def test_timegrouper_with_reg_groups(self): ['date', 'user_id']).sort_index().astype('int64') expected.name = 'whole_cost' - result1 = df.sort_index().groupby([pd.TimeGrouper(freq=freq), + result1 = df.sort_index().groupby([pd.Grouper(freq=freq), 'user_id'])['whole_cost'].sum() assert_series_equal(result1, expected) - result2 = df.groupby([pd.TimeGrouper(freq=freq), 'user_id'])[ + result2 = df.groupby([pd.Grouper(freq=freq), 'user_id'])[ 'whole_cost'].sum() assert_series_equal(result2, expected) @@ -340,7 +340,7 @@ def sumfunc_series(x): return pd.Series([x['value'].sum()], ('sum',)) expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_series) - result = (df_dt.groupby(pd.TimeGrouper(freq='M', key='date')) + result = (df_dt.groupby(pd.Grouper(freq='M', key='date')) .apply(sumfunc_series)) assert_frame_equal(result.reset_index(drop=True), expected.reset_index(drop=True)) @@ -358,8 +358,10 @@ def sumfunc_value(x): return x.value.sum() expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_value) - result = (df_dt.groupby(pd.TimeGrouper(freq='M', key='date')) - .apply(sumfunc_value)) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = (df_dt.groupby(pd.TimeGrouper(freq='M', key='date')) + .apply(sumfunc_value)) assert_series_equal(result.reset_index(drop=True), expected.reset_index(drop=True)) @@ -617,7 +619,7 @@ def 
test_nunique_with_timegrouper_and_nat(self): Timestamp('2016-06-28 16:46:28')], 'data': ['1', '2', '3']}) - grouper = pd.TimeGrouper(key='time', freq='h') + grouper = pd.Grouper(key='time', freq='h') result = test.groupby(grouper)['data'].nunique() expected = test[test.time.notnull()].groupby(grouper)['data'].nunique() tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 267b67972c6406..4b821dade6eae7 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -57,7 +57,7 @@ def demean(arr): # GH 8430 df = tm.makeTimeDataFrame() - g = df.groupby(pd.TimeGrouper('M')) + g = df.groupby(pd.Grouper(freq='M')) g.transform(lambda x: x - 1) # GH 9700 diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index 28a68a0a6e36d3..7449beb8f97dfe 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -1983,8 +1983,8 @@ def test_resample_nunique(self): pd.Timestamp('2015-06-08 00:00:00'): '2015-06-08'}}) r = df.resample('D') g = df.groupby(pd.Grouper(freq='D')) - expected = df.groupby(pd.TimeGrouper('D')).ID.apply(lambda x: - x.nunique()) + expected = df.groupby(pd.Grouper(freq='D')).ID.apply(lambda x: + x.nunique()) assert expected.name == 'ID' for t in [r, g]: @@ -3075,7 +3075,9 @@ def setup_method(self, method): index=date_range('1/1/2000', periods=1000)) def test_apply(self): - grouper = TimeGrouper('A', label='right', closed='right') + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + grouper = pd.TimeGrouper(freq='A', label='right', closed='right') grouped = self.ts.groupby(grouper) @@ -3093,7 +3095,9 @@ def test_count(self): expected = self.ts.groupby(lambda x: x.year).count() - grouper = TimeGrouper('A', label='right', closed='right') + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + grouper = pd.TimeGrouper(freq='A', label='right', closed='right') result = self.ts.groupby(grouper).count() expected.index = result.index assert_series_equal(result, expected) From 45bd47186938fcd247aad3c2dc572c1581c06f4c Mon Sep 17 00:00:00 2001 From: Licht Takeuchi Date: Thu, 28 Sep 2017 23:11:25 +0900 Subject: [PATCH 164/188] BUG: Fix make_sparse mask generation (#17574) --- asv_bench/benchmarks/sparse.py | 65 ++++++++++++++++++++++++++++++- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/_libs/sparse.pyx | 19 +++++++++ pandas/core/sparse/array.py | 9 ++++- pandas/tests/sparse/test_array.py | 9 +++++ 5 files changed, 101 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index b958f5e0e5c342..a46205026481e5 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -2,7 +2,7 @@ from .pandas_vb_common import * import scipy.sparse -from pandas import SparseSeries, SparseDataFrame +from pandas import SparseSeries, SparseDataFrame, SparseArray class sparse_series_to_frame(object): @@ -23,6 +23,69 @@ def time_sparse_series_to_frame(self): SparseDataFrame(self.series) +class sparse_array_constructor(object): + goal_time = 0.2 + + def setup(self): + np.random.seed(1) + self.int64_10percent = self.make_numeric_array(length=1000000, dense_size=100000, fill_value=0, dtype=np.int64) + self.int64_1percent = self.make_numeric_array(length=1000000, dense_size=10000, fill_value=0, dtype=np.int64) + + self.float64_10percent = self.make_numeric_array(length=1000000, dense_size=100000, fill_value=np.nan, dtype=np.float64) + 
self.float64_1percent = self.make_numeric_array(length=1000000, dense_size=10000, fill_value=np.nan, dtype=np.float64) + + self.object_nan_fill_value_10percent = self.make_object_array(length=1000000, dense_size=100000, fill_value=np.nan) + self.object_nan_fill_value_1percent = self.make_object_array(length=1000000, dense_size=10000, fill_value=np.nan) + + self.object_non_nan_fill_value_10percent = self.make_object_array(length=1000000, dense_size=100000, fill_value=0) + self.object_non_nan_fill_value_1percent = self.make_object_array(length=1000000, dense_size=10000, fill_value=0) + + def make_numeric_array(self, length, dense_size, fill_value, dtype): + arr = np.array([fill_value] * length, dtype=dtype) + indexer = np.unique(np.random.randint(0, length, dense_size)) + arr[indexer] = np.random.randint(0, 100, len(indexer)) + return (arr, fill_value, dtype) + + def make_object_array(self, length, dense_size, fill_value): + elems = np.array(['a', 0.0, False, 1, 2], dtype=np.object) + arr = np.array([fill_value] * length, dtype=np.object) + indexer = np.unique(np.random.randint(0, length, dense_size)) + arr[indexer] = np.random.choice(elems, len(indexer)) + return (arr, fill_value, np.object) + + def time_sparse_array_constructor_int64_10percent(self): + arr, fill_value, dtype = self.int64_10percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + def time_sparse_array_constructor_int64_1percent(self): + arr, fill_value, dtype = self.int64_1percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + def time_sparse_array_constructor_float64_10percent(self): + arr, fill_value, dtype = self.float64_10percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + def time_sparse_array_constructor_float64_1percent(self): + arr, fill_value, dtype = self.float64_1percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + def time_sparse_array_constructor_object_nan_fill_value_10percent(self): + arr, fill_value, dtype = self.object_nan_fill_value_10percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + def time_sparse_array_constructor_object_nan_fill_value_1percent(self): + arr, fill_value, dtype = self.object_nan_fill_value_1percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + def time_sparse_array_constructor_object_non_nan_fill_value_10percent(self): + arr, fill_value, dtype = self.object_non_nan_fill_value_10percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + def time_sparse_array_constructor_object_non_nan_fill_value_1percent(self): + arr, fill_value, dtype = self.object_non_nan_fill_value_1percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + class sparse_frame_constructor(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index dae93feb48b02f..eeabe6cff6e30a 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -631,6 +631,7 @@ Sparse - Bug in ``SparseSeries`` raises ``AttributeError`` when a dictionary is passed in as data (:issue:`16905`) - Bug in :func:`SparseDataFrame.fillna` not filling all NaNs when frame was instantiated from SciPy sparse matrix (:issue:`16112`) - Bug in :func:`SparseSeries.unstack` and :func:`SparseDataFrame.stack` (:issue:`16614`, :issue:`15045`) +- Bug in :func:`make_sparse` treating two numeric/boolean data, which have same bits, as same when array ``dtype`` is ``object`` (:issue:`17574`) Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index 
1cc7f5ace95ea5..fac678e531c8be 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -848,3 +848,22 @@ def reindex_integer(ndarray[float64_t, ndim=1] values, IntIndex sparse_index, ndarray[int32_t, ndim=1] indexer): pass + + +# ----------------------------------------------------------------------------- +# SparseArray mask create operations + +def make_mask_object_ndarray(ndarray[object, ndim=1] arr, object fill_value): + cdef object value + cdef Py_ssize_t i + cdef Py_ssize_t new_length = len(arr) + cdef ndarray[int8_t, ndim=1] mask + + mask = np.ones(new_length, dtype=np.int8) + + for i in range(new_length): + value = arr[i] + if value == fill_value and type(value) == type(fill_value): + mask[i] = 0 + + return mask.view(dtype=np.bool) diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index f965c91999a03d..3b45a013734c91 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -19,6 +19,7 @@ from pandas.core.dtypes.common import ( _ensure_platform_int, is_float, is_integer, + is_object_dtype, is_integer_dtype, is_bool_dtype, is_list_like, @@ -789,7 +790,13 @@ def make_sparse(arr, kind='block', fill_value=None): if is_string_dtype(arr): arr = arr.astype(object) - mask = arr != fill_value + if is_object_dtype(arr.dtype): + # element-wise equality check method in numpy doesn't treat + # each element type, eg. 0, 0.0, and False are treated as + # same. So we have to check the both of its type and value. + mask = splib.make_mask_object_ndarray(arr, fill_value) + else: + mask = arr != fill_value length = len(arr) if length != mask.size: diff --git a/pandas/tests/sparse/test_array.py b/pandas/tests/sparse/test_array.py index b0a9182a265fe8..f653ee50982ad1 100644 --- a/pandas/tests/sparse/test_array.py +++ b/pandas/tests/sparse/test_array.py @@ -61,6 +61,15 @@ def test_constructor_object_dtype(self): assert arr.dtype == np.object assert arr.fill_value == 'A' + # GH 17574 + data = [False, 0, 100.0, 0.0] + arr = SparseArray(data, dtype=np.object, fill_value=False) + assert arr.dtype == np.object + assert arr.fill_value is False + arr_expected = np.array(data, dtype=np.object) + it = (type(x) == type(y) and x == y for x, y in zip(arr, arr_expected)) + assert np.fromiter(it, dtype=np.bool).all() + def test_constructor_spindex_dtype(self): arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2])) tm.assert_sp_array_equal(arr, SparseArray([np.nan, 1, 2, np.nan])) From 074b4850151cd2785f670e1d18c412e46c509f60 Mon Sep 17 00:00:00 2001 From: Amol K Date: Thu, 28 Sep 2017 14:12:50 +0000 Subject: [PATCH 165/188] Fixed Value Error when doing HDFStore.Select of contiguous mixed-data (#17670) --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/io/pytables.py | 5 ++--- pandas/tests/io/test_pytables.py | 13 +++++++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index eeabe6cff6e30a..50f11c38bae236 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -603,6 +603,7 @@ I/O - Bug in :func:`read_html` where import check fails when run in multiple threads (:issue:`16928`) - Bug in :func:`read_csv` where automatic delimiter detection caused a ``TypeError`` to be thrown when a bad line was encountered rather than the correct error message (:issue:`13374`) - Bug in ``DataFrame.to_html()`` with ``notebook=True`` where DataFrames with named indices or non-MultiIndex indices had undesired horizontal or vertical alignment for column or row 
labels, respectively (:issue:`16792`) +- Bug in :func:`HDFStore.select` when reading a contiguous mixed-data table featuring VLArray (:issue:`17021`) Plotting ^^^^^^^^ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 4d300b200971ac..ea69116ec363da 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2441,13 +2441,12 @@ def read_array(self, key, start=None, stop=None): """ read an array for the specified node (off of group """ import tables node = getattr(self.group, key) - data = node[start:stop] attrs = node._v_attrs transposed = getattr(attrs, 'transposed', False) if isinstance(node, tables.VLArray): - ret = data[0] + ret = node[0][start:stop] else: dtype = getattr(attrs, 'value_type', None) shape = getattr(attrs, 'shape', None) @@ -2456,7 +2455,7 @@ def read_array(self, key, start=None, stop=None): # length 0 axis ret = np.empty(shape, dtype=dtype) else: - ret = data + ret = node[start:stop] if dtype == u('datetime64'): diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index c5729d421758e3..2fe3cf1f34d44c 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -4391,6 +4391,19 @@ def test_path_pathlib(self): lambda p: pd.read_hdf(p, 'df')) tm.assert_frame_equal(df, result) + @pytest.mark.parametrize('start, stop', [(0, 2), (1, 2), (None, None)]) + def test_contiguous_mixed_data_table(self, start, stop): + # GH 17021 + # ValueError when reading a contiguous mixed-data table ft. VLArray + df = DataFrame({'a': Series([20111010, 20111011, 20111012]), + 'b': Series(['ab', 'cd', 'ab'])}) + + with ensure_clean_store(self.path) as store: + store.append('test_dataset', df) + + result = store.select('test_dataset', start=start, stop=stop) + assert_frame_equal(df[start:stop], result) + def test_path_pathlib_hdfstore(self): df = tm.makeDataFrame() From cc58b84f423db58de9edca762f0abbe10c638efb Mon Sep 17 00:00:00 2001 From: topper-123 Date: Thu, 28 Sep 2017 21:26:38 +0200 Subject: [PATCH 166/188] DOC: Improved doc string for IntervalIndex + related changes (#17706) --- pandas/_libs/interval.pyx | 4 ++ pandas/core/indexes/interval.py | 110 ++++++++++++++++++++++++++------ 2 files changed, 96 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index bfbda9696ff2bb..306597031817df 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -58,6 +58,10 @@ cdef class Interval(IntervalMixin): closed : {'left', 'right', 'both', 'neither'} Whether the interval is closed on the left-side, right-side, both or neither. Defaults to 'right'. + + See Also + -------- + IntervalIndex : an Index of intervals that are all closed on the same side. """ cdef readonly object left, right diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 55ed2342571ab8..a697ed7888f90b 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -120,9 +120,42 @@ class IntervalIndex(IntervalMixin, Index): copy : boolean, default False Copy the meta-data + Examples + --------- + A new ``IntervalIndex`` is typically constructed using + :func:`interval_range`: + + >>> pd.interval_range(start=0, end=5) + IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]] + closed='right', dtype='interval[int64]') + + It may also be constructed using one of the constructor + methods :meth:`IntervalIndex.from_arrays`, + :meth:`IntervalIndex.from_breaks`, :meth:`IntervalIndex.from_intervals` + and :meth:`IntervalIndex.from_tuples`. 
+ + See further examples in the doc strings of ``interval_range`` and the + mentioned constructor methods. + + Notes + ------ + See the `user guide + `_ + for more. + See Also -------- Index + Interval : A bounded slice-like interval + interval_range : Function to create a fixed frequency IntervalIndex + IntervalIndex.from_arrays : Construct an IntervalIndex from a left and + right array + IntervalIndex.from_breaks : Construct an IntervalIndex from an array of + splits + IntervalIndex.from_intervals : Construct an IntervalIndex from an array of + Interval objects + IntervalIndex.from_tuples : Construct an IntervalIndex from a list/array of + tuples """ _typ = 'intervalindex' _comparables = ['name'] @@ -319,11 +352,20 @@ def from_breaks(cls, breaks, closed='right', name=None, copy=False): Examples -------- + >>> pd.IntervalIndex.from_breaks([0, 1, 2, 3]) + IntervalIndex([(0, 1], (1, 2], (2, 3]] + closed='right', + dtype='interval[int64]') - >>> IntervalIndex.from_breaks([0, 1, 2, 3]) - IntervalIndex(left=[0, 1, 2], - right=[1, 2, 3], - closed='right') + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex + IntervalIndex.from_arrays : Construct an IntervalIndex from a left and + right array + IntervalIndex.from_intervals : Construct an IntervalIndex from an array + of Interval objects + IntervalIndex.from_tuples : Construct an IntervalIndex from a + list/array of tuples """ breaks = np.asarray(breaks) return cls.from_arrays(breaks[:-1], breaks[1:], closed, @@ -350,11 +392,20 @@ def from_arrays(cls, left, right, closed='right', name=None, copy=False): Examples -------- + >>> pd.IntervalIndex.from_arrays([0, 1, 2], [1, 2, 3]) + IntervalIndex([(0, 1], (1, 2], (2, 3]] + closed='right', + dtype='interval[int64]') - >>> IntervalIndex.from_arrays([0, 1, 2], [1, 2, 3]) - IntervalIndex(left=[0, 1, 2], - right=[1, 2, 3], - closed='right') + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex + IntervalIndex.from_breaks : Construct an IntervalIndex from an array of + splits + IntervalIndex.from_intervals : Construct an IntervalIndex from an array + of Interval objects + IntervalIndex.from_tuples : Construct an IntervalIndex from a + list/array of tuples """ left = np.asarray(left) right = np.asarray(right) @@ -378,19 +429,27 @@ def from_intervals(cls, data, name=None, copy=False): Examples -------- - - >>> IntervalIndex.from_intervals([Interval(0, 1), Interval(1, 2)]) - IntervalIndex(left=[0, 1], - right=[1, 2], - closed='right') + >>> pd.IntervalIndex.from_intervals([pd.Interval(0, 1), + ... 
pd.Interval(1, 2)]) + IntervalIndex([(0, 1], (1, 2]] + closed='right', dtype='interval[int64]') The generic Index constructor work identically when it infers an array of all intervals: - >>> Index([Interval(0, 1), Interval(1, 2)]) - IntervalIndex(left=[0, 1], - right=[1, 2], - closed='right') + >>> pd.Index([pd.Interval(0, 1), pd.Interval(1, 2)]) + IntervalIndex([(0, 1], (1, 2]] + closed='right', dtype='interval[int64]') + + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex + IntervalIndex.from_arrays : Construct an IntervalIndex from a left and + right array + IntervalIndex.from_breaks : Construct an IntervalIndex from an array of + splits + IntervalIndex.from_tuples : Construct an IntervalIndex from a + list/array of tuples """ data = np.asarray(data) left, right, closed = intervals_to_interval_bounds(data) @@ -415,7 +474,19 @@ def from_tuples(cls, data, closed='right', name=None, copy=False): Examples -------- + >>> pd.IntervalIndex.from_tuples([(0, 1), (1,2)]) + IntervalIndex([(0, 1], (1, 2]], + closed='right', dtype='interval[int64]') + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex + IntervalIndex.from_arrays : Construct an IntervalIndex from a left and + right array + IntervalIndex.from_breaks : Construct an IntervalIndex from an array of + splits + IntervalIndex.from_intervals : Construct an IntervalIndex from an array + of Interval objects """ left = [] right = [] @@ -1121,7 +1192,6 @@ def interval_range(start=None, end=None, periods=None, freq=None, Examples -------- - Numeric ``start`` and ``end`` is supported. >>> pd.interval_range(start=0, end=5) @@ -1159,6 +1229,10 @@ def interval_range(start=None, end=None, periods=None, freq=None, >>> pd.interval_range(end=5, periods=4, closed='both') IntervalIndex([[1, 2], [2, 3], [3, 4], [4, 5]] closed='both', dtype='interval[int64]') + + See Also + -------- + IntervalIndex : an Index of intervals that are all closed on the same side. 
""" if com._count_not_none(start, end, periods) != 2: raise ValueError('Of the three parameters: start, end, and periods, ' From 42adf7da3f42a5ab02ea53fcced36a3ed85a6037 Mon Sep 17 00:00:00 2001 From: louispotok Date: Thu, 28 Sep 2017 16:42:01 -0700 Subject: [PATCH 167/188] Add chunksize param to read_json when lines=True (#17168) closes #17048 --- asv_bench/benchmarks/io_bench.py | 30 ++++ doc/source/io.rst | 10 ++ doc/source/whatsnew/v0.21.0.txt | 1 + pandas/io/json/json.py | 215 ++++++++++++++++++++----- pandas/tests/io/json/test_pandas.py | 47 ------ pandas/tests/io/json/test_readlines.py | 168 +++++++++++++++++++ 6 files changed, 383 insertions(+), 88 deletions(-) create mode 100644 pandas/tests/io/json/test_readlines.py diff --git a/asv_bench/benchmarks/io_bench.py b/asv_bench/benchmarks/io_bench.py index 52064d2cdb8a25..93273955a29b9f 100644 --- a/asv_bench/benchmarks/io_bench.py +++ b/asv_bench/benchmarks/io_bench.py @@ -1,3 +1,4 @@ +import os from .pandas_vb_common import * from pandas import concat, Timestamp, compat try: @@ -192,3 +193,32 @@ def time_read_nrows(self, compression, engine): ext = ".bz2" pd.read_csv(self.big_fname + ext, nrows=10, compression=compression, engine=engine) + + +class read_json_lines(object): + goal_time = 0.2 + fname = "__test__.json" + + def setup(self): + self.N = 100000 + self.C = 5 + self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)])) + self.df.to_json(self.fname,orient="records",lines=True) + + def teardown(self): + try: + os.remove(self.fname) + except: + pass + + def time_read_json_lines(self): + pd.read_json(self.fname, lines=True) + + def time_read_json_lines_chunk(self): + pd.concat(pd.read_json(self.fname, lines=True, chunksize=self.N//4)) + + def peakmem_read_json_lines(self): + pd.read_json(self.fname, lines=True) + + def peakmem_read_json_lines_chunk(self): + pd.concat(pd.read_json(self.fname, lines=True, chunksize=self.N//4)) diff --git a/doc/source/io.rst b/doc/source/io.rst index d6abed6e9d1ad6..4eba9687efc58e 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1845,6 +1845,7 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series`` seconds, milliseconds, microseconds or nanoseconds respectively. - ``lines`` : reads file as one json object per line. - ``encoding`` : The encoding to use to decode py3 bytes. +- ``chunksize`` : when used in combination with ``lines=True``, return a JsonReader which reads in ``chunksize`` lines per iteration. The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parseable. @@ -2049,6 +2050,10 @@ Line delimited json pandas is able to read and write line-delimited json files that are common in data processing pipelines using Hadoop or Spark. +.. versionadded:: 0.21.0 + +For line-delimited json files, pandas can also return an iterator which reads in ``chunksize`` lines at a time. This can be useful for large files or to read from a stream. + .. ipython:: python jsonl = ''' @@ -2059,6 +2064,11 @@ using Hadoop or Spark. df df.to_json(orient='records', lines=True) + # reader is an iterator that returns `chunksize` lines each iteration + reader = pd.read_json(StringIO(jsonl), lines=True, chunksize=1) + reader + for chunk in reader: + print(chunk) .. 
_io.table_schema:

diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index 50f11c38bae236..d5d508d02cb730 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -162,6 +162,7 @@ Other Enhancements
 - :func:`MultiIndex.is_monotonic_decreasing` has been implemented. Previously returned ``False`` in all cases. (:issue:`16554`)
 - :func:`Categorical.rename_categories` now accepts a dict-like argument as `new_categories` and only updates the categories found in that dict. (:issue:`17336`)
 - :func:`read_excel` raises ``ImportError`` with a better message if ``xlrd`` is not installed. (:issue:`17613`)
+- :func:`read_json` now accepts a ``chunksize`` parameter that can be used when ``lines=True``. If ``chunksize`` is passed, read_json now returns an iterator which reads in ``chunksize`` lines with each iteration. (:issue:`17048`)
 - :meth:`DataFrame.assign` will preserve the original order of ``**kwargs`` for Python 3.6+ users instead of sorting the column names
diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py
index 5dae6099446d0f..ab74b265b6a067 100644
--- a/pandas/io/json/json.py
+++ b/pandas/io/json/json.py
@@ -1,4 +1,5 @@
 # pylint: disable-msg=E1101,W0613,W0603
+from itertools import islice
 import os
 
 import numpy as np
@@ -8,8 +9,10 @@
 from pandas import compat, isna
 from pandas import Series, DataFrame, to_datetime, MultiIndex
 from pandas.io.common import (get_filepath_or_buffer, _get_handle,
-                              _stringify_path)
+                              _stringify_path, BaseIterator)
+from pandas.io.parsers import _validate_integer
 from pandas.core.common import AbstractMethodError
+from pandas.core.reshape.concat import concat
 from pandas.io.formats.printing import pprint_thing
 from .normalize import _convert_to_line_delimits
 from .table_schema import build_table_schema
@@ -175,7 +178,7 @@ def write(self):
 def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
               convert_axes=True, convert_dates=True, keep_default_dates=True,
               numpy=False, precise_float=False, date_unit=None, encoding=None,
-              lines=False):
+              lines=False, chunksize=None):
     """
     Convert a JSON string to pandas object
@@ -264,6 +267,16 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
 
         .. versionadded:: 0.19.0
 
+    chunksize : integer, default None
+        Return JsonReader object for iteration.
+        See the `line-delimited json docs
+        `_
+        for more information on ``chunksize``.
+        This can only be passed if `lines=True`.
+        If this is None, the file will be read into memory all at once.
+
+        .. versionadded:: 0.21.0
+
     Returns
     -------
     result : Series or DataFrame, depending on the value of `typ`. 
@@ -323,47 +336,167 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, encoding=encoding) - if isinstance(filepath_or_buffer, compat.string_types): - try: - exists = os.path.exists(filepath_or_buffer) - - # if the filepath is too long will raise here - # 5874 - except (TypeError, ValueError): - exists = False - - if exists: - fh, handles = _get_handle(filepath_or_buffer, 'r', - encoding=encoding) - json = fh.read() - fh.close() + + json_reader = JsonReader( + filepath_or_buffer, orient=orient, typ=typ, dtype=dtype, + convert_axes=convert_axes, convert_dates=convert_dates, + keep_default_dates=keep_default_dates, numpy=numpy, + precise_float=precise_float, date_unit=date_unit, encoding=encoding, + lines=lines, chunksize=chunksize + ) + + if chunksize: + return json_reader + + return json_reader.read() + + +class JsonReader(BaseIterator): + """ + JsonReader provides an interface for reading in a JSON file. + + If initialized with ``lines=True`` and ``chunksize``, can be iterated over + ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the + whole document. + """ + def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes, + convert_dates, keep_default_dates, numpy, precise_float, + date_unit, encoding, lines, chunksize): + + self.path_or_buf = filepath_or_buffer + self.orient = orient + self.typ = typ + self.dtype = dtype + self.convert_axes = convert_axes + self.convert_dates = convert_dates + self.keep_default_dates = keep_default_dates + self.numpy = numpy + self.precise_float = precise_float + self.date_unit = date_unit + self.encoding = encoding + self.lines = lines + self.chunksize = chunksize + self.nrows_seen = 0 + self.should_close = False + + if self.chunksize is not None: + self.chunksize = _validate_integer("chunksize", self.chunksize, 1) + if not self.lines: + raise ValueError("chunksize can only be passed if lines=True") + + data = self._get_data_from_filepath(filepath_or_buffer) + self.data = self._preprocess_data(data) + + def _preprocess_data(self, data): + """ + At this point, the data either has a `read` attribute (e.g. a file + object or a StringIO) or is a string that is a JSON document. + + If self.chunksize, we prepare the data for the `__next__` method. + Otherwise, we read it into memory for the `read` method. + """ + if hasattr(data, 'read') and not self.chunksize: + data = data.read() + if not hasattr(data, 'read') and self.chunksize: + data = StringIO(data) + + return data + + def _get_data_from_filepath(self, filepath_or_buffer): + """ + read_json accepts three input types: + 1. filepath (string-like) + 2. file-like object (e.g. open file object, StringIO) + 3. JSON string + + This method turns (1) into (2) to simplify the rest of the processing. + It returns input types (2) and (3) unchanged. 
+ """ + + data = filepath_or_buffer + + if isinstance(data, compat.string_types): + try: + exists = os.path.exists(filepath_or_buffer) + + # gh-5874: if the filepath is too long will raise here + except (TypeError, ValueError): + pass + + else: + if exists: + data, _ = _get_handle(filepath_or_buffer, 'r', + encoding=self.encoding) + self.should_close = True + self.open_stream = data + + return data + + def _combine_lines(self, lines): + """Combines a list of JSON objects into one JSON object""" + lines = filter(None, map(lambda x: x.strip(), lines)) + return '[' + ','.join(lines) + ']' + + def read(self): + """Read the whole JSON input into a pandas object""" + if self.lines and self.chunksize: + obj = concat(self) + elif self.lines: + obj = self._get_object_parser( + self._combine_lines(self.data.split('\n')) + ) else: - json = filepath_or_buffer - elif hasattr(filepath_or_buffer, 'read'): - json = filepath_or_buffer.read() - else: - json = filepath_or_buffer + obj = self._get_object_parser(self.data) + self.close() + return obj + + def _get_object_parser(self, json): + """parses a json document into a pandas object""" + typ = self.typ + dtype = self.dtype + kwargs = { + "orient": self.orient, "dtype": self.dtype, + "convert_axes": self.convert_axes, + "convert_dates": self.convert_dates, + "keep_default_dates": self.keep_default_dates, "numpy": self.numpy, + "precise_float": self.precise_float, "date_unit": self.date_unit + } + obj = None + if typ == 'frame': + obj = FrameParser(json, **kwargs).parse() + + if typ == 'series' or obj is None: + if not isinstance(dtype, bool): + dtype = dict(data=dtype) + obj = SeriesParser(json, **kwargs).parse() + + return obj + + def close(self): + """ + If we opened a stream earlier, in _get_data_from_filepath, we should + close it. If an open stream or file was passed, we leave it open. + """ + if self.should_close: + try: + self.open_stream.close() + except (IOError, AttributeError): + pass - if lines: - # If given a json lines file, we break the string into lines, add - # commas and put it in a json list to make a valid json object. - lines = list(StringIO(json.strip())) - json = '[' + ','.join(lines) + ']' - - obj = None - if typ == 'frame': - obj = FrameParser(json, orient, dtype, convert_axes, convert_dates, - keep_default_dates, numpy, precise_float, - date_unit).parse() - - if typ == 'series' or obj is None: - if not isinstance(dtype, bool): - dtype = dict(data=dtype) - obj = SeriesParser(json, orient, dtype, convert_axes, convert_dates, - keep_default_dates, numpy, precise_float, - date_unit).parse() - - return obj + def __next__(self): + lines = list(islice(self.data, self.chunksize)) + if lines: + lines_json = self._combine_lines(lines) + obj = self._get_object_parser(lines_json) + + # Make sure that the returned objects have the right index. 
+ obj.index = range(self.nrows_seen, self.nrows_seen + len(obj)) + self.nrows_seen += len(obj) + + return obj + + self.close() + raise StopIteration class Parser(object): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 671d4248818e40..de4afec883efdb 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -985,53 +985,6 @@ def test_tz_range_is_utc(self): df = DataFrame({'DT': dti}) assert dumps(df, iso_dates=True) == dfexp - def test_read_jsonl(self): - # GH9180 - result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) - expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) - assert_frame_equal(result, expected) - - def test_read_jsonl_unicode_chars(self): - # GH15132: non-ascii unicode characters - # \u201d == RIGHT DOUBLE QUOTATION MARK - - # simulate file handle - json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' - json = StringIO(json) - result = read_json(json, lines=True) - expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]], - columns=['a', 'b']) - assert_frame_equal(result, expected) - - # simulate string - json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' - result = read_json(json, lines=True) - expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]], - columns=['a', 'b']) - assert_frame_equal(result, expected) - - def test_to_jsonl(self): - # GH9180 - df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) - result = df.to_json(orient="records", lines=True) - expected = '{"a":1,"b":2}\n{"a":1,"b":2}' - assert result == expected - - df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b']) - result = df.to_json(orient="records", lines=True) - expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}' - assert result == expected - assert_frame_equal(pd.read_json(result, lines=True), df) - - # GH15096: escaped characters in columns and data - df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], - columns=["a\\", 'b']) - result = df.to_json(orient="records", lines=True) - expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n' - '{"a\\\\":"foo\\"","b":"bar"}') - assert result == expected - assert_frame_equal(pd.read_json(result, lines=True), df) - def test_latin_encoding(self): if compat.PY2: tm.assert_raises_regex( diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py new file mode 100644 index 00000000000000..d14355b07cf204 --- /dev/null +++ b/pandas/tests/io/json/test_readlines.py @@ -0,0 +1,168 @@ +# -*- coding: utf-8 -*- +import pytest +import pandas as pd +from pandas import DataFrame, read_json +from pandas.compat import StringIO +from pandas.io.json.json import JsonReader +import pandas.util.testing as tm +from pandas.util.testing import (assert_frame_equal, assert_series_equal, + ensure_clean) + + +@pytest.fixture +def lines_json_df(): + df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + return df.to_json(lines=True, orient="records") + + +def test_read_jsonl(): + # GH9180 + result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) + expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + assert_frame_equal(result, expected) + + +def test_read_jsonl_unicode_chars(): + # GH15132: non-ascii unicode characters + # \u201d == RIGHT DOUBLE QUOTATION MARK + + # simulate file handle + json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' + json = StringIO(json) + result = read_json(json, lines=True) + expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]], + 
columns=['a', 'b']) + assert_frame_equal(result, expected) + + # simulate string + json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' + result = read_json(json, lines=True) + expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]], + columns=['a', 'b']) + assert_frame_equal(result, expected) + + +def test_to_jsonl(): + # GH9180 + df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + result = df.to_json(orient="records", lines=True) + expected = '{"a":1,"b":2}\n{"a":1,"b":2}' + assert result == expected + + df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b']) + result = df.to_json(orient="records", lines=True) + expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}' + assert result == expected + assert_frame_equal(read_json(result, lines=True), df) + + # GH15096: escaped characters in columns and data + df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], + columns=["a\\", 'b']) + result = df.to_json(orient="records", lines=True) + expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n' + '{"a\\\\":"foo\\"","b":"bar"}') + assert result == expected + assert_frame_equal(read_json(result, lines=True), df) + + +@pytest.mark.parametrize("chunksize", [1, 1.0]) +def test_readjson_chunks(lines_json_df, chunksize): + # Basic test that read_json(chunks=True) gives the same result as + # read_json(chunks=False) + # GH17048: memory usage when lines=True + + unchunked = read_json(StringIO(lines_json_df), lines=True) + reader = read_json(StringIO(lines_json_df), lines=True, + chunksize=chunksize) + chunked = pd.concat(reader) + + assert_frame_equal(chunked, unchunked) + + +def test_readjson_chunksize_requires_lines(lines_json_df): + msg = "chunksize can only be passed if lines=True" + with tm.assert_raises_regex(ValueError, msg): + pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2) + + +def test_readjson_chunks_series(): + # Test reading line-format JSON to Series with chunksize param + s = pd.Series({'A': 1, 'B': 2}) + + strio = StringIO(s.to_json(lines=True, orient="records")) + unchunked = pd.read_json(strio, lines=True, typ='Series') + + strio = StringIO(s.to_json(lines=True, orient="records")) + chunked = pd.concat(pd.read_json( + strio, lines=True, typ='Series', chunksize=1 + )) + + assert_series_equal(chunked, unchunked) + + +def test_readjson_each_chunk(lines_json_df): + # Other tests check that the final result of read_json(chunksize=True) + # is correct. This checks the intermediate chunks. 
+ chunks = list( + pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2) + ) + assert chunks[0].shape == (2, 2) + assert chunks[1].shape == (1, 2) + + +def test_readjson_chunks_from_file(): + with ensure_clean('test.json') as path: + df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + df.to_json(path, lines=True, orient="records") + chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1)) + unchunked = pd.read_json(path, lines=True) + assert_frame_equal(unchunked, chunked) + + +@pytest.mark.parametrize("chunksize", [None, 1]) +def test_readjson_chunks_closes(chunksize): + with ensure_clean('test.json') as path: + df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + df.to_json(path, lines=True, orient="records") + reader = JsonReader( + path, orient=None, typ="frame", dtype=True, convert_axes=True, + convert_dates=True, keep_default_dates=True, numpy=False, + precise_float=False, date_unit=None, encoding=None, + lines=True, chunksize=chunksize) + reader.read() + assert reader.open_stream.closed, "didn't close stream with \ + chunksize = %s" % chunksize + + +@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"]) +def test_readjson_invalid_chunksize(lines_json_df, chunksize): + msg = r"'chunksize' must be an integer >=1" + + with tm.assert_raises_regex(ValueError, msg): + pd.read_json(StringIO(lines_json_df), lines=True, + chunksize=chunksize) + + +@pytest.mark.parametrize("chunksize", [None, 1, 2]) +def test_readjson_chunks_multiple_empty_lines(chunksize): + j = """ + + {"A":1,"B":4} + + + + {"A":2,"B":5} + + + + + + + + {"A":3,"B":6} + """ + orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + test = pd.read_json(j, lines=True, chunksize=chunksize) + if chunksize is not None: + test = pd.concat(test) + tm.assert_frame_equal(orig, test, obj="chunksize: %s" % chunksize) From bbf0ddaf6461a9586c5e459d9c00fe863adc43f8 Mon Sep 17 00:00:00 2001 From: Licht Takeuchi Date: Fri, 29 Sep 2017 08:44:29 +0900 Subject: [PATCH 168/188] BUG: Add SparseArray.all (#17570) --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/compat/numpy/function.py | 8 +++ pandas/core/sparse/array.py | 42 +++++++++++++++ pandas/tests/sparse/test_array.py | 88 +++++++++++++++++++++++++++++++ 4 files changed, 139 insertions(+) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index d5d508d02cb730..ee781ec4b0361c 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -634,6 +634,7 @@ Sparse - Bug in :func:`SparseDataFrame.fillna` not filling all NaNs when frame was instantiated from SciPy sparse matrix (:issue:`16112`) - Bug in :func:`SparseSeries.unstack` and :func:`SparseDataFrame.stack` (:issue:`16614`, :issue:`15045`) - Bug in :func:`make_sparse` treating two numeric/boolean data, which have same bits, as same when array ``dtype`` is ``object`` (:issue:`17574`) +- :func:`SparseArray.all` and :func:`SparseArray.any` are now implemented to handle ``SparseArray``, these were used but not implemented (:issue:`17570`) Reshaping ^^^^^^^^^ diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index ccbd3d9704e0c9..d42be569635696 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -184,6 +184,14 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): return skipna +ALLANY_DEFAULTS = OrderedDict() +ALLANY_DEFAULTS['dtype'] = None +ALLANY_DEFAULTS['out'] = None +validate_all = CompatValidator(ALLANY_DEFAULTS, fname='all', + method='both', max_fname_arg_count=1) +validate_any = 
CompatValidator(ALLANY_DEFAULTS, fname='any',
+                               method='both', max_fname_arg_count=1)
+
 LOGICAL_FUNC_DEFAULTS = dict(out=None)
 validate_logical_func = CompatValidator(LOGICAL_FUNC_DEFAULTS,
                                         method='kwargs')
diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py
index 3b45a013734c91..0424ac8703e255 100644
--- a/pandas/core/sparse/array.py
+++ b/pandas/core/sparse/array.py
@@ -615,6 +615,48 @@ def fillna(self, value, downcast=None):
         return self._simple_new(new_values, self.sp_index,
                                 fill_value=fill_value)

+    def all(self, axis=0, *args, **kwargs):
+        """
+        Tests whether all elements evaluate True
+
+        Returns
+        -------
+        all : bool
+
+        See Also
+        --------
+        numpy.all
+        """
+        nv.validate_all(args, kwargs)
+
+        values = self.sp_values
+
+        if len(values) != len(self) and not np.all(self.fill_value):
+            return False
+
+        return values.all()
+
+    def any(self, axis=0, *args, **kwargs):
+        """
+        Tests whether at least one element evaluates True
+
+        Returns
+        -------
+        any : bool
+
+        See Also
+        --------
+        numpy.any
+        """
+        nv.validate_any(args, kwargs)
+
+        values = self.sp_values
+
+        if len(values) != len(self) and np.any(self.fill_value):
+            return True
+
+        return values.any()
+
     def sum(self, axis=0, *args, **kwargs):
         """
         Sum of non-NA/null values
diff --git a/pandas/tests/sparse/test_array.py b/pandas/tests/sparse/test_array.py
index f653ee50982ad1..8de93ff3209613 100644
--- a/pandas/tests/sparse/test_array.py
+++ b/pandas/tests/sparse/test_array.py
@@ -664,6 +664,94 @@ def test_fillna_overlap(self):

 class TestSparseArrayAnalytics(object):

+    @pytest.mark.parametrize('data,pos,neg', [
+        ([True, True, True], True, False),
+        ([1, 2, 1], 1, 0),
+        ([1.0, 2.0, 1.0], 1.0, 0.0)
+    ])
+    def test_all(self, data, pos, neg):
+        # GH 17570
+        out = SparseArray(data).all()
+        assert out
+
+        out = SparseArray(data, fill_value=pos).all()
+        assert out
+
+        data[1] = neg
+        out = SparseArray(data).all()
+        assert not out
+
+        out = SparseArray(data, fill_value=pos).all()
+        assert not out
+
+    @pytest.mark.parametrize('data,pos,neg', [
+        ([True, True, True], True, False),
+        ([1, 2, 1], 1, 0),
+        ([1.0, 2.0, 1.0], 1.0, 0.0)
+    ])
+    def test_numpy_all(self, data, pos, neg):
+        # GH 17570
+        out = np.all(SparseArray(data))
+        assert out
+
+        out = np.all(SparseArray(data, fill_value=pos))
+        assert out
+
+        data[1] = neg
+        out = np.all(SparseArray(data))
+        assert not out
+
+        out = np.all(SparseArray(data, fill_value=pos))
+        assert not out
+
+        msg = "the 'out' parameter is not supported"
+        tm.assert_raises_regex(ValueError, msg, np.all,
+                               SparseArray(data), out=out)
+
+    @pytest.mark.parametrize('data,pos,neg', [
+        ([False, True, False], True, False),
+        ([0, 2, 0], 2, 0),
+        ([0.0, 2.0, 0.0], 2.0, 0.0)
+    ])
+    def test_any(self, data, pos, neg):
+        # GH 17570
+        out = SparseArray(data).any()
+        assert out
+
+        out = SparseArray(data, fill_value=pos).any()
+        assert out
+
+        data[1] = neg
+        out = SparseArray(data).any()
+        assert not out
+
+        out = SparseArray(data, fill_value=pos).any()
+        assert not out
+
+    @pytest.mark.parametrize('data,pos,neg', [
+        ([False, True, False], True, False),
+        ([0, 2, 0], 2, 0),
+        ([0.0, 2.0, 0.0], 2.0, 0.0)
+    ])
+    def test_numpy_any(self, data, pos, neg):
+        # GH 17570
+        out = np.any(SparseArray(data))
+        assert out
+
+        out = np.any(SparseArray(data, fill_value=pos))
+        assert out
+
+        data[1] = neg
+        out = np.any(SparseArray(data))
+        assert not out
+
+        out = np.any(SparseArray(data, fill_value=pos))
+        assert not out
+
+        msg = "the 'out' parameter is not supported"
+        tm.assert_raises_regex(ValueError, msg,
np.any, + SparseArray(data), out=out) + def test_sum(self): data = np.arange(10).astype(float) out = SparseArray(data).sum() From 00e52abe927150d10a72e397893bee56f4cc6505 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 29 Sep 2017 03:04:41 -0700 Subject: [PATCH 169/188] update imports of DateParseError, remove unused imports from tslib (#17713) See #17652 --- pandas/_libs/tslib.pyx | 36 +++++--------------- pandas/core/tools/datetimes.py | 2 +- pandas/tests/indexes/datetimes/test_tools.py | 4 +-- pandas/tests/scalar/test_period.py | 5 +-- 4 files changed, 14 insertions(+), 33 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 4c34d0fcb1e5f6..b0b70bb8102047 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -1,12 +1,9 @@ # -*- coding: utf-8 -*- # cython: profile=False -import warnings - cimport numpy as np from numpy cimport (int8_t, int32_t, int64_t, import_array, ndarray, - float64_t, - NPY_INT64, NPY_DATETIME, NPY_TIMEDELTA) + float64_t, NPY_DATETIME, NPY_TIMEDELTA) import numpy as np import sys @@ -16,12 +13,10 @@ from cpython cimport ( PyTypeObject, PyFloat_Check, PyComplex_Check, - PyLong_Check, PyObject_RichCompareBool, PyObject_RichCompare, Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE, - PyUnicode_Check, - PyUnicode_AsUTF8String) + PyUnicode_Check) cdef extern from "Python.h": cdef PyTypeObject *Py_TYPE(object) @@ -38,7 +33,6 @@ from datetime cimport ( pandas_datetimestruct, pandas_datetime_to_datetimestruct, pandas_datetimestruct_to_datetime, - cmp_pandas_datetimestruct, days_per_month_table, get_datetime64_value, get_timedelta64_value, @@ -68,23 +62,12 @@ from khash cimport ( kh_resize_int64, kh_get_int64) from .tslibs.parsing import parse_datetime_string -from .tslibs.parsing import DateParseError # noqa cimport cython -import re import time -# dateutil compat -from dateutil.tz import (tzoffset, tzlocal as _dateutil_tzlocal, - tzutc as _dateutil_tzutc, - tzstr as _dateutil_tzstr) - -from dateutil.relativedelta import relativedelta -from dateutil.parser import DEFAULTPARSER - -from pandas.compat import (parse_date, string_types, iteritems, - StringIO, callable) +from pandas.compat import iteritems, callable import operator import collections @@ -97,9 +80,6 @@ import_array() # import datetime C API PyDateTime_IMPORT -# in numpy 1.7, will prob need the following: -# numpy_pydatetime_import - cdef int64_t NPY_NAT = util.get_nat() iNaT = NPY_NAT @@ -318,7 +298,7 @@ class Timestamp(_Timestamp): tz : string / timezone object, default None Timezone to localize to """ - if isinstance(tz, string_types): + if util.is_string_object(tz): tz = maybe_get_tz(tz) return cls(datetime.now(tz)) @@ -613,7 +593,7 @@ class Timestamp(_Timestamp): if self.tzinfo is None: # tz naive, localize tz = maybe_get_tz(tz) - if not isinstance(ambiguous, string_types): + if not util.is_string_object(ambiguous): ambiguous = [ambiguous] value = tz_localize_to_utc(np.array([self.value], dtype='i8'), tz, ambiguous=ambiguous, errors=errors)[0] @@ -2426,8 +2406,8 @@ class Timedelta(_Timedelta): raise TypeError( "Invalid type {0}. 
Must be int or float.".format(type(v))) - kwargs = dict([ (k, _to_py_int_float(v)) - for k, v in iteritems(kwargs) ]) + kwargs = dict([(k, _to_py_int_float(v)) + for k, v in iteritems(kwargs)]) try: nano = kwargs.pop('nanoseconds', 0) @@ -3682,7 +3662,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, result[i] = v - delta return result - if isinstance(ambiguous, string_types): + if util.is_string_object(ambiguous): if ambiguous == 'infer': infer_dst = True elif ambiguous == 'NaT': diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 97ac8445faf4c2..8fe28aa4006131 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -8,6 +8,7 @@ from pandas._libs.tslibs import parsing from pandas._libs.tslibs.parsing import ( # noqa parse_time_string, + DateParseError, _format_is_iso, _guess_datetime_format) @@ -561,7 +562,6 @@ def calc_with_mask(carg, mask): return None -DateParseError = tslib.DateParseError normalize_date = tslib.normalize_date # Fixed time formats for time parsing diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index bdfe6b5b09e45a..b8ce1f0af6ea85 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -1335,13 +1335,13 @@ def test_parsers_monthfreq(self): def test_parsers_quarterly_with_freq(self): msg = ('Incorrect quarterly string is given, quarter ' 'must be between 1 and 4: 2013Q5') - with tm.assert_raises_regex(tslib.DateParseError, msg): + with tm.assert_raises_regex(parsing.DateParseError, msg): tools.parse_time_string('2013Q5') # GH 5418 msg = ('Unable to retrieve month information from given freq: ' 'INVLD-L-DEC-SAT') - with tm.assert_raises_regex(tslib.DateParseError, msg): + with tm.assert_raises_regex(parsing.DateParseError, msg): tools.parse_time_string('2013Q1', freq='INVLD-L-DEC-SAT') cases = {('2013Q2', None): datetime(2013, 4, 1), diff --git a/pandas/tests/scalar/test_period.py b/pandas/tests/scalar/test_period.py index c17a216df44cbd..28d85c52604d94 100644 --- a/pandas/tests/scalar/test_period.py +++ b/pandas/tests/scalar/test_period.py @@ -11,6 +11,7 @@ from pandas.compat.numpy import np_datetime64_compat from pandas._libs import tslib, period as libperiod +from pandas._libs.tslibs.parsing import DateParseError from pandas import Period, Timestamp, offsets from pandas.tseries.frequencies import DAYS, MONTHS @@ -886,8 +887,8 @@ def test_constructor_infer_freq(self): def test_badinput(self): pytest.raises(ValueError, Period, '-2000', 'A') - pytest.raises(tslib.DateParseError, Period, '0', 'A') - pytest.raises(tslib.DateParseError, Period, '1/1/-2000', 'A') + pytest.raises(DateParseError, Period, '0', 'A') + pytest.raises(DateParseError, Period, '1/1/-2000', 'A') def test_multiples(self): result1 = Period('1989', freq='2A') From ffa86c5d154e7013863f94a5a72b574aa2846508 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 29 Sep 2017 03:05:59 -0700 Subject: [PATCH 170/188] Add missing file to _pyxfiles, delete commented-out (#17712) --- setup.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/setup.py b/setup.py index d25ae4a5fb45ce..793aa089e708fa 100755 --- a/setup.py +++ b/setup.py @@ -341,6 +341,7 @@ class CheckSDist(sdist_class): 'pandas/_libs/window.pyx', 'pandas/_libs/sparse.pyx', 'pandas/_libs/parsers.pyx', + 'pandas/_libs/tslibs/strptime.pyx', 'pandas/_libs/tslibs/timezones.pyx', 'pandas/_libs/tslibs/frequencies.pyx', 
'pandas/_libs/tslibs/parsing.pyx', @@ -349,14 +350,6 @@ class CheckSDist(sdist_class): def initialize_options(self): sdist_class.initialize_options(self) - ''' - self._pyxfiles = [] - for root, dirs, files in os.walk('pandas'): - for f in files: - if f.endswith('.pyx'): - self._pyxfiles.append(pjoin(root, f)) - ''' - def run(self): if 'cython' in cmdclass: self.run_command('cython') From 54f6648cdebfa376c83f9fc03b53effe82df7492 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 29 Sep 2017 03:16:35 -0700 Subject: [PATCH 171/188] Last of the timezones funcs (#17669) --- pandas/_libs/tslibs/timezones.pxd | 2 -- pandas/_libs/tslibs/timezones.pyx | 19 +++++++++++++++++++ pandas/core/indexes/datetimes.py | 2 +- pandas/core/tools/datetimes.py | 19 ------------------- pandas/tests/tseries/test_timezones.py | 19 +++++++++---------- 5 files changed, 29 insertions(+), 32 deletions(-) diff --git a/pandas/_libs/tslibs/timezones.pxd b/pandas/_libs/tslibs/timezones.pxd index e5d1343e1c9843..95e0474b3a174a 100644 --- a/pandas/_libs/tslibs/timezones.pxd +++ b/pandas/_libs/tslibs/timezones.pxd @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- # cython: profile=False -from numpy cimport ndarray - cdef bint is_utc(object tz) cdef bint is_tzlocal(object tz) diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 48d82996a0bd0f..7f778dde86e232 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -1,5 +1,8 @@ # -*- coding: utf-8 -*- # cython: profile=False +# cython: linetrace=False +# distutils: define_macros=CYTHON_TRACE=0 +# distutils: define_macros=CYTHON_TRACE_NOGIL=0 cimport cython from cython cimport Py_ssize_t @@ -275,3 +278,19 @@ cdef object get_dst_info(object tz): dst_cache[cache_key] = (trans, deltas, typ) return dst_cache[cache_key] + + +def infer_tzinfo(start, end): + if start is not None and end is not None: + tz = start.tzinfo + if end.tzinfo: + if not (get_timezone(tz) == get_timezone(end.tzinfo)): + msg = 'Inputs must both have the same timezone, {tz1} != {tz2}' + raise AssertionError(msg.format(tz1=tz, tz2=end.tzinfo)) + elif start is not None: + tz = start.tzinfo + elif end is not None: + tz = end.tzinfo + else: + tz = None + return tz diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 39dc24642235ba..9127864eab8a16 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -443,7 +443,7 @@ def _generate(cls, start, end, periods, name, offset, raise ValueError("Closed has to be either 'left', 'right' or None") try: - inferred_tz = tools._infer_tzinfo(start, end) + inferred_tz = timezones.infer_tzinfo(start, end) except: raise TypeError('Start and end cannot both be tz-aware with ' 'different timezones') diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 8fe28aa4006131..e335dfe3a41421 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -4,7 +4,6 @@ from pandas._libs import tslib from pandas._libs.tslibs.strptime import array_strptime -from pandas._libs.tslibs.timezones import get_timezone from pandas._libs.tslibs import parsing from pandas._libs.tslibs.parsing import ( # noqa parse_time_string, @@ -30,24 +29,6 @@ from pandas.core import algorithms -def _infer_tzinfo(start, end): - def _infer(a, b): - tz = a.tzinfo - if b and b.tzinfo: - if not (get_timezone(tz) == get_timezone(b.tzinfo)): - raise AssertionError('Inputs must both have the same timezone,' - ' {timezone1} != {timezone2}' - 
.format(timezone1=tz, timezone2=b.tzinfo)) - return tz - - tz = None - if start is not None: - tz = _infer(start, end) - elif end is not None: - tz = _infer(end, start) - return tz - - def _guess_datetime_format_for_array(arr, **kwargs): # Try to guess the format based on the first non-NaN element non_nan_elements = notna(arr).nonzero()[0] diff --git a/pandas/tests/tseries/test_timezones.py b/pandas/tests/tseries/test_timezones.py index e7b470e01e2af3..aa8fe90ea65006 100644 --- a/pandas/tests/tseries/test_timezones.py +++ b/pandas/tests/tseries/test_timezones.py @@ -12,7 +12,6 @@ from datetime import datetime, timedelta, tzinfo, date import pandas.util.testing as tm -import pandas.core.tools.datetimes as tools import pandas.tseries.offsets as offsets from pandas.compat import lrange, zip from pandas.core.indexes.datetimes import bdate_range, date_range @@ -646,20 +645,20 @@ def test_infer_tz(self): start = self.localize(eastern, _start) end = self.localize(eastern, _end) - assert (tools._infer_tzinfo(start, end) is self.localize( - eastern, _start).tzinfo) - assert (tools._infer_tzinfo(start, None) is self.localize( - eastern, _start).tzinfo) - assert (tools._infer_tzinfo(None, end) is self.localize(eastern, - _end).tzinfo) + assert (timezones.infer_tzinfo(start, end) is + self.localize(eastern, _start).tzinfo) + assert (timezones.infer_tzinfo(start, None) is + self.localize(eastern, _start).tzinfo) + assert (timezones.infer_tzinfo(None, end) is + self.localize(eastern, _end).tzinfo) start = utc.localize(_start) end = utc.localize(_end) - assert (tools._infer_tzinfo(start, end) is utc) + assert (timezones.infer_tzinfo(start, end) is utc) end = self.localize(eastern, _end) - pytest.raises(Exception, tools._infer_tzinfo, start, end) - pytest.raises(Exception, tools._infer_tzinfo, end, start) + pytest.raises(Exception, timezones.infer_tzinfo, start, end) + pytest.raises(Exception, timezones.infer_tzinfo, end, start) def test_tz_string(self): result = date_range('1/1/2000', periods=10, From ad7d051bdfdbadc4221307d691fd55412d9d7ae8 Mon Sep 17 00:00:00 2001 From: Jean-Mathieu Deschenes Date: Fri, 29 Sep 2017 06:31:22 -0400 Subject: [PATCH 172/188] BUG: DataFrame sort_values and multiple "by" columns fails to order NaT correctly closes #16836 Author: Jean-Mathieu Deschenes This patch had conflicts when merged, resolved by Committer: Jeff Reback Closes #16995 from jdeschenes/datetime_sort_issues and squashes the following commits: 257e10a43 [Jean-Mathieu Deschenes] Changes requested by @jreback c6d55e2ad [Jean-Mathieu Deschenes] Fix for #16836 --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/frame.py | 7 +------ pandas/tests/frame/test_sorting.py | 29 ++++++++++++++++++++++++++++- 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index ee781ec4b0361c..4a3122a78b2340 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -648,6 +648,7 @@ Reshaping - :func:`Series.argmin`, :func:`Series.argmax`, and their counterparts on ``DataFrame`` and groupby objects work correctly with floating point data that contains infinite values (:issue:`13595`). 
- Bug in :func:`unique` where checking a tuple of strings raised a ``TypeError`` (:issue:`17108`) - Bug in :func:`concat` where order of result index was unpredictable if it contained non-comparable elements (:issue:`17344`) +- Fixes regression when sorting by multiple columns on a ``datetime64`` dtype ``Series`` with ``NaT`` values (:issue:`16836`) Numeric ^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 579d9f10d5875b..a12e611f6618a0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3453,18 +3453,13 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, if len(by) > 1: from pandas.core.sorting import lexsort_indexer - def trans(v): - if needs_i8_conversion(v): - return v.view('i8') - return v - keys = [] for x in by: k = self.xs(x, axis=other_axis).values if k.ndim == 2: raise ValueError('Cannot sort by duplicate column %s' % str(x)) - keys.append(trans(k)) + keys.append(k) indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position) indexer = _ensure_platform_int(indexer) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 891c94b59074aa..e6f823bf6fac22 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -269,6 +269,11 @@ def test_sort_datetimes(self): df2 = df.sort_values(by=['B']) assert_frame_equal(df1, df2) + df1 = df.sort_values(by='B') + + df2 = df.sort_values(by=['C', 'B']) + assert_frame_equal(df1, df2) + def test_frame_column_inplace_sort_exception(self): s = self.frame['A'] with tm.assert_raises_regex(ValueError, "This Series is a view"): @@ -321,7 +326,29 @@ def test_sort_nat_values_in_int_column(self): assert_frame_equal(df_sorted, df_reversed) df_sorted = df.sort_values(["datetime", "float"], na_position="last") - assert_frame_equal(df_sorted, df_reversed) + assert_frame_equal(df_sorted, df) + + # Ascending should not affect the results. 
+        df_sorted = df.sort_values(["datetime", "float"], ascending=False)
+        assert_frame_equal(df_sorted, df)
+
+    def test_sort_nat(self):
+
+        # GH 16836
+
+        d1 = [Timestamp(x) for x in ['2016-01-01', '2015-01-01',
+                                     np.nan, '2016-01-01']]
+        d2 = [Timestamp(x) for x in ['2017-01-01', '2014-01-01',
+                                     '2016-01-01', '2015-01-01']]
+        df = pd.DataFrame({'a': d1, 'b': d2}, index=[0, 1, 2, 3])
+
+        d3 = [Timestamp(x) for x in ['2015-01-01', '2016-01-01',
+                                     '2016-01-01', np.nan]]
+        d4 = [Timestamp(x) for x in ['2014-01-01', '2015-01-01',
+                                     '2017-01-01', '2016-01-01']]
+        expected = pd.DataFrame({'a': d3, 'b': d4}, index=[1, 3, 0, 2])
+        sorted_df = df.sort_values(by=['a', 'b'])
+        tm.assert_frame_equal(sorted_df, expected)

 class TestDataFrameSortIndexKinds(TestData):

From e2a0251d32a1467e9ab86281a31f57aca582a88f Mon Sep 17 00:00:00 2001
From: topper-123
Date: Fri, 29 Sep 2017 17:00:08 +0200
Subject: [PATCH 173/188] Doc improvements for IntervalIndex and Interval (#17714)

---
 doc/source/advanced.rst         | 23 +++++++++++++++++++++++
 pandas/_libs/interval.pyx       | 26 ++++++++++++++++++++++----
 pandas/core/indexes/interval.py | 20 +++++++++-----------
 3 files changed, 54 insertions(+), 15 deletions(-)

diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst
index 799d04859cc2ac..cfdb53ec7e4b1a 100644
--- a/doc/source/advanced.rst
+++ b/doc/source/advanced.rst
@@ -833,12 +833,21 @@ Of course if you need integer based selection, then use ``iloc``
 IntervalIndex
 ~~~~~~~~~~~~~

+:class:`IntervalIndex`, together with its own dtype ``interval`` and the
+:class:`Interval` scalar type, allows first-class support in pandas for
+interval notation.
+
+The ``IntervalIndex`` allows some unique indexing and is also used as a
+return type for the categories in :func:`cut` and :func:`qcut`.
+
 .. versionadded:: 0.20.0

 .. warning::

    These indexing behaviors are provisional and may change in a future version of pandas.

+An ``IntervalIndex`` can be used in ``Series`` and in ``DataFrame`` as the index.
+
 .. ipython:: python

    df = pd.DataFrame({'A': [1, 2, 3, 4]},
@@ -860,6 +869,20 @@ If you select a label *contained* within an interval, this will also select the

    df.loc[2.5]
    df.loc[[2.5, 3.5]]

+``Interval`` and ``IntervalIndex`` are used by ``cut`` and ``qcut``:
+
+.. ipython:: python
+
+   c = pd.cut(range(4), bins=2)
+   c
+   c.categories
+
+Furthermore, ``IntervalIndex`` allows one to bin *other* data with these same
+bins, with ``NaN`` representing a missing value similar to other dtypes.
+
+.. ipython:: python
+
+   pd.cut([0, 3, 5, 1], bins=c.categories)

 Miscellaneous indexing FAQ
 --------------------------

diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx
index 306597031817df..264a983fe4d536 100644
--- a/pandas/_libs/interval.pyx
+++ b/pandas/_libs/interval.pyx
@@ -51,17 +51,35 @@ cdef class Interval(IntervalMixin):

     .. versionadded:: 0.20.0

-    Attributes
+    Parameters
     ----------
-    left, right : values
-        Left and right bounds for each interval.
+    left : value
+        Left bound for interval.
+    right : value
+        Right bound for interval.
     closed : {'left', 'right', 'both', 'neither'}
         Whether the interval is closed on the left-side, right-side, both or
         neither. Defaults to 'right'.

+    Examples
+    --------
+    >>> iv = pd.Interval(left=0, right=5)
+    >>> iv
+    Interval(0, 5, closed='right')
+    >>> 2.5 in iv
+    True
+
+    >>> year_2017 = pd.Interval(pd.Timestamp('2017-01-01'),
+    ...                         pd.Timestamp('2017-12-31'), closed='both')
+    >>> pd.Timestamp('2017-01-01 00:00') in year_2017
+    True
+
     See Also
     --------
-    IntervalIndex : an Index of intervals that are all closed on the same side.
+    IntervalIndex : an Index of ``Interval`` objects that are all closed on
+        the same side.
+    cut, qcut : convert arrays of continuous data into categoricals/series of
+        ``Interval``.
     """

     cdef readonly object left, right

diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index a697ed7888f90b..29699f664bbf34 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -105,8 +105,10 @@ class IntervalIndex(IntervalMixin, Index):

     .. versionadded:: 0.20.0

-    Warning: the indexing behaviors are provisional and may change in
-    a future version of pandas.
+    .. warning::
+
+       The indexing behaviors are provisional and may change in
+       a future version of pandas.

     Attributes
     ----------
@@ -147,15 +149,11 @@ class IntervalIndex(IntervalMixin, Index):
     --------
     Index
     Interval : A bounded slice-like interval
-    interval_range : Function to create a fixed frequency IntervalIndex
-    IntervalIndex.from_arrays : Construct an IntervalIndex from a left and
-                                right array
-    IntervalIndex.from_breaks : Construct an IntervalIndex from an array of
-                                splits
-    IntervalIndex.from_intervals : Construct an IntervalIndex from an array of
-                                   Interval objects
-    IntervalIndex.from_tuples : Construct an IntervalIndex from a list/array of
-                                tuples
+    interval_range : Function to create a fixed frequency
+    IntervalIndex, IntervalIndex.from_arrays, IntervalIndex.from_breaks,
+    IntervalIndex.from_intervals, IntervalIndex.from_tuples
+    cut, qcut : convert arrays of continuous data into categoricals/series of
+        ``Interval``.
     """
     _typ = 'intervalindex'
     _comparables = ['name']

From b8467c00f78eec73efd14f159f1ba935a65b4ee7 Mon Sep 17 00:00:00 2001
From: topper-123
Date: Sat, 30 Sep 2017 17:25:57 +0200
Subject: [PATCH 174/188] DOC: Add examples for MultiIndex.get_locs + cleanups (#17675)

---
 pandas/core/categorical.py   |  2 +-
 pandas/core/indexes/multi.py | 65 +++++++++++++++++++++++++-----------
 2 files changed, 47 insertions(+), 20 deletions(-)

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index d79937829cf3fb..61e28dde2e34c0 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -229,7 +229,7 @@ class Categorical(PandasObject):

     See also
     --------
-    pandas.api.types.CategoricalDtype
+    pandas.api.types.CategoricalDtype : Type for categorical data
     CategoricalIndex : An Index with an underlying ``Categorical``
     """

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 35f738b347a3eb..9ffac0832062d0 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -72,8 +72,8 @@ class MultiIndex(Index):
     Examples
     --------
     A new ``MultiIndex`` is typically constructed using one of the helper
-    methods :meth:`MultiIndex.from_arrays``, :meth:`MultiIndex.from_product``
-    and :meth:`MultiIndex.from_tuples``. For example (using ``.from_arrays``):
+    methods :meth:`MultiIndex.from_arrays`, :meth:`MultiIndex.from_product`
+    and :meth:`MultiIndex.from_tuples`. For example (using ``.from_arrays``):

     >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']]
     >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
@@ -1982,33 +1982,41 @@ def _partial_tup_index(self, tup, side='left'):

     def get_loc(self, key, method=None):
         """
-        Get integer location, slice or boolean mask for requested label or
-        tuple. If the key is past the lexsort depth, the return may be a
-        boolean mask array, otherwise it is always a slice or int.
+        Get location for a label or a tuple of labels as an integer, slice or
+        boolean mask.

         Parameters
         ----------
-        key : label or tuple
+        key : label or tuple of labels (one for each level)
         method : None

         Returns
         -------
         loc : int, slice object or boolean mask
+            If the key is past the lexsort depth, the return may be a
+            boolean mask array, otherwise it is always a slice or int.

         Examples
         --------
         >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')])
+
         >>> mi.get_loc('b')
         slice(1, 3, None)
+
         >>> mi.get_loc(('b', 'e'))
         1

+        Notes
+        -----
+        The key cannot be a slice, list of same-level labels, a boolean mask,
+        or a sequence of such. If you want to use those, use
+        :meth:`MultiIndex.get_locs` instead.
+
         See also
         --------
         Index.get_loc : get_loc method for (single-level) index.
-        get_locs : Given a tuple of slices/lists/labels/boolean indexer to a
-                   level-wise spec, produce an indexer to extract those
-                   locations.
+        MultiIndex.get_locs : Get location for a label/slice/list/mask or a
+                              sequence of such.
         """
         if method is not None:
             raise NotImplementedError('only the default get_loc method is '
@@ -2117,8 +2125,9 @@ def get_loc_level(self, key, level=0, drop_level=True):

         See Also
         --------
-        MultiIndex.get_loc : Get integer location, slice or boolean mask for
-                             requested label or tuple.
+        MultiIndex.get_loc : Get location for a label or a tuple of labels.
+        MultiIndex.get_locs : Get location for a label/slice/list/mask or a
+                              sequence of such
         """
         def maybe_droplevels(indexer, levels, drop_level):
@@ -2328,23 +2337,41 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels):
             j = labels.searchsorted(loc, side='right')
             return slice(i, j)

-    def get_locs(self, tup):
+    def get_locs(self, seq):
         """
-        Given a tuple of slices/lists/labels/boolean indexer to a level-wise
-        spec produce an indexer to extract those locations
+        Get location for a given label/slice/list/mask or a sequence of such,
+        as an array of integers.

         Parameters
         ----------
-        key : tuple of (slices/list/labels)
+        seq : label/slice/list/mask or a sequence of such
+            You should use one of the above for each level.
+            If a level should not be used, set it to ``slice(None)``.

         Returns
         -------
-        locs : integer list of locations or boolean indexer suitable
-               for passing to iloc
+        locs : array of integers suitable for passing to iloc
+
+        Examples
+        --------
+        >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')])
+
+        >>> mi.get_locs('b')
+        array([1, 2], dtype=int64)
+
+        >>> mi.get_locs([slice(None), ['e', 'f']])
+        array([1, 2], dtype=int64)
+
+        >>> mi.get_locs([[True, False, True], slice('e', 'f')])
+        array([2], dtype=int64)
+
+        See also
+        --------
+        MultiIndex.get_loc : Get location for a label or a tuple of labels.
         """
         # must be lexsorted to at least as many levels
-        true_slices = [i for (i, s) in enumerate(is_true_slices(tup)) if s]
+        true_slices = [i for (i, s) in enumerate(is_true_slices(seq)) if s]
         if true_slices and true_slices[-1] >= self.lexsort_depth:
             raise UnsortedIndexError('MultiIndex slicing requires the index '
                                      'to be lexsorted: slicing on levels {0}, '
@@ -2377,7 +2404,7 @@ def _update_indexer(idxr, indexer=indexer):
                 return indexer
             return indexer & idxr

-        for i, k in enumerate(tup):
+        for i, k in enumerate(seq):

             if is_bool_indexer(k):

                 # a boolean indexer, must be the same length!
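For reference, a small sketch (illustrative only, not part of the patch) exercising the two methods exactly as the docstrings above describe: ``get_loc`` for a single label or tuple of labels, ``get_locs`` for per-level slices/lists/masks.

    import pandas as pd

    mi = pd.MultiIndex.from_arrays([list('abb'), list('def')])

    # get_loc resolves one label (or one tuple of per-level labels)
    assert mi.get_loc('b') == slice(1, 3, None)
    assert mi.get_loc(('b', 'e')) == 1

    # get_locs takes one indexer per level and returns integer positions
    assert list(mi.get_locs([slice(None), ['e', 'f']])) == [1, 2]
    assert list(mi.get_locs([[True, False, True], slice('e', 'f')])) == [2]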
From 030e374940a93b7920c0c2ac5c950668564c3703 Mon Sep 17 00:00:00 2001 From: huashuai Date: Sat, 30 Sep 2017 12:33:52 -0700 Subject: [PATCH 175/188] BUG: Fix series rename called with str altering name rather index (GH17407) (#17654) * BUG: Fix series rename called with str altering the name. GH17407 * add whatsnew for the fix for #17407 * Fix typo in whatsnew * remove whitespace * Update code after @jreback's comments * Change `or` to `and` for checking iterable * Only check against Iterable in is_list_like and add test for `str` * Update v0.21.0.txt --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/dtypes/inference.py | 3 ++- pandas/tests/dtypes/test_inference.py | 2 +- pandas/tests/series/test_indexing.py | 10 ++++++++++ 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 4a3122a78b2340..e0e0c180525501 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -589,6 +589,7 @@ Indexing - Bug in intersection of ``RangeIndex`` with negative step (:issue:`17296`) - Bug in ``IntervalIndex`` where performing a scalar lookup fails for included right endpoints of non-overlapping monotonic decreasing indexes (:issue:`16417`, :issue:`17271`) - Bug in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` when no valid entry (:issue:`17400`) +- Bug in :func:`Series.rename` when called with a `callable`, incorrectly alters the name of the `Series`, rather than the name of the `Index`. (:issue:`17407`) I/O ^^^ diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index ff7e215951a1f7..de769c69f44fd0 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -3,6 +3,7 @@ import collections import re import numpy as np +from collections import Iterable from numbers import Number from pandas.compat import (PY2, string_types, text_type, string_and_binary_types) @@ -262,7 +263,7 @@ def is_list_like(obj): False """ - return (hasattr(obj, '__iter__') and + return (isinstance(obj, Iterable) and not isinstance(obj, string_and_binary_types)) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index dbde7ae5081d4a..857f7a283aa951 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -58,7 +58,7 @@ def __getitem__(self): def test_is_list_like(): passes = ([], [1], (1, ), (1, 2), {'a': 1}, set([1, 'a']), Series([1]), Series([]), Series(['a']).str) - fails = (1, '2', object()) + fails = (1, '2', object(), str) for p in passes: assert inference.is_list_like(p) diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index 2182e3fbfc2129..83d6a09d38f415 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -2188,6 +2188,16 @@ def test_reindex_fill_value(self): expected = Series([False, True, False], index=[1, 2, 3]) assert_series_equal(result, expected) + def test_rename(self): + + # GH 17407 + s = Series(range(1, 6), index=pd.Index(range(2, 7), name='IntIndex')) + result = s.rename(str) + expected = s.rename(lambda i: str(i)) + assert_series_equal(result, expected) + + assert result.name == expected.name + def test_select(self): n = len(self.ts) result = self.ts.select(lambda x: x >= self.ts.index[n // 2]) From baadad7581c48b0b1c6401b7e3b32fd09e7f0863 Mon Sep 17 00:00:00 2001 From: MarsGuy Date: Sun, 1 Oct 2017 17:55:40 +0530 Subject: [PATCH 176/188] DOC: Fixed typo in 
documentation for 'pandas.DataFrame.replace' (#17731) --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2fb0e348c01c0b..6fd4f3eeb6b907 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4260,7 +4260,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, dicts of such objects are also allowed. inplace : boolean, default False If True, in place. Note: this will modify any - other views on this object (e.g. a column form a DataFrame). + other views on this object (e.g. a column from a DataFrame). Returns the caller if this is True. limit : int, default None Maximum size gap to forward or backward fill From fd336fbea59edf6324d5c4ac8b22ed696312f50e Mon Sep 17 00:00:00 2001 From: Licht Takeuchi Date: Sun, 1 Oct 2017 23:53:45 +0900 Subject: [PATCH 177/188] BUG: Fix unexpected sort in groupby (#17621) --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/generic.py | 2 +- pandas/core/groupby.py | 23 +++++++++++-- pandas/tests/groupby/test_groupby.py | 47 +++++++++++++++----------- pandas/tests/groupby/test_whitelist.py | 16 ++++++--- pandas/tests/test_multilevel.py | 17 +++++++--- 6 files changed, 76 insertions(+), 30 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index e0e0c180525501..11eba13dd0f1f1 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -627,6 +627,7 @@ Groupby/Resample/Rolling - Bug in ``.rolling(...).apply(...)`` with a ``DataFrame`` with a ``DatetimeIndex``, a ``window`` of a timedelta-convertible and ``min_periods >= 1` (:issue:`15305`) - Bug in ``DataFrame.groupby`` where index and column keys were not recognized correctly when the number of keys equaled the number of elements on the groupby axis (:issue:`16859`) - Bug in ``groupby.nunique()`` with ``TimeGrouper`` which cannot handle ``NaT`` correctly (:issue:`17575`) +- Bug in ``DataFrame.groupby`` where a single level selection from a ``MultiIndex`` unexpectedly sorts (:issue:`17537`) Sparse ^^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6fd4f3eeb6b907..4f6fd0828693e1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6631,7 +6631,7 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, return rs def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): - grouped = self.groupby(level=level, axis=axis) + grouped = self.groupby(level=level, axis=axis, sort=False) if hasattr(grouped, name) and skipna: return getattr(grouped, name)(**kwargs) axis = self._get_axis_number(axis) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index a62ae40a85941f..2f2056279558d3 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2586,10 +2586,27 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, """ group_axis = obj._get_axis(axis) - # validate that the passed level is compatible with the passed + # validate that the passed single level is compatible with the passed # axis of the object if level is not None: - if not isinstance(group_axis, MultiIndex): + # TODO: These if-block and else-block are almost same. + # MultiIndex instance check is removable, but it seems that there are + # some processes only for non-MultiIndex in else-block, + # eg. `obj.index.name != level`. We have to consider carefully whether + # these are applicable for MultiIndex. 
Even if these are applicable, + # we need to check if it makes no side effect to subsequent processes + # on the outside of this condition. + # (GH 17621) + if isinstance(group_axis, MultiIndex): + if is_list_like(level) and len(level) == 1: + level = level[0] + + if key is None and is_scalar(level): + # Get the level values from group_axis + key = group_axis.get_level_values(level) + level = None + + else: # allow level to be a length-one list-like object # (e.g., level=[0]) # GH 13901 @@ -2611,6 +2628,8 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, raise ValueError('level > 0 or level < -1 only valid with ' ' MultiIndex') + # NOTE: `group_axis` and `group_axis.get_level_values(level)` + # are same in this section. level = None key = group_axis diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index d91cff436dee2e..47bf837fa62d95 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1791,18 +1791,20 @@ def aggfun(ser): agged2 = df.groupby(keys).aggregate(aggfun) assert len(agged2.columns) + 1 == len(df.columns) - def test_groupby_level(self): + @pytest.mark.parametrize('sort', [True, False]) + def test_groupby_level(self, sort): + # GH 17537 frame = self.mframe deleveled = frame.reset_index() - result0 = frame.groupby(level=0).sum() - result1 = frame.groupby(level=1).sum() + result0 = frame.groupby(level=0, sort=sort).sum() + result1 = frame.groupby(level=1, sort=sort).sum() - expected0 = frame.groupby(deleveled['first'].values).sum() - expected1 = frame.groupby(deleveled['second'].values).sum() + expected0 = frame.groupby(deleveled['first'].values, sort=sort).sum() + expected1 = frame.groupby(deleveled['second'].values, sort=sort).sum() - expected0 = expected0.reindex(frame.index.levels[0]) - expected1 = expected1.reindex(frame.index.levels[1]) + expected0.index.name = 'first' + expected1.index.name = 'second' assert result0.index.name == 'first' assert result1.index.name == 'second' @@ -1813,15 +1815,15 @@ def test_groupby_level(self): assert result1.index.name == frame.index.names[1] # groupby level name - result0 = frame.groupby(level='first').sum() - result1 = frame.groupby(level='second').sum() + result0 = frame.groupby(level='first', sort=sort).sum() + result1 = frame.groupby(level='second', sort=sort).sum() assert_frame_equal(result0, expected0) assert_frame_equal(result1, expected1) # axis=1 - result0 = frame.T.groupby(level=0, axis=1).sum() - result1 = frame.T.groupby(level=1, axis=1).sum() + result0 = frame.T.groupby(level=0, axis=1, sort=sort).sum() + result1 = frame.T.groupby(level=1, axis=1, sort=sort).sum() assert_frame_equal(result0, expected0.T) assert_frame_equal(result1, expected1.T) @@ -1835,15 +1837,17 @@ def test_groupby_level_index_names(self): df.groupby(level='exp') pytest.raises(ValueError, df.groupby, level='foo') - def test_groupby_level_with_nas(self): + @pytest.mark.parametrize('sort', [True, False]) + def test_groupby_level_with_nas(self, sort): + # GH 17537 index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], labels=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]]) # factorizing doesn't confuse things s = Series(np.arange(8.), index=index) - result = s.groupby(level=0).sum() - expected = Series([22., 6.], index=[1, 0]) + result = s.groupby(level=0, sort=sort).sum() + expected = Series([6., 22.], index=[0, 1]) assert_series_equal(result, expected) index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], @@ -1852,8 +1856,8 @@ def test_groupby_level_with_nas(self): 
# factorizing doesn't confuse things s = Series(np.arange(8.), index=index) - result = s.groupby(level=0).sum() - expected = Series([18., 6.], index=[1, 0]) + result = s.groupby(level=0, sort=sort).sum() + expected = Series([6., 18.], index=[0.0, 1.0]) assert_series_equal(result, expected) def test_groupby_level_apply(self): @@ -1936,9 +1940,14 @@ def test_groupby_complex(self): result = a.sum(level=0) assert_series_equal(result, expected) - def test_level_preserve_order(self): - grouped = self.mframe.groupby(level=0) - exp_labels = np.array([0, 0, 0, 1, 1, 2, 2, 3, 3, 3], np.intp) + @pytest.mark.parametrize('sort,labels', [ + [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]], + [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]] + ]) + def test_level_preserve_order(self, sort, labels): + # GH 17537 + grouped = self.mframe.groupby(level=0, sort=sort) + exp_labels = np.array(labels, np.intp) assert_almost_equal(grouped.grouper.labels[0], exp_labels) def test_grouping_labels(self): diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 1c5161d2ffb431..259f466316c414 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -174,12 +174,16 @@ def raw_frame(): @pytest.mark.parametrize( - "op, level, axis, skipna", + "op, level, axis, skipna, sort", product(AGG_FUNCTIONS, lrange(2), lrange(2), + [True, False], [True, False])) -def test_regression_whitelist_methods(raw_frame, op, level, axis, skipna): +def test_regression_whitelist_methods( + raw_frame, op, level, + axis, skipna, sort): # GH6944 + # GH 17537 # explicity test the whitelest methods if axis == 0: @@ -188,15 +192,19 @@ def test_regression_whitelist_methods(raw_frame, op, level, axis, skipna): frame = raw_frame.T if op in AGG_FUNCTIONS_WITH_SKIPNA: - grouped = frame.groupby(level=level, axis=axis) + grouped = frame.groupby(level=level, axis=axis, sort=sort) result = getattr(grouped, op)(skipna=skipna) expected = getattr(frame, op)(level=level, axis=axis, skipna=skipna) + if sort: + expected = expected.sort_index(axis=axis, level=level) tm.assert_frame_equal(result, expected) else: - grouped = frame.groupby(level=level, axis=axis) + grouped = frame.groupby(level=level, axis=axis, sort=sort) result = getattr(grouped, op)() expected = getattr(frame, op)(level=level, axis=axis) + if sort: + expected = expected.sort_index(axis=axis, level=level) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 6976fe162c5d5c..050335988ca417 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1392,17 +1392,23 @@ def test_count(self): AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', 'mad', 'std', 'var', 'sem'] - def test_series_group_min_max(self): + @pytest.mark.parametrize('sort', [True, False]) + def test_series_group_min_max(self, sort): + # GH 17537 for op, level, skipna in cart_product(self.AGG_FUNCTIONS, lrange(2), [False, True]): - grouped = self.series.groupby(level=level) + grouped = self.series.groupby(level=level, sort=sort) aggf = lambda x: getattr(x, op)(skipna=skipna) # skipna=True leftside = grouped.agg(aggf) rightside = getattr(self.series, op)(level=level, skipna=skipna) + if sort: + rightside = rightside.sort_index(level=level) tm.assert_series_equal(leftside, rightside) - def test_frame_group_ops(self): + @pytest.mark.parametrize('sort', [True, False]) + def test_frame_group_ops(self, sort): + # GH 17537 self.frame.iloc[1, [1, 2]] = np.nan 
self.frame.iloc[7, [0, 1]] = np.nan @@ -1415,7 +1421,7 @@ def test_frame_group_ops(self): else: frame = self.frame.T - grouped = frame.groupby(level=level, axis=axis) + grouped = frame.groupby(level=level, axis=axis, sort=sort) pieces = [] @@ -1426,6 +1432,9 @@ def aggf(x): leftside = grouped.agg(aggf) rightside = getattr(frame, op)(level=level, axis=axis, skipna=skipna) + if sort: + rightside = rightside.sort_index(level=level, axis=axis) + frame = frame.sort_index(level=level, axis=axis) # for good measure, groupby detail level_index = frame._get_axis(axis).levels[level] From 7d4a260cbe6d5c1825541adcd0d5310f32a3ba42 Mon Sep 17 00:00:00 2001 From: Andreas Winkler Date: Sun, 1 Oct 2017 16:55:32 +0200 Subject: [PATCH 178/188] BUG: Fix some PeriodIndex resampling issues (#16153) closes #15944 xref #12884 closes #13083 closes #13224 --- doc/source/whatsnew/v0.21.0.txt | 76 ++++++++ pandas/core/resample.py | 132 ++++++++------ pandas/tests/test_resample.py | 304 ++++++++++++++++++++------------ 3 files changed, 340 insertions(+), 172 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 11eba13dd0f1f1..1094e96bd0d201 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -171,6 +171,82 @@ Other Enhancements Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. _whatsnew_0210.api_breaking.period_index_resampling: + +``PeriodIndex`` resampling +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions of pandas, resampling a ``Series``/``DataFrame`` indexed by a ``PeriodIndex`` returned a ``DatetimeIndex`` in some cases (:issue:`12884`). Resampling to a multiplied frequency now returns a ``PeriodIndex`` (:issue:`15944`). As a minor enhancement, resampling a ``PeriodIndex`` can now handle ``NaT`` values (:issue:`13224`) + +Previous Behavior: + +.. code-block:: ipython + + In [1]: pi = pd.period_range('2017-01', periods=12, freq='M') + + In [2]: s = pd.Series(np.arange(12), index=pi) + + In [3]: resampled = s.resample('2Q').mean() + + In [4]: resampled + Out[4]: + 2017-03-31 1.0 + 2017-09-30 5.5 + 2018-03-31 10.0 + Freq: 2Q-DEC, dtype: float64 + + In [5]: resampled.index + Out[5]: DatetimeIndex(['2017-03-31', '2017-09-30', '2018-03-31'], dtype='datetime64[ns]', freq='2Q-DEC') + +New Behavior: + +.. ipython:: python + + pi = pd.period_range('2017-01', periods=12, freq='M') + + s = pd.Series(np.arange(12), index=pi) + + resampled = s.resample('2Q').mean() + + resampled + + resampled.index + + +Upsampling and calling ``.ohlc()`` previously returned a ``Series``, basically identical to calling ``.asfreq()``. OHLC upsampling now returns a DataFrame with columns ``open``, ``high``, ``low`` and ``close`` (:issue:`13083`). This is consistent with downsampling and ``DatetimeIndex`` behavior. + +Previous Behavior: + +.. code-block:: ipython + + In [1]: pi = pd.PeriodIndex(start='2000-01-01', freq='D', periods=10) + + In [2]: s = pd.Series(np.arange(10), index=pi) + + In [3]: s.resample('H').ohlc() + Out[3]: + 2000-01-01 00:00 0.0 + ... + 2000-01-10 23:00 NaN + Freq: H, Length: 240, dtype: float64 + + In [4]: s.resample('M').ohlc() + Out[4]: + open high low close + 2000-01 0 9 0 9 + +New Behavior: + +.. ipython:: python + + pi = pd.PeriodIndex(start='2000-01-01', freq='D', periods=10) + + s = pd.Series(np.arange(10), index=pi) + + s.resample('H').ohlc() + + s.resample('M').ohlc() + .. 
_whatsnew_0210.api_breaking.deps: diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 01c7e875b8eccc..083fbcaaabe460 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -14,7 +14,7 @@ from pandas.core.indexes.datetimes import DatetimeIndex, date_range from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.tseries.offsets import DateOffset, Tick, Day, _delta_to_nanoseconds -from pandas.core.indexes.period import PeriodIndex, period_range +from pandas.core.indexes.period import PeriodIndex import pandas.core.common as com import pandas.core.algorithms as algos from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries @@ -834,53 +834,32 @@ class PeriodIndexResampler(DatetimeIndexResampler): def _resampler_for_grouping(self): return PeriodIndexResamplerGroupby + def _get_binner_for_time(self): + if self.kind == 'timestamp': + return super(PeriodIndexResampler, self)._get_binner_for_time() + return self.groupby._get_period_bins(self.ax) + def _convert_obj(self, obj): obj = super(PeriodIndexResampler, self)._convert_obj(obj) - offset = to_offset(self.freq) - if offset.n > 1: - if self.kind == 'period': # pragma: no cover - print('Warning: multiple of frequency -> timestamps') - - # Cannot have multiple of periods, convert to timestamp + if self._from_selection: + # see GH 14008, GH 12871 + msg = ("Resampling from level= or on= selection" + " with a PeriodIndex is not currently supported," + " use .set_index(...) to explicitly set index") + raise NotImplementedError(msg) + + if self.loffset is not None: + # Cannot apply loffset/timedelta to PeriodIndex -> convert to + # timestamps self.kind = 'timestamp' # convert to timestamp - if not (self.kind is None or self.kind == 'period'): - if self._from_selection: - # see GH 14008, GH 12871 - msg = ("Resampling from level= or on= selection" - " with a PeriodIndex is not currently supported," - " use .set_index(...) to explicitly set index") - raise NotImplementedError(msg) - else: - obj = obj.to_timestamp(how=self.convention) + if self.kind == 'timestamp': + obj = obj.to_timestamp(how=self.convention) return obj - def aggregate(self, arg, *args, **kwargs): - result, how = self._aggregate(arg, *args, **kwargs) - if result is None: - result = self._downsample(arg, *args, **kwargs) - - result = self._apply_loffset(result) - return result - - agg = aggregate - - def _get_new_index(self): - """ return our new index """ - ax = self.ax - - if len(ax) == 0: - values = [] - else: - start = ax[0].asfreq(self.freq, how=self.convention) - end = ax[-1].asfreq(self.freq, how='end') - values = period_range(start, end, freq=self.freq).asi8 - - return ax._shallow_copy(values, freq=self.freq) - def _downsample(self, how, **kwargs): """ Downsample the cython defined function @@ -898,22 +877,17 @@ def _downsample(self, how, **kwargs): how = self._is_cython_func(how) or how ax = self.ax - new_index = self._get_new_index() - - # Start vs. 
end of period - memb = ax.asfreq(self.freq, how=self.convention) - if is_subperiod(ax.freq, self.freq): # Downsampling - if len(new_index) == 0: - bins = [] - else: - i8 = memb.asi8 - rng = np.arange(i8[0], i8[-1] + 1) - bins = memb.searchsorted(rng, side='right') - grouper = BinGrouper(bins, new_index) - return self._groupby_and_aggregate(how, grouper=grouper) + return self._groupby_and_aggregate(how, grouper=self.grouper) elif is_superperiod(ax.freq, self.freq): + if how == 'ohlc': + # GH #13083 + # upsampling to subperiods is handled as an asfreq, which works + # for pure aggregating/reducing methods + # OHLC reduces along the time dimension, but creates multiple + # values for each period -> handle by _groupby_and_aggregate() + return self._groupby_and_aggregate(how, grouper=self.grouper) return self.asfreq() elif ax.freq == self.freq: return self.asfreq() @@ -936,19 +910,16 @@ def _upsample(self, method, limit=None, fill_value=None): .fillna """ - if self._from_selection: - raise ValueError("Upsampling from level= or on= selection" - " is not supported, use .set_index(...)" - " to explicitly set index to" - " datetime-like") + # we may need to actually resample as if we are timestamps if self.kind == 'timestamp': return super(PeriodIndexResampler, self)._upsample( method, limit=limit, fill_value=fill_value) + self._set_binner() ax = self.ax obj = self.obj - new_index = self._get_new_index() + new_index = self.binner # Start vs. end of period memb = ax.asfreq(self.freq, how=self.convention) @@ -1293,6 +1264,51 @@ def _get_time_period_bins(self, ax): return binner, bins, labels + def _get_period_bins(self, ax): + if not isinstance(ax, PeriodIndex): + raise TypeError('axis must be a PeriodIndex, but got ' + 'an instance of %r' % type(ax).__name__) + + memb = ax.asfreq(self.freq, how=self.convention) + + # NaT handling as in pandas._lib.lib.generate_bins_dt64() + nat_count = 0 + if memb.hasnans: + nat_count = np.sum(memb._isnan) + memb = memb[~memb._isnan] + + # if index contains no valid (non-NaT) values, return empty index + if not len(memb): + binner = labels = PeriodIndex( + data=[], freq=self.freq, name=ax.name) + return binner, [], labels + + start = ax.min().asfreq(self.freq, how=self.convention) + end = ax.max().asfreq(self.freq, how='end') + + labels = binner = PeriodIndex(start=start, end=end, + freq=self.freq, name=ax.name) + + i8 = memb.asi8 + freq_mult = self.freq.n + + # when upsampling to subperiods, we need to generate enough bins + expected_bins_count = len(binner) * freq_mult + i8_extend = expected_bins_count - (i8[-1] - i8[0]) + rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult) + rng += freq_mult + bins = memb.searchsorted(rng, side='left') + + if nat_count > 0: + # NaT handling as in pandas._lib.lib.generate_bins_dt64() + # shift bins by the number of NaT + bins += nat_count + bins = np.insert(bins, 0, nat_count) + binner = binner.insert(0, tslib.NaT) + labels = labels.insert(0, tslib.NaT) + + return binner, bins, labels + def _take_new_index(obj, indexer, new_index, axis=0): from pandas.core.api import Series, DataFrame diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index 7449beb8f97dfe..cd15203eccd826 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -18,7 +18,7 @@ from pandas.core.dtypes.generic import ABCSeries, ABCDataFrame from pandas.compat import range, lrange, zip, product, OrderedDict -from pandas.core.base import SpecificationError +from pandas.core.base import SpecificationError, 
AbstractMethodError from pandas.errors import UnsupportedFunctionCall from pandas.core.groupby import DataError from pandas.tseries.frequencies import MONTHS, DAYS @@ -698,35 +698,58 @@ def create_index(self, *args, **kwargs): factory = self._index_factory() return factory(*args, **kwargs) - def test_asfreq_downsample(self): - s = self.create_series() - - result = s.resample('2D').asfreq() - expected = s.reindex(s.index.take(np.arange(0, len(s.index), 2))) - expected.index.freq = to_offset('2D') - assert_series_equal(result, expected) - - frame = s.to_frame('value') - result = frame.resample('2D').asfreq() - expected = frame.reindex( - frame.index.take(np.arange(0, len(frame.index), 2))) - expected.index.freq = to_offset('2D') - assert_frame_equal(result, expected) - - def test_asfreq_upsample(self): - s = self.create_series() - - result = s.resample('1H').asfreq() - new_index = self.create_index(s.index[0], s.index[-1], freq='1H') - expected = s.reindex(new_index) - assert_series_equal(result, expected) - - frame = s.to_frame('value') - result = frame.resample('1H').asfreq() - new_index = self.create_index(frame.index[0], - frame.index[-1], freq='1H') - expected = frame.reindex(new_index) - assert_frame_equal(result, expected) + @pytest.fixture + def _index_start(self): + return datetime(2005, 1, 1) + + @pytest.fixture + def _index_end(self): + return datetime(2005, 1, 10) + + @pytest.fixture + def _index_freq(self): + return 'D' + + @pytest.fixture + def index(self, _index_start, _index_end, _index_freq): + return self.create_index(_index_start, _index_end, freq=_index_freq) + + @pytest.fixture + def _series_name(self): + raise AbstractMethodError(self) + + @pytest.fixture + def _static_values(self, index): + return np.arange(len(index)) + + @pytest.fixture + def series(self, index, _series_name, _static_values): + return Series(_static_values, index=index, name=_series_name) + + @pytest.fixture + def frame(self, index, _static_values): + return DataFrame({'value': _static_values}, index=index) + + @pytest.fixture(params=[Series, DataFrame]) + def series_and_frame(self, request, index, _series_name, _static_values): + if request.param == Series: + return Series(_static_values, index=index, name=_series_name) + if request.param == DataFrame: + return DataFrame({'value': _static_values}, index=index) + + @pytest.mark.parametrize('freq', ['2D', '1H']) + def test_asfreq(self, series_and_frame, freq): + obj = series_and_frame + + result = obj.resample(freq).asfreq() + if freq == '2D': + new_index = obj.index.take(np.arange(0, len(obj.index), 2)) + new_index.freq = to_offset('2D') + else: + new_index = self.create_index(obj.index[0], obj.index[-1], + freq=freq) + expected = obj.reindex(new_index) + assert_almost_equal(result, expected) def test_asfreq_fill_value(self): # test for fill value during resampling, issue 3715 @@ -824,7 +847,7 @@ def test_resample_loffset_arg_type(self): periods=len(df.index) / 2, freq='2D') - # loffset coreces PeriodIndex to DateTimeIndex + # loffset coerces PeriodIndex to DateTimeIndex if isinstance(expected_index, PeriodIndex): expected_index = expected_index.to_timestamp() @@ -866,6 +889,10 @@ def test_apply_to_empty_series(self): class TestDatetimeIndex(Base): _index_factory = lambda x: date_range + @pytest.fixture + def _series_name(self): + return 'dti' + def setup_method(self, method): dti = DatetimeIndex(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq='Min') @@ -2214,57 +2241,35 @@ def test_resample_datetime_values(self): class 
TestPeriodIndex(Base): _index_factory = lambda x: period_range + @pytest.fixture + def _series_name(self): + return 'pi' + def create_series(self): + # TODO: replace calls to .create_series() by injecting the series + # fixture i = period_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq='D') return Series(np.arange(len(i)), index=i, name='pi') - def test_asfreq_downsample(self): - - # series - s = self.create_series() - expected = s.reindex(s.index.take(np.arange(0, len(s.index), 2))) - expected.index = expected.index.to_timestamp() - expected.index.freq = to_offset('2D') - - # this is a bug, this *should* return a PeriodIndex - # directly - # GH 12884 - result = s.resample('2D').asfreq() - assert_series_equal(result, expected) - - # frame - frame = s.to_frame('value') - expected = frame.reindex( - frame.index.take(np.arange(0, len(frame.index), 2))) - expected.index = expected.index.to_timestamp() - expected.index.freq = to_offset('2D') - result = frame.resample('2D').asfreq() - assert_frame_equal(result, expected) - - def test_asfreq_upsample(self): - - # this is a bug, this *should* return a PeriodIndex - # directly - # GH 12884 - s = self.create_series() - new_index = date_range(s.index[0].to_timestamp(how='start'), - (s.index[-1] + 1).to_timestamp(how='start'), - freq='1H', - closed='left') - expected = s.to_timestamp().reindex(new_index).to_period() - result = s.resample('1H').asfreq() - assert_series_equal(result, expected) - - frame = s.to_frame('value') - new_index = date_range(frame.index[0].to_timestamp(how='start'), - (frame.index[-1] + 1).to_timestamp(how='start'), - freq='1H', - closed='left') - expected = frame.to_timestamp().reindex(new_index).to_period() - result = frame.resample('1H').asfreq() - assert_frame_equal(result, expected) + @pytest.mark.parametrize('freq', ['2D', '1H', '2H']) + @pytest.mark.parametrize('kind', ['period', None, 'timestamp']) + def test_asfreq(self, series_and_frame, freq, kind): + # GH 12884, 15944 + # make sure .asfreq() returns PeriodIndex (except kind='timestamp') + + obj = series_and_frame + if kind == 'timestamp': + expected = obj.to_timestamp().resample(freq).asfreq() + else: + start = obj.index[0].to_timestamp(how='start') + end = (obj.index[-1] + 1).to_timestamp(how='start') + new_index = date_range(start=start, end=end, freq=freq, + closed='left') + expected = obj.to_timestamp().reindex(new_index).to_period(freq) + result = obj.resample(freq, kind=kind).asfreq() + assert_almost_equal(result, expected) def test_asfreq_fill_value(self): # test for fill value during resampling, issue 3715 @@ -2285,8 +2290,9 @@ def test_asfreq_fill_value(self): result = frame.resample('1H', kind='timestamp').asfreq(fill_value=3.0) assert_frame_equal(result, expected) - def test_selection(self): - index = self.create_series().index + @pytest.mark.parametrize('freq', ['H', '12H', '2D', 'W']) + @pytest.mark.parametrize('kind', [None, 'period', 'timestamp']) + def test_selection(self, index, freq, kind): # This is a bug, these should be implemented # GH 14008 df = pd.DataFrame({'date': index, @@ -2294,12 +2300,10 @@ def test_selection(self): index=pd.MultiIndex.from_arrays([ np.arange(len(index), dtype=np.int64), index], names=['v', 'd'])) - with pytest.raises(NotImplementedError): - df.resample('2D', on='date') - + df.resample(freq, on='date', kind=kind) with pytest.raises(NotImplementedError): - df.resample('2D', level='d') + df.resample(freq, level='d', kind=kind) def test_annual_upsample_D_s_f(self): self._check_annual_upsample_cases('D', 'start', 
'ffill') @@ -2366,15 +2370,14 @@ def test_not_subperiod(self): pytest.raises(ValueError, lambda: ts.resample('M').mean()) pytest.raises(ValueError, lambda: ts.resample('w-thu').mean()) - def test_basic_upsample(self): + @pytest.mark.parametrize('freq', ['D', '2D']) + def test_basic_upsample(self, freq): ts = _simple_pts('1/1/1990', '6/30/1995', freq='M') result = ts.resample('a-dec').mean() - resampled = result.resample('D', convention='end').ffill() - - expected = result.to_timestamp('D', how='end') - expected = expected.asfreq('D', 'ffill').to_period() - + resampled = result.resample(freq, convention='end').ffill() + expected = result.to_timestamp(freq, how='end') + expected = expected.asfreq(freq, 'ffill').to_period(freq) assert_series_equal(resampled, expected) def test_upsample_with_limit(self): @@ -2440,16 +2443,15 @@ def test_resample_basic(self): result2 = s.resample('T', kind='period').mean() assert_series_equal(result2, expected) - def test_resample_count(self): - + @pytest.mark.parametrize('freq,expected_vals', [('M', [31, 29, 31, 9]), + ('2M', [31 + 29, 31 + 9])]) + def test_resample_count(self, freq, expected_vals): # GH12774 - series = pd.Series(1, index=pd.period_range(start='2000', - periods=100)) - result = series.resample('M').count() - - expected_index = pd.period_range(start='2000', freq='M', periods=4) - expected = pd.Series([31, 29, 31, 9], index=expected_index) - + series = pd.Series(1, index=pd.period_range(start='2000', periods=100)) + result = series.resample(freq).count() + expected_index = pd.period_range(start='2000', freq=freq, + periods=len(expected_vals)) + expected = pd.Series(expected_vals, index=expected_index) assert_series_equal(result, expected) def test_resample_same_freq(self): @@ -2587,12 +2589,15 @@ def test_cant_fill_missing_dups(self): s = Series(np.random.randn(5), index=rng) pytest.raises(Exception, lambda: s.resample('A').ffill()) - def test_resample_5minute(self): + @pytest.mark.parametrize('freq', ['5min']) + @pytest.mark.parametrize('kind', ['period', None, 'timestamp']) + def test_resample_5minute(self, freq, kind): rng = period_range('1/1/2000', '1/5/2000', freq='T') ts = Series(np.random.randn(len(rng)), index=rng) - - result = ts.resample('5min').mean() - expected = ts.to_timestamp().resample('5min').mean() + expected = ts.to_timestamp().resample(freq).mean() + if kind != 'timestamp': + expected = expected.to_period(freq) + result = ts.resample(freq, kind=kind).mean() assert_series_equal(result, expected) def test_upsample_daily_business_daily(self): @@ -2812,18 +2817,96 @@ def test_evenly_divisible_with_no_extra_bins(self): result = df.resample('7D').sum() assert_frame_equal(result, expected) - def test_apply_to_empty_series(self): - # GH 14313 - series = self.create_series()[:0] + @pytest.mark.parametrize('kind', ['period', None, 'timestamp']) + @pytest.mark.parametrize('agg_arg', ['mean', {'value': 'mean'}, ['mean']]) + def test_loffset_returns_datetimeindex(self, frame, kind, agg_arg): + # make sure passing loffset returns DatetimeIndex in all cases + # basic method taken from Base.test_resample_loffset_arg_type() + df = frame + expected_means = [df.values[i:i + 2].mean() + for i in range(0, len(df.values), 2)] + expected_index = self.create_index(df.index[0], + periods=len(df.index) / 2, + freq='2D') - for freq in ['M', 'D', 'H']: - with pytest.raises(TypeError): - series.resample(freq).apply(lambda x: 1) + # loffset coerces PeriodIndex to DateTimeIndex + expected_index = expected_index.to_timestamp() + expected_index += 
timedelta(hours=2) + expected = DataFrame({'value': expected_means}, index=expected_index) + + result_agg = df.resample('2D', loffset='2H', kind=kind).agg(agg_arg) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result_how = df.resample('2D', how=agg_arg, loffset='2H', + kind=kind) + if isinstance(agg_arg, list): + expected.columns = pd.MultiIndex.from_tuples([('value', 'mean')]) + assert_frame_equal(result_agg, expected) + assert_frame_equal(result_how, expected) + + @pytest.mark.parametrize('freq, period_mult', [('H', 24), ('12H', 2)]) + @pytest.mark.parametrize('kind', [None, 'period']) + def test_upsampling_ohlc(self, freq, period_mult, kind): + # GH 13083 + pi = PeriodIndex(start='2000', freq='D', periods=10) + s = Series(range(len(pi)), index=pi) + expected = s.to_timestamp().resample(freq).ohlc().to_period(freq) + + # timestamp-based resampling doesn't include all sub-periods + # of the last original period, so extend accordingly: + new_index = PeriodIndex(start='2000', freq=freq, + periods=period_mult * len(pi)) + expected = expected.reindex(new_index) + result = s.resample(freq, kind=kind).ohlc() + assert_frame_equal(result, expected) + + @pytest.mark.parametrize('periods, values', + [([pd.NaT, '1970-01-01 00:00:00', pd.NaT, + '1970-01-01 00:00:02', '1970-01-01 00:00:03'], + [2, 3, 5, 7, 11]), + ([pd.NaT, pd.NaT, '1970-01-01 00:00:00', pd.NaT, + pd.NaT, pd.NaT, '1970-01-01 00:00:02', + '1970-01-01 00:00:03', pd.NaT, pd.NaT], + [1, 2, 3, 5, 6, 8, 7, 11, 12, 13])]) + @pytest.mark.parametrize('freq, expected_values', + [('1s', [3, np.NaN, 7, 11]), + ('2s', [3, int((7 + 11) / 2)]), + ('3s', [int((3 + 7) / 2), 11])]) + def test_resample_with_nat(self, periods, values, freq, expected_values): + # GH 13224 + index = PeriodIndex(periods, freq='S') + frame = DataFrame(values, index=index) + + expected_index = period_range('1970-01-01 00:00:00', + periods=len(expected_values), freq=freq) + expected = DataFrame(expected_values, index=expected_index) + result = frame.resample(freq).mean() + assert_frame_equal(result, expected) + + def test_resample_with_only_nat(self): + # GH 13224 + pi = PeriodIndex([pd.NaT] * 3, freq='S') + frame = DataFrame([2, 3, 5], index=pi) + expected_index = PeriodIndex(data=[], freq=pi.freq) + expected = DataFrame([], index=expected_index) + result = frame.resample('1s').mean() + assert_frame_equal(result, expected) class TestTimedeltaIndex(Base): _index_factory = lambda x: timedelta_range + @pytest.fixture + def _index_start(self): + return '1 day' + + @pytest.fixture + def _index_end(self): + return '10 day' + + @pytest.fixture + def _series_name(self): + return 'tdi' + def create_series(self): i = timedelta_range('1 day', '10 day', freq='D') @@ -3167,13 +3250,6 @@ def test_fails_on_no_datetime_index(self): "instance of %r" % name): df.groupby(TimeGrouper('D')) - # PeriodIndex gives a specific error message - df = DataFrame({'a': np.random.randn(n)}, index=tm.makePeriodIndex(n)) - with tm.assert_raises_regex(TypeError, - "axis must be a DatetimeIndex, but " - "got an instance of 'PeriodIndex'"): - df.groupby(TimeGrouper('D')) - def test_aaa_group_order(self): # GH 12840 # check TimeGrouper perform stable sorts From cdbbf80ec3dd919414560855a1ea3f1efd6c5332 Mon Sep 17 00:00:00 2001 From: ruiann <534676033@qq.com> Date: Sun, 1 Oct 2017 12:48:56 -0500 Subject: [PATCH 179/188] BUG:Time Grouper bug fix when applied for list groupers (#17587) closes #17530 --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/groupby.py | 119 
++++++++++++++++++-----
 pandas/core/resample.py                  |  27 +----
 pandas/tests/groupby/test_timegrouper.py |  19 ++++
 4 files changed, 116 insertions(+), 50 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index 1094e96bd0d201..3276310fa3e6e2 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -704,6 +704,7 @@ Groupby/Resample/Rolling
 - Bug in ``DataFrame.groupby`` where index and column keys were not recognized correctly when the number of keys equaled the number of elements on the groupby axis (:issue:`16859`)
 - Bug in ``groupby.nunique()`` with ``TimeGrouper`` which cannot handle ``NaT`` correctly (:issue:`17575`)
 - Bug in ``DataFrame.groupby`` where a single level selection from a ``MultiIndex`` unexpectedly sorts (:issue:`17537`)
+- Bug in ``TimeGrouper`` where results differed when it was passed as a list and as a scalar (:issue:`17530`)

 Sparse
 ^^^^^^

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 2f2056279558d3..9379ade4be7a69 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -256,11 +256,13 @@ def __init__(self, key=None, level=None, freq=None, axis=0, sort=False):
     def ax(self):
         return self.grouper

-    def _get_grouper(self, obj):
+    def _get_grouper(self, obj, validate=True):
         """
         Parameters
         ----------
         obj : the subject object
+        validate : boolean, default True
+            if True, validate the grouper

         Returns
         -------
@@ -271,7 +273,8 @@ def _get_grouper(self, obj):
         self.grouper, exclusions, self.obj = _get_grouper(self.obj, [self.key],
                                                           axis=self.axis,
                                                           level=self.level,
-                                                          sort=self.sort)
+                                                          sort=self.sort,
+                                                          validate=validate)
         return self.binner, self.grouper, self.obj

     def _set_grouper(self, obj, sort=False):
@@ -326,12 +329,6 @@ def _set_grouper(self, obj, sort=False):
         self.grouper = ax
         return self.grouper

-    def _get_binner_for_grouping(self, obj):
-        """ default to the standard binner here """
-        group_axis = obj._get_axis(self.axis)
-        return Grouping(group_axis, None, obj=obj, name=self.key,
-                        level=self.level, sort=self.sort, in_axis=False)
-
     @property
     def groups(self):
         return self.grouper.groups
@@ -1733,16 +1730,34 @@ class BaseGrouper(object):
     """
    This is an internal Grouper class, which actually holds
    the generated groups
+
+    Parameters
+    ----------
+    axis : int
+        the axis to group
+    groupings : array of grouping
+        all the grouping instances to handle in this grouper
+        for example, for a list of groupers passed to groupby, pass the
+        whole list here
+    sort : boolean, default True
+        whether this grouper will give a sorted result or not
+    group_keys : boolean, default True
+    mutated : boolean, default False
+    indexer : intp array, optional
+        the indexer created by Grouper
+        some groupers (TimeGrouper) sort their axis, so their group_info
+        is also sorted; the indexer is needed to restore the original order
+
    """

    def __init__(self, axis, groupings, sort=True, group_keys=True,
-                 mutated=False):
+                 mutated=False, indexer=None):
        self._filter_empty_groups = self.compressed = len(groupings) != 1
        self.axis = axis
        self.groupings = groupings
        self.sort = sort
        self.group_keys = group_keys
        self.mutated = mutated
+        self.indexer = indexer

    @property
    def shape(self):
@@ -1888,6 +1903,15 @@ def group_info(self):
        comp_ids = _ensure_int64(comp_ids)
        return comp_ids, obs_group_ids, ngroups

+    @cache_readonly
+    def label_info(self):
+        # return the labels of items in the original grouped axis
+        labels, _, _ = self.group_info
+        if self.indexer is not None:
+            sorter = np.lexsort((labels, self.indexer))
+            labels = labels[sorter]
+        return labels
+
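A minimal NumPy-only sketch of the reordering that ``label_info`` performs (the ``labels`` and ``indexer`` values are made up for illustration, not taken from a real ``TimeGrouper``):

.. code-block:: python

   import numpy as np

   # group labels as computed on the *sorted* axis, plus the indexer
   # (sorted position -> original position) produced by _set_grouper
   labels = np.array([0, 0, 1, 1, 2])
   indexer = np.array([4, 2, 0, 3, 1])

   # sorting primarily by the indexer puts each label back at the
   # position its item occupied in the original, unsorted axis
   sorter = np.lexsort((labels, indexer))
   labels[sorter]  # -> array([1, 2, 0, 1, 0])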
    def _get_compressed_labels(self):
        all_labels = [ping.labels for ping in self.groupings]
        if len(all_labels) > 1:
@@ -2288,11 +2312,42 @@ def generate_bins_generic(values, binner, closed):

 class BinGrouper(BaseGrouper):

-    def __init__(self, bins, binlabels, filter_empty=False, mutated=False):
+    """
+    This is an internal Grouper class
+
+    Parameters
+    ----------
+    bins : the split positions used to group the items of the axis; the
+        items up to each position fall under the corresponding binlabel
+    binlabels : the label list
+    filter_empty : boolean, default False
+    mutated : boolean, default False
+    indexer : an intp array, optional
+
+    Examples
+    --------
+    bins: [2, 4, 6, 8, 10]
+    binlabels: DatetimeIndex(['2005-01-01', '2005-01-03',
+                              '2005-01-05', '2005-01-07', '2005-01-09'],
+                              dtype='datetime64[ns]', freq='2D')
+
+    the resulting group_info, a tuple of (the label of each item in the
+    grouped axis, the index of each label in the label list, the number
+    of groups), is
+
+    (array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]), array([0, 1, 2, 3, 4]), 5)
+
+    meaning the grouped axis has 10 items that fall into 5 labels: the
+    first and second items belong to the first label, the third and
+    fourth items belong to the second label, and so on
+
+    """
+
+    def __init__(self, bins, binlabels, filter_empty=False, mutated=False,
+                 indexer=None):
         self.bins = _ensure_int64(bins)
         self.binlabels = _ensure_index(binlabels)
         self._filter_empty_groups = filter_empty
         self.mutated = mutated
+        self.indexer = indexer

     @cache_readonly
     def groups(self):
@@ -2460,6 +2515,19 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
             self.grouper, self._labels, self._group_index = \
                 index._get_grouper_for_level(self.grouper, level)

+        # a passed Grouper-like: get the grouper directly, in the same
+        # way as a single-grouper groupby, and use the group_info to
+        # get labels
+        elif isinstance(self.grouper, Grouper):
+            # get the new grouper; we already have disambiguated
+            # what key/level refer to exactly, don't need to
+            # check again as we have by this point converted these
+            # to an actual value (rather than a pd.Grouper)
+            _, grouper, _ = self.grouper._get_grouper(self.obj, validate=False)
+            if self.name is None:
+                self.name = grouper.result_index.name
+            self.obj = self.grouper.obj
+            self.grouper = grouper
+
         else:
             if self.grouper is None and self.name is not None:
                 self.grouper = self.obj[self.name]
@@ -2482,16 +2550,6 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
                         categories=c,
                         ordered=self.grouper.ordered))

-            # a passed Grouper like
-            elif isinstance(self.grouper, Grouper):
-
-                # get the new grouper
-                grouper = self.grouper._get_binner_for_grouping(self.obj)
-                self.obj = self.grouper.obj
-                self.grouper = grouper
-                if self.name is None:
-                    self.name = grouper.name
-
             # we are done
             if isinstance(self.grouper, Grouping):
                 self.grouper = self.grouper.grouper
@@ -2536,6 +2594,10 @@ def ngroups(self):

     @cache_readonly
     def indices(self):
+        # we have a list of groupers
+        if isinstance(self.grouper, BaseGrouper):
+            return self.grouper.indices
+
         values = _ensure_categorical(self.grouper)
         return values._reverse_indexer()

@@ -2553,9 +2615,14 @@ def group_index(self):

     def _make_labels(self):
         if self._labels is None or self._group_index is None:
-            labels, uniques = algorithms.factorize(
-                self.grouper, sort=self.sort)
-            uniques = Index(uniques, name=self.name)
+            # we have a list of groupers
+            if isinstance(self.grouper, BaseGrouper):
+                labels = self.grouper.label_info
+                uniques = self.grouper.result_index
+            else:
+                labels, uniques = algorithms.factorize(
+                    self.grouper, sort=self.sort)
+                uniques = Index(uniques, name=self.name)
             self._labels = 
labels self._group_index = uniques @@ -2566,7 +2633,7 @@ def groups(self): def _get_grouper(obj, key=None, axis=0, level=None, sort=True, - mutated=False): + mutated=False, validate=True): """ create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. @@ -2583,6 +2650,8 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, are and then creates a Grouping for each one, combined into a BaseGrouper. + If validate, then check for key/level overlaps + """ group_axis = obj._get_axis(axis) @@ -2707,7 +2776,7 @@ def is_in_obj(gpr): elif is_in_axis(gpr): # df.groupby('name') if gpr in obj: - if gpr in obj.index.names: + if validate and gpr in obj.index.names: warnings.warn( ("'%s' is both a column name and an index level.\n" "Defaulting to column but " diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 083fbcaaabe460..6edbb99641542d 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -250,7 +250,7 @@ def _get_binner(self): """ binner, bins, binlabels = self._get_binner_for_time() - bin_grouper = BinGrouper(bins, binlabels) + bin_grouper = BinGrouper(bins, binlabels, indexer=self.groupby.indexer) return binner, bin_grouper def _assure_grouper(self): @@ -1105,35 +1105,12 @@ def _get_resampler(self, obj, kind=None): "TimedeltaIndex or PeriodIndex, " "but got an instance of %r" % type(ax).__name__) - def _get_grouper(self, obj): + def _get_grouper(self, obj, validate=True): # create the resampler and return our binner r = self._get_resampler(obj) r._set_binner() return r.binner, r.grouper, r.obj - def _get_binner_for_grouping(self, obj): - # return an ordering of the transformed group labels, - # suitable for multi-grouping, e.g the labels for - # the resampled intervals - binner, grouper, obj = self._get_grouper(obj) - - l = [] - for key, group in grouper.get_iterator(self.ax): - l.extend([key] * len(group)) - - if isinstance(self.ax, PeriodIndex): - grouper = binner.__class__(l, freq=binner.freq, name=binner.name) - else: - # resampling causes duplicated values, specifying freq is invalid - grouper = binner.__class__(l, name=binner.name) - - # since we may have had to sort - # may need to reorder groups here - if self.indexer is not None: - indexer = self.indexer.argsort(kind='quicksort') - grouper = grouper.take(indexer) - return grouper - def _get_time_bins(self, ax): if not isinstance(ax, DatetimeIndex): raise TypeError('axis must be a DatetimeIndex, but got ' diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index fafcbf947e3df7..c8503b16a0e16a 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -623,3 +623,22 @@ def test_nunique_with_timegrouper_and_nat(self): result = test.groupby(grouper)['data'].nunique() expected = test[test.time.notnull()].groupby(grouper)['data'].nunique() tm.assert_series_equal(result, expected) + + def test_scalar_call_versus_list_call(self): + # Issue: 17530 + data_frame = { + 'location': ['shanghai', 'beijing', 'shanghai'], + 'time': pd.Series(['2017-08-09 13:32:23', '2017-08-11 23:23:15', + '2017-08-11 22:23:15'], + dtype='datetime64[ns]'), + 'value': [1, 2, 3] + } + data_frame = pd.DataFrame(data_frame).set_index('time') + grouper = pd.Grouper(freq='D') + + grouped = data_frame.groupby(grouper) + result = grouped.count() + grouped = data_frame.groupby([grouper]) + expected = grouped.count() + + assert_frame_equal(result, expected) From 458c1dc81b7e6f90180b06179ac91d9ed868cb05 Mon 
Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 1 Oct 2017 13:05:02 -0700 Subject: [PATCH 180/188] DEPR: Deprecate convert parameter in take (#17352) xref gh-16948. The parameter is not respected, nor is it a parameter in many 'take' implementations. --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/frame.py | 12 +-- pandas/core/generic.py | 96 +++++++++++++++---- pandas/core/groupby.py | 10 +- pandas/core/indexing.py | 18 ++-- pandas/core/series.py | 35 +++---- pandas/core/sparse/series.py | 13 ++- .../tests/frame/test_axis_select_reindex.py | 8 +- pandas/tests/indexing/test_loc.py | 4 +- pandas/tests/series/test_indexing.py | 17 ++++ pandas/tests/sparse/test_series.py | 3 + 11 files changed, 144 insertions(+), 73 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 3276310fa3e6e2..c8a0a6bff5cc75 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -566,6 +566,7 @@ Deprecations ~~~~~~~~~~~~ - :func:`read_excel()` has deprecated ``sheetname`` in favor of ``sheet_name`` for consistency with ``.to_excel()`` (:issue:`10559`). +- The ``convert`` parameter has been deprecated in the ``.take()`` method, as it was not being respected (:issue:`16948`) - ``pd.options.html.border`` has been deprecated in favor of ``pd.options.display.html.border`` (:issue:`15793`). - :func:`SeriesGroupBy.nth` has deprecated ``True`` in favor of ``'all'`` for its kwarg ``dropna`` (:issue:`11038`). - :func:`DataFrame.as_blocks` is deprecated, as this is exposing the internal implementation (:issue:`17302`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a12e611f6618a0..5d439f88bca15a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2034,7 +2034,7 @@ def _ixs(self, i, axis=0): return self.loc[:, lab_slice] else: if isinstance(label, Index): - return self.take(i, axis=1, convert=True) + return self._take(i, axis=1, convert=True) index_len = len(self.index) @@ -2116,10 +2116,10 @@ def _getitem_array(self, key): # be reindexed to match DataFrame rows key = check_bool_indexer(self.index, key) indexer = key.nonzero()[0] - return self.take(indexer, axis=0, convert=False) + return self._take(indexer, axis=0, convert=False) else: indexer = self.loc._convert_to_indexer(key, axis=1) - return self.take(indexer, axis=1, convert=True) + return self._take(indexer, axis=1, convert=True) def _getitem_multilevel(self, key): loc = self.columns.get_loc(key) @@ -3355,7 +3355,7 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, else: raise TypeError('must specify how or thresh') - result = self.take(mask.nonzero()[0], axis=axis, convert=False) + result = self._take(mask.nonzero()[0], axis=axis, convert=False) if inplace: self._update_inplace(result) @@ -3486,7 +3486,7 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, new_data = self._data.take(indexer, axis=self._get_block_manager_axis(axis), - convert=False, verify=False) + verify=False) if inplace: return self._update_inplace(new_data) @@ -3547,7 +3547,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, baxis = self._get_block_manager_axis(axis) new_data = self._data.take(indexer, axis=baxis, - convert=False, verify=False) + verify=False) # reconstruct axis if needed new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic() diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4f6fd0828693e1..5dd770b2600a09 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -38,6 +38,7 @@ from 
pandas.core.index import (Index, MultiIndex, _ensure_index, InvalidIndexError) import pandas.core.indexing as indexing +from pandas.core.indexing import maybe_convert_indices from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import PeriodIndex, Period from pandas.core.internals import BlockManager @@ -1822,7 +1823,8 @@ def _iget_item_cache(self, item): if ax.is_unique: lower = self._get_item_cache(ax[item]) else: - lower = self.take(item, axis=self._info_axis_number, convert=True) + lower = self._take(item, axis=self._info_axis_number, + convert=True) return lower def _box_item_values(self, key, values): @@ -2057,8 +2059,63 @@ def __delitem__(self, key): except KeyError: pass - def take(self, indices, axis=0, convert=True, is_copy=True, **kwargs): + _shared_docs['_take'] = """ + Return the elements in the given *positional* indices along an axis. + + This means that we are not indexing according to actual values in + the index attribute of the object. We are indexing according to the + actual position of the element in the object. + + This is the internal version of ``.take()`` and will contain a wider + selection of parameters useful for internal use but not as suitable + for public usage. + + Parameters + ---------- + indices : array-like + An array of ints indicating which positions to take. + axis : int, default 0 + The axis on which to select elements. "0" means that we are + selecting rows, "1" means that we are selecting columns, etc. + convert : bool, default True + Whether to convert negative indices into positive ones. + For example, ``-1`` would map to the ``len(axis) - 1``. + The conversions are similar to the behavior of indexing a + regular Python list. + is_copy : bool, default True + Whether to return a copy of the original object or not. + + Returns + ------- + taken : type of caller + An array-like containing the elements taken from the object. + + See Also + -------- + numpy.ndarray.take + numpy.take """ + + @Appender(_shared_docs['_take']) + def _take(self, indices, axis=0, convert=True, is_copy=True): + self._consolidate_inplace() + + if convert: + indices = maybe_convert_indices(indices, len(self._get_axis(axis))) + + new_data = self._data.take(indices, + axis=self._get_block_manager_axis(axis), + verify=True) + result = self._constructor(new_data).__finalize__(self) + + # Maybe set copy if we didn't actually change the index. + if is_copy: + if not result._get_axis(axis).equals(self._get_axis(axis)): + result._set_is_copy(self) + + return result + + _shared_docs['take'] = """ Return the elements in the given *positional* indices along an axis. This means that we are not indexing according to actual values in @@ -2073,9 +2130,12 @@ def take(self, indices, axis=0, convert=True, is_copy=True, **kwargs): The axis on which to select elements. "0" means that we are selecting rows, "1" means that we are selecting columns, etc. convert : bool, default True - Whether to convert negative indices to positive ones, just as with - indexing into Python lists. For example, if `-1` was passed in, - this index would be converted ``n - 1``. + .. deprecated:: 0.21.0 + + Whether to convert negative indices into positive ones. + For example, ``-1`` would map to the ``len(axis) - 1``. + The conversions are similar to the behavior of indexing a + regular Python list. is_copy : bool, default True Whether to return a copy of the original object or not. 
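A rough sketch of what the deprecation looks like to callers (a hypothetical session; per the change below, only an explicit ``convert=False`` reaches the warning path, since the default cannot be distinguished from an explicit ``True``):

.. code-block:: python

   import warnings
   import pandas as pd

   df = pd.DataFrame({'a': [10, 20, 30]})

   with warnings.catch_warnings(record=True) as caught:
       warnings.simplefilter('always')
       df.take([0, 2])                 # default path: no warning
       df.take([0, 2], convert=False)  # deprecated keyword: FutureWarning

   assert any(issubclass(w.category, FutureWarning) for w in caught)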
@@ -2131,19 +2191,17 @@ class max_speed numpy.ndarray.take numpy.take """ + + @Appender(_shared_docs['take']) + def take(self, indices, axis=0, convert=True, is_copy=True, **kwargs): nv.validate_take(tuple(), kwargs) - self._consolidate_inplace() - new_data = self._data.take(indices, - axis=self._get_block_manager_axis(axis), - convert=True, verify=True) - result = self._constructor(new_data).__finalize__(self) - # maybe set copy if we didn't actually change the index - if is_copy: - if not result._get_axis(axis).equals(self._get_axis(axis)): - result._set_is_copy(self) + if not convert: + msg = ("The 'convert' parameter is deprecated " + "and will be removed in a future version.") + warnings.warn(msg, FutureWarning, stacklevel=2) - return result + return self._take(indices, axis=axis, convert=convert, is_copy=is_copy) def xs(self, key, axis=0, level=None, drop_level=True): """ @@ -2244,9 +2302,9 @@ def xs(self, key, axis=0, level=None, drop_level=True): if isinstance(loc, np.ndarray): if loc.dtype == np.bool_: inds, = loc.nonzero() - return self.take(inds, axis=axis, convert=False) + return self._take(inds, axis=axis, convert=False) else: - return self.take(loc, axis=axis, convert=True) + return self._take(loc, axis=axis, convert=True) if not is_scalar(loc): new_index = self.index[loc] @@ -5112,7 +5170,7 @@ def at_time(self, time, asof=False): """ try: indexer = self.index.indexer_at_time(time, asof=asof) - return self.take(indexer, convert=False) + return self._take(indexer, convert=False) except AttributeError: raise TypeError('Index must be DatetimeIndex') @@ -5136,7 +5194,7 @@ def between_time(self, start_time, end_time, include_start=True, indexer = self.index.indexer_between_time( start_time, end_time, include_start=include_start, include_end=include_end) - return self.take(indexer, convert=False) + return self._take(indexer, convert=False) except AttributeError: raise TypeError('Index must be DatetimeIndex') diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 9379ade4be7a69..9518f17e5f4f17 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -322,8 +322,8 @@ def _set_grouper(self, obj, sort=False): # use stable sort to support first, last, nth indexer = self.indexer = ax.argsort(kind='mergesort') ax = ax.take(indexer) - obj = obj.take(indexer, axis=self.axis, - convert=False, is_copy=False) + obj = obj._take(indexer, axis=self.axis, + convert=False, is_copy=False) self.obj = obj self.grouper = ax @@ -640,7 +640,7 @@ def get_group(self, name, obj=None): if not len(inds): raise KeyError(name) - return obj.take(inds, axis=self.axis, convert=False) + return obj._take(inds, axis=self.axis, convert=False) def __iter__(self): """ @@ -2226,7 +2226,7 @@ def _aggregate_series_fast(self, obj, func): # avoids object / Series creation overhead dummy = obj._get_values(slice(None, 0)).to_dense() indexer = get_group_index_sorter(group_index, ngroups) - obj = obj.take(indexer, convert=False).to_dense() + obj = obj._take(indexer, convert=False).to_dense() group_index = algorithms.take_nd( group_index, indexer, allow_fill=False) grouper = lib.SeriesGrouper(obj, func, group_index, ngroups, @@ -4523,7 +4523,7 @@ def __iter__(self): yield i, self._chop(sdata, slice(start, end)) def _get_sorted_data(self): - return self.data.take(self.sort_idx, axis=self.axis, convert=False) + return self.data._take(self.sort_idx, axis=self.axis, convert=False) def _chop(self, sdata, slice_obj): return sdata.iloc[slice_obj] diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 
b7a51afcedabfe..2ea1b8a2389134 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1093,7 +1093,7 @@ def _getitem_iterable(self, key, axis=0): if is_bool_indexer(key): key = check_bool_indexer(labels, key) inds, = key.nonzero() - return self.obj.take(inds, axis=axis, convert=False) + return self.obj._take(inds, axis=axis, convert=False) else: # Have the index compute an indexer or return None # if it cannot handle; we only act on all found values @@ -1126,15 +1126,15 @@ def _getitem_iterable(self, key, axis=0): keyarr) if new_indexer is not None: - result = self.obj.take(indexer[indexer != -1], axis=axis, - convert=False) + result = self.obj._take(indexer[indexer != -1], axis=axis, + convert=False) result = result._reindex_with_indexers( {axis: [new_target, new_indexer]}, copy=True, allow_dups=True) else: - result = self.obj.take(indexer, axis=axis, convert=False) + result = self.obj._take(indexer, axis=axis) return result @@ -1265,7 +1265,7 @@ def _get_slice_axis(self, slice_obj, axis=0): if isinstance(indexer, slice): return self._slice(indexer, axis=axis, kind='iloc') else: - return self.obj.take(indexer, axis=axis, convert=False) + return self.obj._take(indexer, axis=axis, convert=False) class _IXIndexer(_NDFrameIndexer): @@ -1350,7 +1350,7 @@ def _getbool_axis(self, key, axis=0): key = check_bool_indexer(labels, key) inds, = key.nonzero() try: - return self.obj.take(inds, axis=axis, convert=False) + return self.obj._take(inds, axis=axis, convert=False) except Exception as detail: raise self._exception(detail) @@ -1367,7 +1367,7 @@ def _get_slice_axis(self, slice_obj, axis=0): if isinstance(indexer, slice): return self._slice(indexer, axis=axis, kind='iloc') else: - return self.obj.take(indexer, axis=axis, convert=False) + return self.obj._take(indexer, axis=axis, convert=False) class _LocIndexer(_LocationIndexer): @@ -1707,7 +1707,7 @@ def _get_slice_axis(self, slice_obj, axis=0): if isinstance(slice_obj, slice): return self._slice(slice_obj, axis=axis, kind='iloc') else: - return self.obj.take(slice_obj, axis=axis, convert=False) + return self.obj._take(slice_obj, axis=axis, convert=False) def _get_list_axis(self, key, axis=0): """ @@ -1723,7 +1723,7 @@ def _get_list_axis(self, key, axis=0): Series object """ try: - return self.obj.take(key, axis=axis, convert=False) + return self.obj._take(key, axis=axis, convert=False) except IndexError: # re-raise with different error message raise IndexError("positional indexers are out-of-bounds") diff --git a/pandas/core/series.py b/pandas/core/series.py index a05324142b223a..97f39a680c8c97 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2563,35 +2563,24 @@ def memory_usage(self, index=True, deep=False): v += self.index.memory_usage(deep=deep) return v - def take(self, indices, axis=0, convert=True, is_copy=False, **kwargs): - """ - return Series corresponding to requested indices - - Parameters - ---------- - indices : list / array of ints - convert : translate negative to positive indices (default) - - Returns - ------- - taken : Series - - See also - -------- - numpy.ndarray.take - """ - if kwargs: - nv.validate_take(tuple(), kwargs) - - # check/convert indicies here + @Appender(generic._shared_docs['_take']) + def _take(self, indices, axis=0, convert=True, is_copy=False): if convert: indices = maybe_convert_indices(indices, len(self._get_axis(axis))) indices = _ensure_platform_int(indices) new_index = self.index.take(indices) new_values = self._values.take(indices) - return 
(self._constructor(new_values, index=new_index, fastpath=True) - .__finalize__(self)) + + result = (self._constructor(new_values, index=new_index, + fastpath=True).__finalize__(self)) + + # Maybe set copy if we didn't actually change the index. + if is_copy: + if not result._get_axis(axis).equals(self._get_axis(axis)): + result._set_is_copy(self) + + return result def isin(self, values): """ diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 2aecb9d7c4ffbd..5166dc927989e5 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -602,16 +602,15 @@ def sparse_reindex(self, new_index): sparse_index=new_index, fill_value=self.fill_value).__finalize__(self) + @Appender(generic._shared_docs['take']) def take(self, indices, axis=0, convert=True, *args, **kwargs): - """ - Sparse-compatible version of ndarray.take + convert = nv.validate_take_with_convert(convert, args, kwargs) - Returns - ------- - taken : ndarray - """ + if not convert: + msg = ("The 'convert' parameter is deprecated " + "and will be removed in a future version.") + warnings.warn(msg, FutureWarning, stacklevel=2) - convert = nv.validate_take_with_convert(convert, args, kwargs) new_values = SparseArray.take(self.values, indices) new_index = self.index.take(indices) return self._constructor(new_values, diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index fb9b8c2ed7affe..219c1df301c4b6 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -822,7 +822,7 @@ def test_take(self): expected = df.loc[:, ['D', 'B', 'C', 'A']] assert_frame_equal(result, expected, check_names=False) - # neg indicies + # negative indices order = [2, 1, -1] for df in [self.frame]: @@ -830,6 +830,10 @@ def test_take(self): expected = df.reindex(df.index.take(order)) assert_frame_equal(result, expected) + with tm.assert_produces_warning(FutureWarning): + result = df.take(order, convert=False, axis=0) + assert_frame_equal(result, expected) + # axis = 1 result = df.take(order, axis=1) expected = df.loc[:, ['C', 'B', 'D']] @@ -854,7 +858,7 @@ def test_take(self): expected = df.loc[:, ['foo', 'B', 'C', 'A', 'D']] assert_frame_equal(result, expected) - # neg indicies + # negative indices order = [4, 1, -2] for df in [self.mixed_frame]: diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 3e863a59df67e6..17316a714e2609 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -581,11 +581,11 @@ def gen_test(l, l2): def gen_expected(df, mask): l = len(mask) - return pd.concat([df.take([0], convert=False), + return pd.concat([df.take([0]), DataFrame(np.ones((l, len(columns))), index=[0] * l, columns=columns), - df.take(mask[1:], convert=False)]) + df.take(mask[1:])]) df = gen_test(900, 100) assert not df.index.is_unique diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index 83d6a09d38f415..272e8c7de5e498 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -1066,6 +1066,23 @@ def test_setitem_with_tz_dst(self): s.iloc[[1, 2]] = vals tm.assert_series_equal(s, exp) + def test_take(self): + s = Series([-1, 5, 6, 2, 4]) + + actual = s.take([1, 3, 4]) + expected = Series([5, 2, 4], index=[1, 3, 4]) + tm.assert_series_equal(actual, expected) + + actual = s.take([-1, 3, 4]) + expected = Series([4, 2, 4], index=[4, 3, 4]) + tm.assert_series_equal(actual, 
expected) + + pytest.raises(IndexError, s.take, [1, 10]) + pytest.raises(IndexError, s.take, [2, 5]) + + with tm.assert_produces_warning(FutureWarning): + s.take([-1, 3, 4], convert=False) + def test_where(self): s = Series(np.random.randn(5)) cond = s > 0 diff --git a/pandas/tests/sparse/test_series.py b/pandas/tests/sparse/test_series.py index 451f3695933470..8c0ed322028e8c 100644 --- a/pandas/tests/sparse/test_series.py +++ b/pandas/tests/sparse/test_series.py @@ -520,6 +520,9 @@ def _compare(idx): exp = pd.Series(np.repeat(nan, 5)) tm.assert_series_equal(sp.take([0, 1, 2, 3, 4]), exp) + with tm.assert_produces_warning(FutureWarning): + sp.take([1, 5], convert=False) + def test_numpy_take(self): sp = SparseSeries([1.0, 2.0, 3.0]) indices = [1, 2] From dead59ab80a971d7cd0cdbebd5dc421def3df9b7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 1 Oct 2017 13:47:43 -0700 Subject: [PATCH 181/188] remove unused time conversion funcs (#17711) --- pandas/_libs/index.pyx | 31 ++------------------- pandas/_libs/lib.pyx | 63 ------------------------------------------ pandas/_libs/tslib.pxd | 2 ++ pandas/_libs/tslib.pyx | 13 ++++++++- pandas/io/pytables.py | 4 +-- 5 files changed, 18 insertions(+), 95 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 884117799ec5be..c96251a0293d66 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -13,29 +13,19 @@ cimport util import numpy as np -cimport tslib +from tslib cimport _to_i8 from hashtable cimport HashTable -from tslibs.timezones cimport is_utc, get_utcoffset -from pandas._libs import tslib, algos, hashtable as _hash +from pandas._libs import algos, hashtable as _hash from pandas._libs.tslib import Timestamp, Timedelta from datetime import datetime, timedelta -from datetime cimport (get_datetime64_value, _pydatetime_to_dts, - pandas_datetimestruct) - from cpython cimport PyTuple_Check, PyList_Check -cdef extern from "datetime.h": - bint PyDateTime_Check(object o) - void PyDateTime_IMPORT() - cdef int64_t iNaT = util.get_nat() -PyDateTime_IMPORT - cdef extern from "Python.h": int PySlice_Check(object) @@ -540,23 +530,6 @@ cpdef convert_scalar(ndarray arr, object value): return value -cdef inline _to_i8(object val): - cdef pandas_datetimestruct dts - try: - return val.value - except AttributeError: - if util.is_datetime64_object(val): - return get_datetime64_value(val) - elif PyDateTime_Check(val): - tzinfo = getattr(val, 'tzinfo', None) - # Save the original date value so we can get the utcoffset from it. 
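The branch being removed here computed the UTC nanosecond value by hand from the wall clock and the utcoffset; ``Timestamp`` already performs that normalization, which is what the replacement ``_to_i8`` added to ``tslib`` further below relies on. A minimal sketch of the equivalence (illustrative values; assumes Python 3 for ``datetime.timezone``):

.. code-block:: python

   from datetime import datetime, timezone, timedelta

   import pandas as pd

   tz = timezone(timedelta(hours=-5))
   val = datetime(2017, 10, 1, 12, 0, tzinfo=tz)

   # Timestamp.value is already the UTC epoch-nanosecond offset, so
   # "wall-clock nanoseconds minus utcoffset" gives the same result
   wall_ns = pd.Timestamp(val.replace(tzinfo=None)).value
   offset_ns = int(val.utcoffset().total_seconds()) * 10 ** 9
   assert pd.Timestamp(val).value == wall_ns - offset_ns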
- ival = _pydatetime_to_dts(val, &dts) - if tzinfo is not None and not is_utc(tzinfo): - offset = get_utcoffset(tzinfo, val) - ival -= tslib._delta_to_nanoseconds(offset) - return ival - return val - cdef class MultiIndexObjectEngine(ObjectEngine): """ diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 01548e17d39abf..503badd0ca8bc3 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -45,19 +45,11 @@ cdef double NaN = np.NaN cdef double nan = NaN cdef double NAN = nan -from datetime import datetime as pydatetime - # this is our tseries.pxd from datetime cimport ( get_timedelta64_value, get_datetime64_value, npy_timedelta, npy_datetime, PyDateTime_Check, PyDate_Check, PyTime_Check, PyDelta_Check, - PyDateTime_GET_YEAR, - PyDateTime_GET_MONTH, - PyDateTime_GET_DAY, - PyDateTime_DATE_GET_HOUR, - PyDateTime_DATE_GET_MINUTE, - PyDateTime_DATE_GET_SECOND, PyDateTime_IMPORT) @@ -132,61 +124,6 @@ def memory_usage_of_objects(ndarray[object, ndim=1] arr): s += arr[i].__sizeof__() return s -#---------------------------------------------------------------------- -# datetime / io related - -cdef int _EPOCH_ORD = 719163 - -from datetime import date as pydate - -cdef inline int64_t gmtime(object date): - cdef int y, m, d, h, mn, s, days - - y = PyDateTime_GET_YEAR(date) - m = PyDateTime_GET_MONTH(date) - d = PyDateTime_GET_DAY(date) - h = PyDateTime_DATE_GET_HOUR(date) - mn = PyDateTime_DATE_GET_MINUTE(date) - s = PyDateTime_DATE_GET_SECOND(date) - - days = pydate(y, m, 1).toordinal() - _EPOCH_ORD + d - 1 - return (( (((days * 24 + h) * 60 + mn))) * 60 + s) * 1000 - - -cpdef object to_datetime(int64_t timestamp): - return pydatetime.utcfromtimestamp(timestamp / 1000.0) - - -cpdef object to_timestamp(object dt): - return gmtime(dt) - - -def array_to_timestamp(ndarray[object, ndim=1] arr): - cdef int i, n - cdef ndarray[int64_t, ndim=1] result - - n = len(arr) - result = np.empty(n, dtype=np.int64) - - for i from 0 <= i < n: - result[i] = gmtime(arr[i]) - - return result - - -def time64_to_datetime(ndarray[int64_t, ndim=1] arr): - cdef int i, n - cdef ndarray[object, ndim=1] result - - n = len(arr) - result = np.empty(n, dtype=object) - - for i from 0 <= i < n: - result[i] = to_datetime(arr[i]) - - return result - - #---------------------------------------------------------------------- # isnull / notnull related diff --git a/pandas/_libs/tslib.pxd b/pandas/_libs/tslib.pxd index ee8adfe67bb5ed..147320b108cc8c 100644 --- a/pandas/_libs/tslib.pxd +++ b/pandas/_libs/tslib.pxd @@ -4,3 +4,5 @@ cdef convert_to_tsobject(object, object, object, bint, bint) cpdef convert_to_timedelta64(object, object) cdef bint _nat_scalar_rules[6] cdef bint _check_all_nulls(obj) + +cdef _to_i8(object val) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index b0b70bb8102047..096ebe9a5627b0 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -3416,7 +3416,18 @@ def cast_to_nanoseconds(ndarray arr): return result -def pydt_to_i8(object pydt): +cdef inline _to_i8(object val): + cdef pandas_datetimestruct dts + try: + return val.value + except AttributeError: + if is_datetime64_object(val): + return get_datetime64_value(val) + elif PyDateTime_Check(val): + return Timestamp(val).value + return val + +cpdef pydt_to_i8(object pydt): """ Convert to int64 representation compatible with numpy datetime64; converts to UTC diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index ea69116ec363da..ca1b4d031d3ced 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -25,7 
+25,7 @@ import numpy as np
 
 from pandas import (Series, DataFrame, Panel, Panel4D, Index,
-                    MultiIndex, Int64Index, isna, concat,
+                    MultiIndex, Int64Index, isna, concat, to_datetime,
                     SparseSeries, SparseDataFrame, PeriodIndex,
                     DatetimeIndex, TimedeltaIndex)
 from pandas.core import config
@@ -4529,7 +4529,7 @@ def _unconvert_index(data, kind, encoding=None):
 def _unconvert_index_legacy(data, kind, legacy=False, encoding=None):
     kind = _ensure_decoded(kind)
     if kind == u('datetime'):
-        index = lib.time64_to_datetime(data)
+        index = to_datetime(data)
     elif kind in (u('integer')):
         index = np.asarray(data, dtype=object)
     elif kind in (u('string')):

From 408ecd21759979f94952a48a0e4a2b2a608ee84d Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Sun, 1 Oct 2017 18:58:01 -0400
Subject: [PATCH 182/188] TST: add backward compat for offset testing for
 pickles (#17733)

closes #17721
---
 .../0.17.0/0.17.0_x86_64_darwin_3.5.3.pickle  | Bin 0 -> 129175 bytes
 .../0.18.1/0.18.1_x86_64_darwin_3.5.2.pickle  | Bin 125826 -> 127853 bytes
 .../tests/io/generate_legacy_storage_files.py |  69 ++++++++++++++++--
 pandas/tests/io/test_pickle.py                |  22 ++----
 4 files changed, 70 insertions(+), 21 deletions(-)
 create mode 100644 pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_3.5.3.pickle

diff --git a/pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_3.5.3.pickle b/pandas/tests/io/data/legacy_pickle/0.17.0/0.17.0_x86_64_darwin_3.5.3.pickle
new file mode 100644
index 0000000000000000000000000000000000000000..537864af7028b2c282150d54bf58aa3f3d553c01
GIT binary patch
literal 129175
[129175 bytes of base85-encoded pickle fixture data omitted]

delta 11870
[base85-encoded binary delta for the updated 0.18.1 pickle fixture omitted]

diff --git a/pandas/tests/io/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py
index 996965999724ec..1cb20814093126 100755
--- a/pandas/tests/io/generate_legacy_storage_files.py
+++ b/pandas/tests/io/generate_legacy_storage_files.py
@@ -1,6 +1,39 @@
 #!/usr/env/bin python
 
-""" self-contained to write legacy storage (pickle/msgpack) files """
+"""
+self-contained to write legacy storage (pickle/msgpack) files
+
+To use this script, create an environment where you want to
+generate pickles, say one for 0.18.1, with your pandas clone
+in ~/pandas:
+
+. activate pandas_0.18.1
+cd ~/
+
+$ python pandas/pandas/tests/io/generate_legacy_storage_files.py \
+  pandas/pandas/tests/io/data/legacy_pickle/0.18.1/ pickle
+
+This script generates a storage file for the current arch, system,
+and python version:
+  pandas version: 0.18.1
+  output dir    : pandas/pandas/tests/io/data/legacy_pickle/0.18.1/
+  storage format: pickle
+created pickle file: 0.18.1_x86_64_darwin_3.5.2.pickle
+
+The idea is to use the *current* version of
+generate_legacy_storage_files.py with an *older* version of pandas
+to generate a pickle file. We then check this file into a current
+branch and test it with test_pickle.py, which loads the *older*
+pickles and compares them against the current data generated
+with master.
+
+If we have changed a signature (e.g. we renamed offset -> freq in
+Timestamp), then we have to branch on the pandas version in
+generate_legacy_storage_files.py so that it runs under both the
+older and the newer version.
+ +""" + from __future__ import print_function from warnings import catch_warnings from distutils.version import LooseVersion @@ -9,6 +42,11 @@ Index, MultiIndex, bdate_range, to_msgpack, date_range, period_range, Timestamp, NaT, Categorical, Period) +from pandas.tseries.offsets import ( + DateOffset, Hour, Minute, Day, + MonthBegin, MonthEnd, YearBegin, + YearEnd, Week, + QuarterBegin, QuarterEnd) from pandas.compat import u import os import sys @@ -151,10 +189,28 @@ def create_data(): timestamp = dict(normal=Timestamp('2011-01-01'), nat=NaT, - tz=Timestamp('2011-01-01', tz='US/Eastern'), - freq=Timestamp('2011-01-01', freq='D'), - both=Timestamp('2011-01-01', tz='Asia/Tokyo', - freq='M')) + tz=Timestamp('2011-01-01', tz='US/Eastern')) + + if _loose_version < '0.19.2': + timestamp['freq'] = Timestamp('2011-01-01', offset='D') + timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo', + offset='M') + else: + timestamp['freq'] = Timestamp('2011-01-01', freq='D') + timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo', + freq='M') + + off = {'DateOffset': DateOffset(years=1), + 'MonthBegin': MonthBegin(1), + 'MonthEnd': MonthEnd(1), + 'QuarterBegin': QuarterBegin(1), + 'QuarterEnd': QuarterEnd(1), + 'Day': Day(1), + 'YearBegin': YearBegin(1), + 'YearEnd': YearEnd(1), + 'Week': Week(1), + 'Hour': Hour(1), + 'Minute': Minute(1)} return dict(series=series, frame=frame, @@ -166,7 +222,8 @@ def create_data(): ts=_create_sp_tsseries()), sp_frame=dict(float=_create_sp_frame()), cat=cat, - timestamp=timestamp) + timestamp=timestamp, + offsets=off) def create_pickle_data(): diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index d56b36779efe78..91c1f19f5caab9 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -193,26 +193,18 @@ def legacy_pickle_versions(): for v in os.listdir(path): p = os.path.join(path, v) if os.path.isdir(p): - yield v + for f in os.listdir(p): + yield (v, f) -@pytest.mark.parametrize('version', legacy_pickle_versions()) -def test_pickles(current_pickle_data, version): +@pytest.mark.parametrize('version, f', legacy_pickle_versions()) +def test_pickles(current_pickle_data, version, f): if not is_platform_little_endian(): pytest.skip("known failure on non-little endian") - pth = tm.get_data_path('legacy_pickle/{0}'.format(version)) - n = 0 - for f in os.listdir(pth): - vf = os.path.join(pth, f) - with catch_warnings(record=True): - data = compare(current_pickle_data, vf, version) - - if data is None: - continue - n += 1 - assert n > 0, ('Pickle files are not ' - 'tested: {version}'.format(version=version)) + vf = tm.get_data_path('legacy_pickle/{}/{}'.format(version, f)) + with catch_warnings(record=True): + compare(current_pickle_data, vf, version) def test_round_trip_current(current_pickle_data): From 361ef9ee6ad9313a44259c4443b16d295596df4e Mon Sep 17 00:00:00 2001 From: Berkay Date: Mon, 2 Oct 2017 13:18:21 +0300 Subject: [PATCH 183/188] Fixed the memory usage explanation of categorical in gotchas from O(nm) to O(n+m) (#17736) --- doc/source/categorical.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index cadbc895354b71..c5bbc3c004675d 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -979,7 +979,7 @@ Memory Usage .. _categorical.memory: -The memory usage of a ``Categorical`` is proportional to the number of categories times the length of the data. 
In contrast, +The memory usage of a ``Categorical`` is proportional to the number of categories plus the length of the data. In contrast, an ``object`` dtype is a constant times the length of the data. .. ipython:: python From bf5b08980526e36e03bc8f4637b8028e0e2d6145 Mon Sep 17 00:00:00 2001 From: jschendel Date: Mon, 2 Oct 2017 04:19:54 -0600 Subject: [PATCH 184/188] CLN: replace %s syntax with .format in pandas.core: categorical, common, config, config_init (#17735) Replaced %s syntax with .format in pandas.core: categorical.py, common.py, config.py, config_init.py. Additionally, made some of the existing positional .format code more explicit. --- pandas/core/categorical.py | 36 +++++++++++++----------- pandas/core/common.py | 14 +++++----- pandas/core/config.py | 57 +++++++++++++++++++++----------------- pandas/core/config_init.py | 8 +++--- 4 files changed, 62 insertions(+), 53 deletions(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 61e28dde2e34c0..5619f15ac85d99 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -263,7 +263,8 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, if dtype == 'category': dtype = CategoricalDtype(categories, ordered) else: - raise ValueError("Unknown `dtype` {}".format(dtype)) + msg = "Unknown `dtype` {dtype}" + raise ValueError(msg.format(dtype=dtype)) elif categories is not None or ordered is not None: raise ValueError("Cannot specify both `dtype` and `categories`" " or `ordered`.") @@ -931,9 +932,9 @@ def add_categories(self, new_categories, inplace=False): new_categories = [new_categories] already_included = set(new_categories) & set(self.dtype.categories) if len(already_included) != 0: - msg = ("new categories must not include old categories: %s" % - str(already_included)) - raise ValueError(msg) + msg = ("new categories must not include old categories: " + "{already_included!s}") + raise ValueError(msg.format(already_included=already_included)) new_categories = list(self.dtype.categories) + list(new_categories) new_dtype = CategoricalDtype(new_categories, self.ordered) @@ -989,8 +990,8 @@ def remove_categories(self, removals, inplace=False): new_categories = [x for x in new_categories if notna(x)] if len(not_included) != 0: - raise ValueError("removals must all be in old categories: %s" % - str(not_included)) + msg = "removals must all be in old categories: {not_included!s}" + raise ValueError(msg.format(not_included=not_included)) return self.set_categories(new_categories, ordered=self.ordered, rename=False, inplace=inplace) @@ -1443,7 +1444,8 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'): """ inplace = validate_bool_kwarg(inplace, 'inplace') if na_position not in ['last', 'first']: - raise ValueError('invalid na_position: {!r}'.format(na_position)) + msg = 'invalid na_position: {na_position!r}' + raise ValueError(msg.format(na_position=na_position)) codes = np.sort(self._codes) if not ascending: @@ -1653,9 +1655,10 @@ def _tidy_repr(self, max_vals=10, footer=True): head = self[:num]._get_repr(length=False, footer=False) tail = self[-(max_vals - num):]._get_repr(length=False, footer=False) - result = '%s, ..., %s' % (head[:-1], tail[1:]) + result = u('{head}, ..., {tail}').format(head=head[:-1], tail=tail[1:]) if footer: - result = '%s\n%s' % (result, self._repr_footer()) + result = u('{result}\n{footer}').format(result=result, + footer=self._repr_footer()) return compat.text_type(result) @@ -1683,7 +1686,8 @@ def _repr_categories_info(self): 
dtype = getattr(self.categories, 'dtype_str', str(self.categories.dtype)) - levheader = "Categories (%d, %s): " % (len(self.categories), dtype) + levheader = "Categories ({length}, {dtype}): ".format( + length=len(self.categories), dtype=dtype) width, height = get_terminal_size() max_width = get_option("display.width") or width if com.in_ipython_frontend(): @@ -1708,7 +1712,8 @@ def _repr_categories_info(self): def _repr_footer(self): - return u('Length: %d\n%s') % (len(self), self._repr_categories_info()) + return u('Length: {length}\n{info}').format( + length=len(self), info=self._repr_categories_info()) def _get_repr(self, length=True, na_rep='NaN', footer=True): from pandas.io.formats import format as fmt @@ -1725,9 +1730,8 @@ def __unicode__(self): elif len(self._codes) > 0: result = self._get_repr(length=len(self) > _maxlen) else: - result = ('[], %s' % - self._get_repr(length=False, - footer=True, ).replace("\n", ", ")) + msg = self._get_repr(length=False, footer=True).replace("\n", ", ") + result = ('[], {repr_msg}'.format(repr_msg=msg)) return result @@ -1869,8 +1873,8 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, """ perform the reduction type operation """ func = getattr(self, name, None) if func is None: - raise TypeError("Categorical cannot perform the operation " - "{op}".format(op=name)) + msg = 'Categorical cannot perform the operation {op}' + raise TypeError(msg.format(op=name)) return func(numeric_only=numeric_only, **kwds) def min(self, numeric_only=None, **kwargs): diff --git a/pandas/core/common.py b/pandas/core/common.py index 515a4010961205..0f7b86f5e74a09 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -96,8 +96,8 @@ def __init__(self, class_instance): self.class_instance = class_instance def __str__(self): - return ("This method must be defined in the concrete class of %s" % - self.class_instance.__class__.__name__) + msg = "This method must be defined in the concrete class of {name}" + return (msg.format(name=self.class_instance.__class__.__name__)) def flatten(l): @@ -150,8 +150,8 @@ def _maybe_match_name(a, b): def _get_info_slice(obj, indexer): """Slice the info axis of `obj` with `indexer`.""" if not hasattr(obj, '_info_axis_number'): - raise TypeError('object of type %r has no info axis' % - type(obj).__name__) + msg = 'object of type {typ!r} has no info axis' + raise TypeError(msg.format(typ=type(obj).__name__)) slices = [slice(None)] * obj.ndim slices[obj._info_axis_number] = indexer return tuple(slices) @@ -214,8 +214,8 @@ def _mut_exclusive(**kwargs): label1, val1 = item1 label2, val2 = item2 if val1 is not None and val2 is not None: - raise TypeError('mutually exclusive arguments: %r and %r' % - (label1, label2)) + msg = 'mutually exclusive arguments: {label1!r} and {label2!r}' + raise TypeError(msg.format(label1=label1, label2=label2)) elif val1 is not None: return val1 else: @@ -517,7 +517,7 @@ def standardize_mapping(into): collections.defaultdict, into.default_factory) into = type(into) if not issubclass(into, collections.Mapping): - raise TypeError('unsupported type: {}'.format(into)) + raise TypeError('unsupported type: {into}'.format(into=into)) elif into == collections.defaultdict: raise TypeError( 'to_dict() only accepts initialized defaultdicts') diff --git a/pandas/core/config.py b/pandas/core/config.py index b406f6724aa6d4..2354b7ca04e7ff 100644 --- a/pandas/core/config.py +++ b/pandas/core/config.py @@ -80,7 +80,7 @@ def _get_single_key(pat, silent): if len(keys) == 0: if not silent: 
            _warn_if_deprecated(pat)
-        raise OptionError('No such keys(s): %r' % pat)
+        raise OptionError('No such keys(s): {pat!r}'.format(pat=pat))
     if len(keys) > 1:
         raise OptionError('Pattern matched multiple keys')
     key = keys[0]
@@ -112,8 +112,8 @@ def _set_option(*args, **kwargs):
     silent = kwargs.pop('silent', False)
 
     if kwargs:
-        raise TypeError('_set_option() got an unexpected keyword '
-                        'argument "{0}"'.format(list(kwargs.keys())[0]))
+        msg = '_set_option() got an unexpected keyword argument "{kwarg}"'
+        raise TypeError(msg.format(kwarg=list(kwargs.keys())[0]))
 
     for k, v in zip(args[::2], args[1::2]):
         key = _get_single_key(k, silent)
@@ -436,9 +436,11 @@ def register_option(key, defval, doc='', validator=None, cb=None):
     key = key.lower()
 
     if key in _registered_options:
-        raise OptionError("Option '%s' has already been registered" % key)
+        msg = "Option '{key}' has already been registered"
+        raise OptionError(msg.format(key=key))
     if key in _reserved_keys:
-        raise OptionError("Option '%s' is a reserved key" % key)
+        msg = "Option '{key}' is a reserved key"
+        raise OptionError(msg.format(key=key))
 
     # the default value should be legal
     if validator:
@@ -449,22 +451,21 @@ def register_option(key, defval, doc='', validator=None, cb=None):
 
     for k in path:
         if not bool(re.match('^' + tokenize.Name + '$', k)):
-            raise ValueError("%s is not a valid identifier" % k)
+            raise ValueError("{k} is not a valid identifier".format(k=k))
        if keyword.iskeyword(k):
-            raise ValueError("%s is a python keyword" % k)
+            raise ValueError("{k} is a python keyword".format(k=k))
 
     cursor = _global_config
+    msg = "Path prefix to option '{option}' is already an option"
     for i, p in enumerate(path[:-1]):
         if not isinstance(cursor, dict):
-            raise OptionError("Path prefix to option '%s' is already an option"
-                              % '.'.join(path[:i]))
+            raise OptionError(msg.format(option='.'.join(path[:i])))
         if p not in cursor:
             cursor[p] = {}
         cursor = cursor[p]
 
     if not isinstance(cursor, dict):
-        raise OptionError("Path prefix to option '%s' is already an option" %
-                          '.'.join(path[:-1]))
+        raise OptionError(msg.format(option='.'.join(path[:-1])))
 
     cursor[path[-1]] = defval
 
     # initialize
@@ -516,8 +517,8 @@ def deprecate_option(key, msg=None, rkey=None, removal_ver=None):
     key = key.lower()
 
     if key in _deprecated_options:
-        raise OptionError("Option '%s' has already been defined as deprecated."
-                          % key)
+        msg = "Option '{key}' has already been defined as deprecated."
+        raise OptionError(msg.format(key=key))
 
     _deprecated_options[key] = DeprecatedOption(key, msg, rkey, removal_ver)
 
@@ -614,11 +615,12 @@ def _warn_if_deprecated(key):
             print(d.msg)
             warnings.warn(d.msg, DeprecationWarning)
         else:
-            msg = "'%s' is deprecated" % key
+            msg = "'{key}' is deprecated".format(key=key)
             if d.removal_ver:
-                msg += ' and will be removed in %s' % d.removal_ver
+                msg += (' and will be removed in {version}'
+                        .format(version=d.removal_ver))
             if d.rkey:
-                msg += ", please use '%s' instead." % d.rkey
+                msg += ", please use '{rkey}' instead.".format(rkey=d.rkey)
             else:
                 msg += ', please refrain from using it.'
 
@@ -633,7 +635,7 @@ def _build_option_description(k):
     o = _get_registered_option(k)
     d = _get_deprecated_option(k)
 
-    s = u('%s ') % k
+    s = u('{k} ').format(k=k)
 
     if o.doc:
         s += '\n'.join(o.doc.strip().split('\n'))
@@ -641,12 +643,13 @@ def _build_option_description(k):
         s += 'No description available.'

     if o:
-        s += u('\n    [default: %s] [currently: %s]') % (o.defval,
-                                                         _get_option(k, True))
+        s += (u('\n    [default: {default}] [currently: {current}]')
+              .format(default=o.defval, current=_get_option(k, True)))
 
     if d:
         s += u('\n    (Deprecated')
-        s += (u(', use `%s` instead.') % d.rkey if d.rkey else '')
+        s += (u(', use `{rkey}` instead.').format(rkey=d.rkey)
+              if d.rkey else '')
         s += u(')')
 
     s += '\n\n'
@@ -718,7 +721,7 @@ def config_prefix(prefix):
 
     def wrap(func):
         def inner(key, *args, **kwds):
-            pkey = '%s.%s' % (prefix, key)
+            pkey = '{prefix}.{key}'.format(prefix=prefix, key=key)
             return func(pkey, *args, **kwds)
 
         return inner
@@ -754,7 +757,8 @@ def is_type_factory(_type):
 
     def inner(x):
         if type(x) != _type:
-            raise ValueError("Value must have type '%s'" % str(_type))
+            msg = "Value must have type '{typ!s}'"
+            raise ValueError(msg.format(typ=_type))
 
     return inner
 
@@ -777,11 +781,12 @@ def is_instance_factory(_type):
         from pandas.io.formats.printing import pprint_thing
         type_repr = "|".join(map(pprint_thing, _type))
     else:
-        type_repr = "'%s'" % _type
+        type_repr = "'{typ}'".format(typ=_type)
 
     def inner(x):
         if not isinstance(x, _type):
-            raise ValueError("Value must be an instance of %s" % type_repr)
+            msg = "Value must be an instance of {type_repr}"
+            raise ValueError(msg.format(type_repr=type_repr))
 
     return inner
 
@@ -797,10 +802,10 @@ def inner(x):
 
         if not any([c(x) for c in callables]):
             pp_values = pp("|".join(lmap(pp, legal_values)))
-            msg = "Value must be one of {0}".format(pp_values)
+            msg = "Value must be one of {pp_values}"
             if len(callables):
                 msg += " or a callable"
-            raise ValueError(msg)
+            raise ValueError(msg.format(pp_values=pp_values))
 
     return inner
 
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index ea5c213dbe0577..5652424a8f75b7 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -453,10 +453,10 @@ def use_inf_as_na_cb(key):
         cf.register_option(ext + '.writer', default, doc, validator=str)
 
     def _register_xlsx(engine, other):
-        cf.register_option('xlsx.writer', engine,
-                           writer_engine_doc.format(ext='xlsx', default=engine,
-                                                    others=", '%s'" % other),
-                           validator=str)
+        others = ", '{other}'".format(other=other)
+        doc = writer_engine_doc.format(ext='xlsx', default=engine,
+                                       others=others)
+        cf.register_option('xlsx.writer', engine, doc, validator=str)
 
     try:
         # better memory footprint

From 2781b18008a7dca575a4f3496c8f11c1ea05cced Mon Sep 17 00:00:00 2001
From: jschendel
Date: Mon, 2 Oct 2017 05:28:30 -0600
Subject: [PATCH 185/188] DEPR: Deprecate cdate_range and merge into
 bdate_range (#17691)

---
 doc/source/api.rst                            |   1 -
 doc/source/timeseries.rst                     | 241 ++++++++++--------
 doc/source/whatsnew/v0.21.0.txt               |   9 +-
 pandas/core/api.py                            |   3 +-
 pandas/core/indexes/datetimes.py              |  38 ++-
 pandas/tests/api/test_api.py                  |  12 +-
 .../indexes/datetimes/test_date_range.py      | 189 ++++++++------
 pandas/tests/indexes/datetimes/test_ops.py    |   8 +-
 pandas/tests/indexes/datetimes/test_setops.py |   7 +-
 pandas/tseries/offsets.py                     |   3 +-
 10 files changed, 299 insertions(+), 212 deletions(-)

diff --git a/doc/source/api.rst b/doc/source/api.rst
index 4ffeb5035912f5..28d4567027572f 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -218,7 +218,6 @@ Top-level dealing with datetimelike
    to_timedelta
    date_range
    bdate_range
-   cdate_range
    period_range
    timedelta_range
    infer_freq
diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst
index 7399deb1319d88..d2d5ee344591ae 100644
--- a/doc/source/timeseries.rst
+++ b/doc/source/timeseries.rst
@@ -76,21 +76,21 @@ Overview
Following table shows the type of time-related classes pandas can handle and how to create them. -================= =============================== ================================================== +================= =============================== =================================================================== Class Remarks How to create -================= =============================== ================================================== -``Timestamp`` Represents a single time stamp ``to_datetime``, ``Timestamp`` -``DatetimeIndex`` Index of ``Timestamp`` ``to_datetime``, ``date_range``, ``DatetimeIndex`` +================= =============================== =================================================================== +``Timestamp`` Represents a single timestamp ``to_datetime``, ``Timestamp`` +``DatetimeIndex`` Index of ``Timestamp`` ``to_datetime``, ``date_range``, ``bdate_range``, ``DatetimeIndex`` ``Period`` Represents a single time span ``Period`` ``PeriodIndex`` Index of ``Period`` ``period_range``, ``PeriodIndex`` -================= =============================== ================================================== +================= =============================== =================================================================== .. _timeseries.representation: -Time Stamps vs. Time Spans --------------------------- +Timestamps vs. Time Spans +------------------------- -Time-stamped data is the most basic type of timeseries data that associates +Timestamped data is the most basic type of time series data that associates values with points in time. For pandas objects it means using the points in time. @@ -149,10 +149,10 @@ future releases. Converting to Timestamps ------------------------ -To convert a Series or list-like object of date-like objects e.g. strings, +To convert a ``Series`` or list-like object of date-like objects e.g. strings, epochs, or a mixture, you can use the ``to_datetime`` function. When passed -a Series, this returns a Series (with the same index), while a list-like -is converted to a DatetimeIndex: +a ``Series``, this returns a ``Series`` (with the same index), while a list-like +is converted to a ``DatetimeIndex``: .. ipython:: python @@ -175,11 +175,9 @@ you can pass the ``dayfirst`` flag: can't be parsed with the day being first it will be parsed as if ``dayfirst`` were False. -If you pass a single string to ``to_datetime``, it returns single ``Timestamp``. - -Also, ``Timestamp`` can accept the string input. -Note that ``Timestamp`` doesn't accept string parsing option like ``dayfirst`` -or ``format``, use ``to_datetime`` if these are required. +If you pass a single string to ``to_datetime``, it returns a single ``Timestamp``. +``Timestamp`` can also accept string input, but it doesn't accept string parsing +options like ``dayfirst`` or ``format``, so use ``to_datetime`` if these are required. .. ipython:: python @@ -191,9 +189,7 @@ Providing a Format Argument ~~~~~~~~~~~~~~~~~~~~~~~~~~~ In addition to the required datetime string, a ``format`` argument can be passed to ensure specific parsing. -It will potentially speed up the conversion considerably. - -For example: +This could also potentially speed up the conversion considerably. .. ipython:: python @@ -203,7 +199,7 @@ For example: For more information on how to specify the ``format`` options, see https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior. 
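As a brief sketch of the parsing behavior described above (the sample strings here are illustrative, not taken from the patch), an explicit ``format`` removes the ambiguity that ``dayfirst`` only expresses as a preference:

.. code-block:: python

    import pandas as pd

    # Without a format, '01-02-2011' is ambiguous; dayfirst states a preference.
    pd.to_datetime('01-02-2011', dayfirst=True)      # Timestamp('2011-02-01 00:00:00')

    # An explicit format is unambiguous and can also speed up parsing
    # considerably for long inputs.
    pd.to_datetime('01-02-2011', format='%d-%m-%Y')  # Timestamp('2011-02-01 00:00:00')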
-Assembling datetime from multiple DataFrame columns +Assembling Datetime from Multiple DataFrame Columns ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. versionadded:: 0.18.1 @@ -238,28 +234,24 @@ Invalid Data In version 0.17.0, the default for ``to_datetime`` is now ``errors='raise'``, rather than ``errors='ignore'``. This means that invalid parsing will raise rather that return the original input as in previous versions. -Pass ``errors='coerce'`` to convert invalid data to ``NaT`` (not a time): - -Raise when unparseable, this is the default +The default behavior, ``errors='raise'``, is to raise when unparseable: .. code-block:: ipython In [2]: pd.to_datetime(['2009/07/31', 'asd'], errors='raise') ValueError: Unknown string format -Return the original input when unparseable +Pass ``errors='ignore'`` to return the original input when unparseable: -.. code-block:: ipython +.. ipython:: python - In [4]: pd.to_datetime(['2009/07/31', 'asd'], errors='ignore') - Out[4]: array(['2009/07/31', 'asd'], dtype=object) + pd.to_datetime(['2009/07/31', 'asd'], errors='ignore') -Return NaT for input when unparseable +Pass ``errors='coerce'`` to convert unparseable data to ``NaT`` (not a time): -.. code-block:: ipython +.. ipython:: python - In [6]: pd.to_datetime(['2009/07/31', 'asd'], errors='coerce') - Out[6]: DatetimeIndex(['2009-07-31', 'NaT'], dtype='datetime64[ns]', freq=None) + pd.to_datetime(['2009/07/31', 'asd'], errors='coerce') .. _timeseries.converting.epoch: @@ -267,12 +259,11 @@ Return NaT for input when unparseable Epoch Timestamps ~~~~~~~~~~~~~~~~ -It's also possible to convert integer or float epoch times. The default unit -for these is nanoseconds (since these are how ``Timestamp`` s are stored). However, -often epochs are stored in another ``unit`` which can be specified. These are computed -from the starting point specified by the :ref:`Origin Parameter `. - -Typical epoch stored units +pandas supports converting integer or float epoch times to ``Timestamp`` and +``DatetimeIndex``. The default unit is nanoseconds, since that is how ``Timestamp`` +objects are stored internally. However, epochs are often stored in another ``unit`` +which can be specified. These are computed from the starting point specified by the +``origin`` parameter. .. ipython:: python @@ -299,6 +290,10 @@ Typical epoch stored units pd.to_datetime([1490195805.433, 1490195805.433502912], unit='s') pd.to_datetime(1490195805433502912, unit='ns') +.. seealso:: + + :ref:`timeseries.origin` + .. _timeseries.converting.epoch_inverse: From Timestamps to Epoch @@ -319,15 +314,13 @@ We convert the ``DatetimeIndex`` to an ``int64`` array, then divide by the conve .. _timeseries.origin: -Using the Origin Parameter -~~~~~~~~~~~~~~~~~~~~~~~~~~ +Using the ``origin`` Parameter +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. versionadded:: 0.20.0 Using the ``origin`` parameter, one can specify an alternative starting point for creation -of a ``DatetimeIndex``. - -Start with 1960-01-01 as the starting date +of a ``DatetimeIndex``. For example, to use 1960-01-01 as the starting date: .. ipython:: python @@ -345,8 +338,8 @@ Commonly called 'unix epoch' or POSIX time. Generating Ranges of Timestamps ------------------------------- -To generate an index with time stamps, you can use either the DatetimeIndex or -Index constructor and pass in a list of datetime objects: +To generate an index with timestamps, you can use either the ``DatetimeIndex`` or +``Index`` constructor and pass in a list of datetime objects: .. 
ipython:: python @@ -360,37 +353,36 @@ Index constructor and pass in a list of datetime objects: index = pd.Index(dates) index -Practically, this becomes very cumbersome because we often need a very long +In practice this becomes very cumbersome because we often need a very long index with a large number of timestamps. If we need timestamps on a regular -frequency, we can use the pandas functions ``date_range`` and ``bdate_range`` -to create timestamp indexes. +frequency, we can use the :func:`date_range` and :func:`bdate_range` functions +to create a ``DatetimeIndex``. The default frequency for ``date_range`` is a +**calendar day** while the default for ``bdate_range`` is a **business day**: .. ipython:: python - index = pd.date_range('2000-1-1', periods=1000, freq='M') + start = datetime(2011, 1, 1) + end = datetime(2012, 1, 1) + + index = pd.date_range(start, end) index - index = pd.bdate_range('2012-1-1', periods=250) + index = pd.bdate_range(start, end) index -Convenience functions like ``date_range`` and ``bdate_range`` utilize a -variety of frequency aliases. The default frequency for ``date_range`` is a -**calendar day** while the default for ``bdate_range`` is a **business day** +Convenience functions like ``date_range`` and ``bdate_range`` can utilize a +variety of :ref:`frequency aliases `: .. ipython:: python - start = datetime(2011, 1, 1) - end = datetime(2012, 1, 1) - - rng = pd.date_range(start, end) - rng + pd.date_range(start, periods=1000, freq='M') - rng = pd.bdate_range(start, end) - rng + pd.bdate_range(start, periods=250, freq='BQS') ``date_range`` and ``bdate_range`` make it easy to generate a range of dates -using various combinations of parameters like ``start``, ``end``, -``periods``, and ``freq``: +using various combinations of parameters like ``start``, ``end``, ``periods``, +and ``freq``. The start and end dates are strictly inclusive, so dates outside +of those specified will not be generated: .. ipython:: python @@ -402,15 +394,45 @@ using various combinations of parameters like ``start``, ``end``, pd.bdate_range(start=start, periods=20) -The start and end dates are strictly inclusive. So it will not generate any -dates outside of those dates if specified. +.. _timeseries.custom-freq-ranges: + +Custom Frequency Ranges +~~~~~~~~~~~~~~~~~~~~~~~ + +.. warning:: + + This functionality was originally exclusive to ``cdate_range``, which is + deprecated as of version 0.21.0 in favor of ``bdate_range``. Note that + ``cdate_range`` only utilizes the ``weekmask`` and ``holidays`` parameters + when custom business day, 'C', is passed as the frequency string. Support has + been expanded with ``bdate_range`` to work with any custom frequency string. + +.. versionadded:: 0.21.0 + +``bdate_range`` can also generate a range of custom frequency dates by using +the ``weekmask`` and ``holidays`` parameters. These parameters will only be +used if a custom frequency string is passed. + +.. ipython:: python + + weekmask = 'Mon Wed Fri' + + holidays = [datetime(2011, 1, 5), datetime(2011, 3, 14)] + + pd.bdate_range(start, end, freq='C', weekmask=weekmask, holidays=holidays) + + pd.bdate_range(start, end, freq='CBMS', weekmask=weekmask) + +.. seealso:: + + :ref:`timeseries.custombusinessdays` .. 
_timeseries.timestamp-limits: -Timestamp limitations +Timestamp Limitations --------------------- -Since pandas represents timestamps in nanosecond resolution, the timespan that +Since pandas represents timestamps in nanosecond resolution, the time span that can be represented using a 64-bit integer is limited to approximately 584 years: .. ipython:: python @@ -418,7 +440,9 @@ can be represented using a 64-bit integer is limited to approximately 584 years: pd.Timestamp.min pd.Timestamp.max -See :ref:`here ` for ways to represent data outside these bound. +.. seealso:: + + :ref:`timeseries.oob` .. _timeseries.datetimeindex: @@ -426,20 +450,20 @@ Indexing -------- One of the main uses for ``DatetimeIndex`` is as an index for pandas objects. -The ``DatetimeIndex`` class contains many timeseries related optimizations: +The ``DatetimeIndex`` class contains many time series related optimizations: - A large range of dates for various offsets are pre-computed and cached under the hood in order to make generating subsequent date ranges very fast (just have to grab a slice) - Fast shifting using the ``shift`` and ``tshift`` method on pandas objects - - Unioning of overlapping DatetimeIndex objects with the same frequency is + - Unioning of overlapping ``DatetimeIndex`` objects with the same frequency is very fast (important for fast data alignment) - Quick access to date fields via properties such as ``year``, ``month``, etc. - Regularization functions like ``snap`` and very fast ``asof`` logic -DatetimeIndex objects has all the basic functionality of regular Index objects -and a smorgasbord of advanced timeseries-specific methods for easy frequency -processing. +``DatetimeIndex`` objects have all the basic functionality of regular ``Index`` +objects, and a smorgasbord of advanced time series specific methods for easy +frequency processing. .. seealso:: :ref:`Reindexing methods ` @@ -447,8 +471,7 @@ processing. .. note:: While pandas does not force you to have a sorted date index, some of these - methods may have unexpected or incorrect behavior if the dates are - unsorted. So please be careful. + methods may have unexpected or incorrect behavior if the dates are unsorted. ``DatetimeIndex`` can be used like a regular index and offers all of its intelligent functionality like selection, slicing, etc. @@ -466,7 +489,7 @@ intelligent functionality like selection, slicing, etc. Partial String Indexing ~~~~~~~~~~~~~~~~~~~~~~~ -You can pass in dates and strings that parse to dates as indexing parameters: +Dates and strings that parse to timestamps can be passed as indexing parameters: .. ipython:: python @@ -485,9 +508,9 @@ the year or year and month as strings: ts['2011-6'] -This type of slicing will work on a DataFrame with a ``DateTimeIndex`` as well. Since the +This type of slicing will work on a ``DataFrame`` with a ``DatetimeIndex`` as well. Since the partial string selection is a form of label slicing, the endpoints **will be** included. This -would include matching times on an included date. Here's an example: +would include matching times on an included date: .. ipython:: python @@ -523,7 +546,7 @@ We are stopping on the included end-point as it is part of the index .. versionadded:: 0.18.0 -DatetimeIndex Partial String Indexing also works on DataFrames with a ``MultiIndex``. For example: +``DatetimeIndex`` partial string indexing also works on a ``DataFrame`` with a ``MultiIndex``: .. 
ipython:: python @@ -541,14 +564,14 @@ DatetimeIndex Partial String Indexing also works on DataFrames with a ``MultiInd .. _timeseries.slice_vs_exact_match: -Slice vs. exact match +Slice vs. Exact Match ~~~~~~~~~~~~~~~~~~~~~ .. versionchanged:: 0.20.0 -The same string used as an indexing parameter can be treated either as a slice or as an exact match depending on the resolution of an index. If the string is less accurate than the index, it will be treated as a slice, otherwise as an exact match. +The same string used as an indexing parameter can be treated either as a slice or as an exact match depending on the resolution of the index. If the string is less accurate than the index, it will be treated as a slice, otherwise as an exact match. -For example, let us consider ``Series`` object which index has minute resolution. +Consider a ``Series`` object with a minute resolution index: .. ipython:: python @@ -593,7 +616,7 @@ If the timestamp string is treated as a slice, it can be used to index ``DataFra .. warning:: - However if the string is treated as an exact match, the selection in ``DataFrame``'s ``[]`` will be column-wise and not row-wise, see :ref:`Indexing Basics `. For example ``dft_minute['2011-12-31 23:59']`` will raise ``KeyError`` as ``'2012-12-31 23:59'`` has the same resolution as index and there is no column with such name: + However, if the string is treated as an exact match, the selection in ``DataFrame``'s ``[]`` will be column-wise and not row-wise, see :ref:`Indexing Basics `. For example ``dft_minute['2011-12-31 23:59']`` will raise ``KeyError`` as ``'2012-12-31 23:59'`` has the same resolution as the index and there is no column with such name: To *always* have unambiguous selection, whether the row is treated as a slice or a single selection, use ``.loc``. @@ -616,7 +639,7 @@ Note also that ``DatetimeIndex`` resolution cannot be less precise than day. Exact Indexing ~~~~~~~~~~~~~~ -As discussed in previous section, indexing a ``DateTimeIndex`` with a partial string depends on the "accuracy" of the period, in other words how specific the interval is in relation to the resolution of the index. In contrast, indexing with ``Timestamp`` or ``datetime`` objects is exact, because the objects have exact meaning. These also follow the semantics of *including both endpoints*. +As discussed in previous section, indexing a ``DatetimeIndex`` with a partial string depends on the "accuracy" of the period, in other words how specific the interval is in relation to the resolution of the index. In contrast, indexing with ``Timestamp`` or ``datetime`` objects is exact, because the objects have exact meaning. These also follow the semantics of *including both endpoints*. These ``Timestamp`` and ``datetime`` objects have exact ``hours, minutes,`` and ``seconds``, even though they were not explicitly specified (they are ``0``). @@ -640,8 +663,8 @@ A ``truncate`` convenience function is provided that is equivalent to slicing: ts.truncate(before='10/31/2011', after='12/31/2011') -Even complicated fancy indexing that breaks the DatetimeIndex's frequency -regularity will result in a ``DatetimeIndex`` (but frequency is lost): +Even complicated fancy indexing that breaks the ``DatetimeIndex`` frequency +regularity will result in a ``DatetimeIndex``, although frequency is lost: .. 
ipython:: python @@ -652,7 +675,7 @@ regularity will result in a ``DatetimeIndex`` (but frequency is lost): Time/Date Components -------------------- -There are several time/date properties that one can access from ``Timestamp`` or a collection of timestamps like a ``DateTimeIndex``. +There are several time/date properties that one can access from ``Timestamp`` or a collection of timestamps like a ``DatetimeIndex``. .. csv-table:: :header: "Property", "Description" @@ -688,10 +711,10 @@ Furthermore, if you have a ``Series`` with datetimelike values, then you can acc .. _timeseries.offsets: -DateOffset objects +DateOffset Objects ------------------ -In the preceding examples, we created DatetimeIndex objects at various +In the preceding examples, we created ``DatetimeIndex`` objects at various frequencies by passing in :ref:`frequency strings ` like 'M', 'W', and 'BM to the ``freq`` keyword. Under the hood, these frequency strings are being translated into an instance of pandas ``DateOffset``, @@ -704,7 +727,7 @@ which represents a regular frequency increment. Specific offset logic like DateOffset, "Generic offset class, defaults to 1 calendar day" BDay, "business day (weekday)" - CDay, "custom business day (experimental)" + CDay, "custom business day" Week, "one week, optionally anchored on a day of the week" WeekOfMonth, "the x-th day of the y-th week of each month" LastWeekOfMonth, "the x-th day of the last week of each month" @@ -805,7 +828,7 @@ These operations (``apply``, ``rollforward`` and ``rollback``) preserves time (h hour.apply(pd.Timestamp('2014-01-01 23:00')) -Parametric offsets +Parametric Offsets ~~~~~~~~~~~~~~~~~~ Some of the offsets can be "parameterized" when created to result in different @@ -840,7 +863,7 @@ Another example is parameterizing ``YearEnd`` with the specific ending month: .. _timeseries.offsetseries: -Using offsets with ``Series`` / ``DatetimeIndex`` +Using Offsets with ``Series`` / ``DatetimeIndex`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Offsets can be used with either a ``Series`` or ``DatetimeIndex`` to @@ -1091,7 +1114,7 @@ frequencies. We will refer to these aliases as *offset aliases*. :widths: 15, 100 "B", "business day frequency" - "C", "custom business day frequency (experimental)" + "C", "custom business day frequency" "D", "calendar day frequency" "W", "weekly frequency" "M", "month end frequency" @@ -1326,10 +1349,10 @@ or calendars with additional rules. .. _timeseries.advanced_datetime: -Time series-related instance methods +Time Series-Related Instance Methods ------------------------------------ -Shifting / lagging +Shifting / Lagging ~~~~~~~~~~~~~~~~~~ One may want to *shift* or *lag* the values in a time series back and forward in @@ -1360,7 +1383,7 @@ all the dates in the index by a specified number of offsets: Note that with ``tshift``, the leading entry is no longer NaN because the data is not being realigned. -Frequency conversion +Frequency Conversion ~~~~~~~~~~~~~~~~~~~~ The primary function for changing frequencies is the ``asfreq`` function. @@ -1381,13 +1404,13 @@ method for any gaps that may appear after the frequency conversion ts.asfreq(BDay(), method='pad') -Filling forward / backward +Filling Forward / Backward ~~~~~~~~~~~~~~~~~~~~~~~~~~ Related to ``asfreq`` and ``reindex`` is the ``fillna`` function documented in the :ref:`missing data section `. 
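To make the relationship between ``asfreq`` and filling concrete, here is a minimal sketch (the dates are made up for illustration): upsampling an irregular series to business-day frequency introduces missing values, which a fill ``method`` then propagates:

.. code-block:: python

    import pandas as pd
    from pandas.tseries.offsets import BDay

    # Two observations separated by a gap of two business days
    # (2011-01-07 is a Friday, 2011-01-12 the following Wednesday).
    ts = pd.Series([1.0, 2.0],
                   index=pd.to_datetime(['2011-01-07', '2011-01-12']))

    ts.asfreq(BDay())                # Mon 10th and Tue 11th become NaN
    ts.asfreq(BDay(), method='pad')  # NaNs filled by carrying the last value forward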
-Converting to Python datetimes +Converting to Python Datetimes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ``DatetimeIndex`` can be converted to an array of Python native datetime.datetime objects using the @@ -1471,10 +1494,10 @@ labels. ts.resample('5Min', label='left', loffset='1s').mean() The ``axis`` parameter can be set to 0 or 1 and allows you to resample the -specified axis for a DataFrame. +specified axis for a ``DataFrame``. ``kind`` can be set to 'timestamp' or 'period' to convert the resulting index -to/from time-stamp and time-span representations. By default ``resample`` +to/from timestamp and time span representations. By default ``resample`` retains the input representation. ``convention`` can be set to 'start' or 'end' when resampling period data @@ -1482,8 +1505,8 @@ retains the input representation. frequency periods. -Up Sampling -~~~~~~~~~~~ +Upsampling +~~~~~~~~~~ For upsampling, you can specify a way to upsample and the ``limit`` parameter to interpolate over the gaps that are created: @@ -1559,13 +1582,13 @@ We can select a specific column or columns using standard getitem. r[['A','B']].mean() -You can pass a list or dict of functions to do aggregation with, outputting a DataFrame: +You can pass a list or dict of functions to do aggregation with, outputting a ``DataFrame``: .. ipython:: python r['A'].agg([np.sum, np.mean, np.std]) -On a resampled DataFrame, you can pass a list of functions to apply to each +On a resampled ``DataFrame``, you can pass a list of functions to apply to each column, which produces an aggregated result with a hierarchical index: .. ipython:: python @@ -1573,7 +1596,7 @@ column, which produces an aggregated result with a hierarchical index: r.agg([np.sum, np.mean]) By passing a dict to ``aggregate`` you can apply a different aggregation to the -columns of a DataFrame: +columns of a ``DataFrame``: .. ipython:: python :okexcept: @@ -1890,7 +1913,7 @@ frequencies ``Q-JAN`` through ``Q-DEC``. .. _timeseries.interchange: -Converting between Representations +Converting Between Representations ---------------------------------- Timestamped data can be converted to PeriodIndex-ed data using ``to_period`` @@ -1934,7 +1957,7 @@ the quarter end: .. _timeseries.oob: -Representing out-of-bounds spans +Representing Out-of-Bounds Spans -------------------------------- If you have data that is outside of the ``Timestamp`` bounds, see :ref:`Timestamp limitations `, @@ -2031,7 +2054,7 @@ which gives you more control over which time zone is used: rng_dateutil.tz == tz_dateutil Timestamps, like Python's ``datetime.datetime`` object can be either time zone -naive or time zone aware. Naive time series and DatetimeIndex objects can be +naive or time zone aware. Naive time series and ``DatetimeIndex`` objects can be *localized* using ``tz_localize``: .. ipython:: python @@ -2099,8 +2122,8 @@ Localization of ``Timestamp`` functions just like ``DatetimeIndex`` and ``Series rng[5].tz_localize('Asia/Shanghai') -Operations between Series in different time zones will yield UTC -Series, aligning the data on the UTC timestamps: +Operations between ``Series`` in different time zones will yield UTC +``Series``, aligning the data on the UTC timestamps: .. ipython:: python @@ -2180,7 +2203,7 @@ constructor as well as ``tz_localize``. .. _timeseries.timezone_series: -TZ aware Dtypes +TZ Aware Dtypes ~~~~~~~~~~~~~~~ .. 
versionadded:: 0.17.0 diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index c8a0a6bff5cc75..d69a5c22acc035 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -488,7 +488,7 @@ Additionally, DataFrames with datetime columns that were parsed by :func:`read_s Consistency of Range Functions ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In previous versions, there were some inconsistencies between the various range functions: :func:`date_range`, :func:`bdate_range`, :func:`cdate_range`, :func:`period_range`, :func:`timedelta_range`, and :func:`interval_range`. (:issue:`17471`). +In previous versions, there were some inconsistencies between the various range functions: :func:`date_range`, :func:`bdate_range`, :func:`period_range`, :func:`timedelta_range`, and :func:`interval_range`. (:issue:`17471`). One of the inconsistent behaviors occurred when the ``start``, ``end`` and ``period`` parameters were all specified, potentially leading to ambiguous ranges. When all three parameters were passed, ``interval_range`` ignored the ``period`` parameter, ``period_range`` ignored the ``end`` parameter, and the other range functions raised. To promote consistency among the range functions, and avoid potentially ambiguous ranges, ``interval_range`` and ``period_range`` will now raise when all three parameters are passed. @@ -571,8 +571,9 @@ Deprecations - :func:`SeriesGroupBy.nth` has deprecated ``True`` in favor of ``'all'`` for its kwarg ``dropna`` (:issue:`11038`). - :func:`DataFrame.as_blocks` is deprecated, as this is exposing the internal implementation (:issue:`17302`) - ``pd.TimeGrouper`` is deprecated in favor of :class:`pandas.Grouper` (:issue:`16747`) +- ``cdate_range`` has been deprecated in favor of :func:`bdate_range`, which has gained ``weekmask`` and ``holidays`` parameters for building custom frequency date ranges. See the :ref:`documentation ` for more details (:issue:`17596`) -.. _whatsnew_0210.deprecations.argmin_min +.. 
_whatsnew_0210.deprecations.argmin_min: Series.argmax and Series.argmin ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -738,9 +739,9 @@ Numeric Categorical ^^^^^^^^^^^ -- Bug in :func:`Series.isin` when called with a categorical (:issue`16639`) +- Bug in :func:`Series.isin` when called with a categorical (:issue:`16639`) - Bug in the categorical constructor with empty values and categories causing the ``.categories`` to be an empty ``Float64Index`` rather than an empty ``Index`` with object dtype (:issue:`17248`) -- Bug in categorical operations with :ref:`Series.cat ' not preserving the original Series' name (:issue:`17509`) +- Bug in categorical operations with :ref:`Series.cat ` not preserving the original Series' name (:issue:`17509`) PyPy ^^^^ diff --git a/pandas/core/api.py b/pandas/core/api.py index a012ccce839653..2f818a400162b3 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -16,8 +16,7 @@ PeriodIndex, NaT) from pandas.core.indexes.period import Period, period_range, pnow from pandas.core.indexes.timedeltas import Timedelta, timedelta_range -from pandas.core.indexes.datetimes import (Timestamp, date_range, bdate_range, - cdate_range) +from pandas.core.indexes.datetimes import Timestamp, date_range, bdate_range from pandas.core.indexes.interval import Interval, interval_range from pandas.core.series import Series diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 9127864eab8a16..1419da3fa8861b 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -17,6 +17,7 @@ is_period_dtype, is_bool_dtype, is_string_dtype, + is_string_like, is_list_like, is_scalar, pandas_dtype, @@ -37,7 +38,8 @@ Resolution) from pandas.core.indexes.datetimelike import ( DatelikeOps, TimelikeOps, DatetimeIndexOpsMixin) -from pandas.tseries.offsets import DateOffset, generate_range, Tick, CDay +from pandas.tseries.offsets import ( + DateOffset, generate_range, Tick, CDay, prefix_mapping) from pandas.core.tools.datetimes import ( parse_time_string, normalize_date, to_time) from pandas.core.tools.timedeltas import to_timedelta @@ -2049,7 +2051,8 @@ def date_range(start=None, end=None, periods=None, freq='D', tz=None, def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, - normalize=True, name=None, closed=None, **kwargs): + normalize=True, name=None, weekmask=None, holidays=None, + closed=None, **kwargs): """ Return a fixed frequency DatetimeIndex, with business day as the default frequency @@ -2071,6 +2074,20 @@ def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, Normalize start/end dates to midnight before generating date range name : string, default None Name of the resulting DatetimeIndex + weekmask : string or None, default None + Weekmask of valid business days, passed to ``numpy.busdaycalendar``, + only used when custom frequency strings are passed. The default + value None is equivalent to 'Mon Tue Wed Thu Fri' + + .. versionadded:: 0.21.0 + + holidays : list-like or None, default None + Dates to exclude from the set of valid business days, passed to + ``numpy.busdaycalendar``, only used when custom frequency strings + are passed + + .. 
versionadded:: 0.21.0 + closed : string, default None Make the interval closed with respect to the given frequency to the 'left', 'right', or both sides (None) @@ -2088,6 +2105,18 @@ def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, rng : DatetimeIndex """ + if is_string_like(freq) and freq.startswith('C'): + try: + weekmask = weekmask or 'Mon Tue Wed Thu Fri' + freq = prefix_mapping[freq](holidays=holidays, weekmask=weekmask) + except (KeyError, TypeError): + msg = 'invalid custom frequency string: {freq}'.format(freq=freq) + raise ValueError(msg) + elif holidays or weekmask: + msg = ('a custom frequency string is required when holidays or ' + 'weekmask are passed, got frequency {freq}').format(freq=freq) + raise ValueError(msg) + return DatetimeIndex(start=start, end=end, periods=periods, freq=freq, tz=tz, normalize=normalize, name=name, closed=closed, **kwargs) @@ -2099,6 +2128,8 @@ def cdate_range(start=None, end=None, periods=None, freq='C', tz=None, Return a fixed frequency DatetimeIndex, with CustomBusinessDay as the default frequency + .. deprecated:: 0.21.0 + Parameters ---------- start : string or datetime-like, default None @@ -2137,6 +2168,9 @@ def cdate_range(start=None, end=None, periods=None, freq='C', tz=None, ------- rng : DatetimeIndex """ + warnings.warn("cdate_range is deprecated and will be removed in a future " + "version, instead use pd.bdate_range(..., freq='{freq}')" + .format(freq=freq), FutureWarning, stacklevel=2) if freq == 'C': holidays = kwargs.pop('holidays', []) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index c593290410b961..fad455d6391c33 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -63,7 +63,7 @@ class TestPDApi(Base): # top-level functions funcs = ['bdate_range', 'concat', 'crosstab', 'cut', 'date_range', 'interval_range', 'eval', - 'factorize', 'get_dummies', 'cdate_range', + 'factorize', 'get_dummies', 'infer_freq', 'isna', 'isnull', 'lreshape', 'melt', 'notna', 'notnull', 'offsets', 'merge', 'merge_ordered', 'merge_asof', @@ -240,3 +240,13 @@ def test_deprecation_access_func(self): [c1, c2], sort_categories=True, ignore_order=True) + + +class TestCDateRange(object): + + def test_deprecation_cdaterange(self): + # GH17596 + from pandas.core.indexes.datetimes import cdate_range + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + cdate_range('2017-01-01', '2017-12-31') diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index c373942cb4c63c..3b40ef092f3643 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -1,6 +1,5 @@ """ -test date_range, bdate_range, cdate_range -construction from the convenience range functions +test date_range, bdate_range construction from the convenience range functions """ import pytest @@ -12,10 +11,9 @@ import pandas as pd import pandas.util.testing as tm from pandas import compat -from pandas.core.indexes.datetimes import bdate_range, cdate_range -from pandas import date_range, offsets, DatetimeIndex, Timestamp -from pandas.tseries.offsets import (generate_range, CDay, BDay, - DateOffset, MonthEnd) +from pandas import date_range, bdate_range, offsets, DatetimeIndex, Timestamp +from pandas.tseries.offsets import (generate_range, CDay, BDay, DateOffset, + MonthEnd, prefix_mapping) from pandas.tests.series.common import TestData @@ -241,9 +239,6 @@ def 
test_precision_finer_than_offset(self): class TestBusinessDateRange(object): - def setup_method(self, method): - self.rng = bdate_range(START, END) - def test_constructor(self): bdate_range(START, END, freq=BDay()) bdate_range(START, periods=20, freq=BDay()) @@ -258,33 +253,31 @@ def test_constructor(self): def test_naive_aware_conflicts(self): naive = bdate_range(START, END, freq=BDay(), tz=None) - aware = bdate_range(START, END, freq=BDay(), - tz="Asia/Hong_Kong") - tm.assert_raises_regex(TypeError, "tz-naive.*tz-aware", - naive.join, aware) - tm.assert_raises_regex(TypeError, "tz-naive.*tz-aware", - aware.join, naive) + aware = bdate_range(START, END, freq=BDay(), tz="Asia/Hong_Kong") + + msg = 'tz-naive.*tz-aware' + with tm.assert_raises_regex(TypeError, msg): + naive.join(aware) + + with tm.assert_raises_regex(TypeError, msg): + aware.join(naive) def test_cached_range(self): DatetimeIndex._cached_range(START, END, offset=BDay()) DatetimeIndex._cached_range(START, periods=20, offset=BDay()) DatetimeIndex._cached_range(end=START, periods=20, offset=BDay()) - tm.assert_raises_regex(TypeError, "offset", - DatetimeIndex._cached_range, - START, END) + with tm.assert_raises_regex(TypeError, "offset"): + DatetimeIndex._cached_range(START, END) - tm.assert_raises_regex(TypeError, "specify period", - DatetimeIndex._cached_range, START, - offset=BDay()) + with tm.assert_raises_regex(TypeError, "specify period"): + DatetimeIndex._cached_range(START, offset=BDay()) - tm.assert_raises_regex(TypeError, "specify period", - DatetimeIndex._cached_range, end=END, - offset=BDay()) + with tm.assert_raises_regex(TypeError, "specify period"): + DatetimeIndex._cached_range(end=END, offset=BDay()) - tm.assert_raises_regex(TypeError, "start or end", - DatetimeIndex._cached_range, periods=20, - offset=BDay()) + with tm.assert_raises_regex(TypeError, "start or end"): + DatetimeIndex._cached_range(periods=20, offset=BDay()) def test_cached_range_bug(self): rng = date_range('2010-09-01 05:00:00', periods=50, @@ -300,8 +293,9 @@ def test_timezone_comparaison_bug(self): def test_timezone_comparaison_assert(self): start = Timestamp('20130220 10:00', tz='US/Eastern') - pytest.raises(AssertionError, date_range, start, periods=2, - tz='Europe/Berlin') + msg = 'Inferred time zone not equal to passed time zone' + with tm.assert_raises_regex(AssertionError, msg): + date_range(start, periods=2, tz='Europe/Berlin') def test_misc(self): end = datetime(2009, 5, 13) @@ -315,14 +309,17 @@ def test_misc(self): def test_date_parse_failure(self): badly_formed_date = '2007/100/1' - pytest.raises(ValueError, Timestamp, badly_formed_date) + with pytest.raises(ValueError): + Timestamp(badly_formed_date) + + with pytest.raises(ValueError): + bdate_range(start=badly_formed_date, periods=10) - pytest.raises(ValueError, bdate_range, start=badly_formed_date, - periods=10) - pytest.raises(ValueError, bdate_range, end=badly_formed_date, - periods=10) - pytest.raises(ValueError, bdate_range, badly_formed_date, - badly_formed_date) + with pytest.raises(ValueError): + bdate_range(end=badly_formed_date, periods=10) + + with pytest.raises(ValueError): + bdate_range(badly_formed_date, badly_formed_date) def test_daterange_bug_456(self): # GH #456 @@ -334,8 +331,9 @@ def test_daterange_bug_456(self): assert isinstance(result, DatetimeIndex) def test_error_with_zero_monthends(self): - pytest.raises(ValueError, date_range, '1/1/2000', '1/1/2001', - freq=MonthEnd(0)) + msg = 'Offset <0 \* MonthEnds> did not increment date' + with 
tm.assert_raises_regex(ValueError, msg): + date_range('1/1/2000', '1/1/2001', freq=MonthEnd(0)) def test_range_bug(self): # GH #770 @@ -343,8 +341,8 @@ def test_range_bug(self): result = date_range("2011-1-1", "2012-1-31", freq=offset) start = datetime(2011, 1, 1) - exp_values = [start + i * offset for i in range(5)] - tm.assert_index_equal(result, DatetimeIndex(exp_values)) + expected = DatetimeIndex([start + i * offset for i in range(5)]) + tm.assert_index_equal(result, expected) def test_range_tz_pytz(self): # see gh-2906 @@ -525,20 +523,18 @@ def test_freq_divides_end_in_nanos(self): class TestCustomDateRange(object): - def setup_method(self, method): - self.rng = cdate_range(START, END) def test_constructor(self): - cdate_range(START, END, freq=CDay()) - cdate_range(START, periods=20, freq=CDay()) - cdate_range(end=START, periods=20, freq=CDay()) + bdate_range(START, END, freq=CDay()) + bdate_range(START, periods=20, freq=CDay()) + bdate_range(end=START, periods=20, freq=CDay()) msg = 'periods must be a number, got C' with tm.assert_raises_regex(TypeError, msg): date_range('2011-1-1', '2012-1-1', 'C') with tm.assert_raises_regex(TypeError, msg): - cdate_range('2011-1-1', '2012-1-1', 'C') + bdate_range('2011-1-1', '2012-1-1', 'C') def test_cached_range(self): DatetimeIndex._cached_range(START, END, offset=CDay()) @@ -547,66 +543,93 @@ def test_cached_range(self): DatetimeIndex._cached_range(end=START, periods=20, offset=CDay()) - pytest.raises(Exception, DatetimeIndex._cached_range, START, END) + # with pytest.raises(TypeError): + with tm.assert_raises_regex(TypeError, "offset"): + DatetimeIndex._cached_range(START, END) - pytest.raises(Exception, DatetimeIndex._cached_range, START, - freq=CDay()) + # with pytest.raises(TypeError): + with tm.assert_raises_regex(TypeError, "specify period"): + DatetimeIndex._cached_range(START, offset=CDay()) - pytest.raises(Exception, DatetimeIndex._cached_range, end=END, - freq=CDay()) + # with pytest.raises(TypeError): + with tm.assert_raises_regex(TypeError, "specify period"): + DatetimeIndex._cached_range(end=END, offset=CDay()) - pytest.raises(Exception, DatetimeIndex._cached_range, periods=20, - freq=CDay()) + # with pytest.raises(TypeError): + with tm.assert_raises_regex(TypeError, "start or end"): + DatetimeIndex._cached_range(periods=20, offset=CDay()) def test_misc(self): end = datetime(2009, 5, 13) - dr = cdate_range(end=end, periods=20) + dr = bdate_range(end=end, periods=20, freq='C') firstDate = end - 19 * CDay() assert len(dr) == 20 assert dr[0] == firstDate assert dr[-1] == end - def test_date_parse_failure(self): - badly_formed_date = '2007/100/1' - - pytest.raises(ValueError, Timestamp, badly_formed_date) - - pytest.raises(ValueError, cdate_range, start=badly_formed_date, - periods=10) - pytest.raises(ValueError, cdate_range, end=badly_formed_date, - periods=10) - pytest.raises(ValueError, cdate_range, badly_formed_date, - badly_formed_date) - def test_daterange_bug_456(self): # GH #456 - rng1 = cdate_range('12/5/2011', '12/5/2011') - rng2 = cdate_range('12/2/2011', '12/5/2011') + rng1 = bdate_range('12/5/2011', '12/5/2011', freq='C') + rng2 = bdate_range('12/2/2011', '12/5/2011', freq='C') rng2.offset = CDay() result = rng1.union(rng2) assert isinstance(result, DatetimeIndex) def test_cdaterange(self): - rng = cdate_range('2013-05-01', periods=3) - xp = DatetimeIndex(['2013-05-01', '2013-05-02', '2013-05-03']) - tm.assert_index_equal(xp, rng) + result = bdate_range('2013-05-01', periods=3, freq='C') + expected = 
DatetimeIndex(['2013-05-01', '2013-05-02', '2013-05-03']) + tm.assert_index_equal(result, expected) def test_cdaterange_weekmask(self): - rng = cdate_range('2013-05-01', periods=3, - weekmask='Sun Mon Tue Wed Thu') - xp = DatetimeIndex(['2013-05-01', '2013-05-02', '2013-05-05']) - tm.assert_index_equal(xp, rng) + result = bdate_range('2013-05-01', periods=3, freq='C', + weekmask='Sun Mon Tue Wed Thu') + expected = DatetimeIndex(['2013-05-01', '2013-05-02', '2013-05-05']) + tm.assert_index_equal(result, expected) + + # raise with non-custom freq + msg = ('a custom frequency string is required when holidays or ' + 'weekmask are passed, got frequency B') + with tm.assert_raises_regex(ValueError, msg): + bdate_range('2013-05-01', periods=3, + weekmask='Sun Mon Tue Wed Thu') def test_cdaterange_holidays(self): - rng = cdate_range('2013-05-01', periods=3, holidays=['2013-05-01']) - xp = DatetimeIndex(['2013-05-02', '2013-05-03', '2013-05-06']) - tm.assert_index_equal(xp, rng) + result = bdate_range('2013-05-01', periods=3, freq='C', + holidays=['2013-05-01']) + expected = DatetimeIndex(['2013-05-02', '2013-05-03', '2013-05-06']) + tm.assert_index_equal(result, expected) + + # raise with non-custom freq + msg = ('a custom frequency string is required when holidays or ' + 'weekmask are passed, got frequency B') + with tm.assert_raises_regex(ValueError, msg): + bdate_range('2013-05-01', periods=3, holidays=['2013-05-01']) def test_cdaterange_weekmask_and_holidays(self): - rng = cdate_range('2013-05-01', periods=3, - weekmask='Sun Mon Tue Wed Thu', - holidays=['2013-05-01']) - xp = DatetimeIndex(['2013-05-02', '2013-05-05', '2013-05-06']) - tm.assert_index_equal(xp, rng) + result = bdate_range('2013-05-01', periods=3, freq='C', + weekmask='Sun Mon Tue Wed Thu', + holidays=['2013-05-01']) + expected = DatetimeIndex(['2013-05-02', '2013-05-05', '2013-05-06']) + tm.assert_index_equal(result, expected) + + # raise with non-custom freq + msg = ('a custom frequency string is required when holidays or ' + 'weekmask are passed, got frequency B') + with tm.assert_raises_regex(ValueError, msg): + bdate_range('2013-05-01', periods=3, + weekmask='Sun Mon Tue Wed Thu', + holidays=['2013-05-01']) + + @pytest.mark.parametrize('freq', [freq for freq in prefix_mapping + if freq.startswith('C')]) + def test_all_custom_freq(self, freq): + # should not raise + bdate_range(START, END, freq=freq, weekmask='Mon Wed Fri', + holidays=['2009-03-14']) + + bad_freq = freq + 'FOO' + msg = 'invalid custom frequency string: {freq}' + with tm.assert_raises_regex(ValueError, msg.format(freq=bad_freq)): + bdate_range(START, END, freq=bad_freq) diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 86e65feec04f36..7cb051d351444c 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -10,7 +10,6 @@ import pandas._libs.tslib as tslib import pandas.util.testing as tm from pandas.errors import PerformanceWarning -from pandas.core.indexes.datetimes import cdate_range from pandas import (DatetimeIndex, PeriodIndex, Series, Timestamp, Timedelta, date_range, TimedeltaIndex, _np_version_under1p10, Index, datetime, Float64Index, offsets, bdate_range) @@ -1208,7 +1207,7 @@ def test_identical(self): class TestCustomDatetimeIndex(object): def setup_method(self, method): - self.rng = cdate_range(START, END) + self.rng = bdate_range(START, END, freq='C') def test_comparison(self): d = self.rng[10] @@ -1277,10 +1276,11 @@ def test_summary(self): 
self.rng[2:2].summary() def test_summary_pytz(self): - cdate_range('1/1/2005', '1/1/2009', tz=pytz.utc).summary() + bdate_range('1/1/2005', '1/1/2009', freq='C', tz=pytz.utc).summary() def test_summary_dateutil(self): - cdate_range('1/1/2005', '1/1/2009', tz=dateutil.tz.tzutc()).summary() + bdate_range('1/1/2005', '1/1/2009', freq='C', + tz=dateutil.tz.tzutc()).summary() def test_equals(self): assert not self.rng.equals(list(self.rng)) diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 4ffd2e1cd1e615..ff436e0501849f 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -4,7 +4,6 @@ import pandas as pd import pandas.util.testing as tm -from pandas.core.indexes.datetimes import cdate_range from pandas import (DatetimeIndex, date_range, Series, bdate_range, DataFrame, Int64Index, Index, to_datetime) from pandas.tseries.offsets import Minute, BMonthEnd, MonthEnd @@ -345,7 +344,7 @@ def test_month_range_union_tz_dateutil(self): class TestCustomDatetimeIndex(object): def setup_method(self, method): - self.rng = cdate_range(START, END) + self.rng = bdate_range(START, END, freq='C') def test_union(self): # overlapping @@ -412,7 +411,7 @@ def test_outer_join(self): def test_intersection_bug(self): # GH #771 - a = cdate_range('11/30/2011', '12/31/2011') - b = cdate_range('12/10/2011', '12/20/2011') + a = bdate_range('11/30/2011', '12/31/2011', freq='C') + b = bdate_range('12/10/2011', '12/20/2011', freq='C') result = a.intersection(b) tm.assert_index_equal(result, b) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index ea37434e3a8d98..3a2a613986dcae 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -2987,6 +2987,7 @@ def generate_range(start=None, end=None, periods=None, CustomBusinessHour, # 'CBH' MonthEnd, # 'M' MonthBegin, # 'MS' + Nano, # 'N' SemiMonthEnd, # 'SM' SemiMonthBegin, # 'SMS' Week, # 'W' @@ -3002,5 +3003,3 @@ def generate_range(start=None, end=None, periods=None, FY5253, FY5253Quarter, ]) - -prefix_mapping['N'] = Nano From a6078728ecf95db2b6e3b31830e30673dee3200e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 2 Oct 2017 07:59:15 -0400 Subject: [PATCH 186/188] BUG: Regression in .loc accepting a boolean Index as an indexer (#17738) closes #17131 --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/common.py | 4 ++-- pandas/tests/indexing/test_loc.py | 17 +++++++++++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index d69a5c22acc035..f17e5b5e8fa488 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -668,6 +668,7 @@ Indexing - Bug in ``IntervalIndex`` where performing a scalar lookup fails for included right endpoints of non-overlapping monotonic decreasing indexes (:issue:`16417`, :issue:`17271`) - Bug in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` when no valid entry (:issue:`17400`) - Bug in :func:`Series.rename` when called with a `callable`, incorrectly alters the name of the `Series`, rather than the name of the `Index`. 
(:issue:`17407`) +- Regression in ``.loc`` accepting a boolean ``Index`` as an indexer (:issue:`17131`) I/O ^^^ diff --git a/pandas/core/common.py b/pandas/core/common.py index 0f7b86f5e74a09..2686ad370e1ed2 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -15,7 +15,7 @@ from pandas import compat from pandas.compat import long, zip, iteritems from pandas.core.config import get_option -from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.generic import ABCSeries, ABCIndex from pandas.core.dtypes.common import _NS_DTYPE from pandas.core.dtypes.inference import _iterable_not_string from pandas.core.dtypes.missing import isna, isnull, notnull # noqa @@ -182,7 +182,7 @@ def _maybe_box_datetimelike(value): def is_bool_indexer(key): - if isinstance(key, (ABCSeries, np.ndarray)): + if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)): if key.dtype == np.object_: key = np.asarray(_values_from_object(key)) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 17316a714e2609..95d6a24e68425c 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -317,6 +317,23 @@ def test_loc_getitem_label_slice(self): self.check_result('mixed slice', 'loc', slice(2, 4, 2), 'ix', slice( 2, 4, 2), typs=['mixed'], axes=0, fails=TypeError) + def test_loc_index(self): + # gh-17131 + # a boolean index should index like a boolean numpy array + + df = DataFrame( + np.random.random(size=(5, 10)), + index=["alpha_0", "alpha_1", "alpha_2", "beta_0", "beta_1"]) + + mask = df.index.map(lambda x: "alpha" in x) + expected = df.loc[np.array(mask)] + + result = df.loc[mask] + tm.assert_frame_equal(result, expected) + + result = df.loc[mask.values] + tm.assert_frame_equal(result, expected) + def test_loc_general(self): df = DataFrame( From 1a6b7ab8ecb0270227066ec7cca8a6bbcd9ddbc3 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 2 Oct 2017 08:32:44 -0400 Subject: [PATCH 187/188] DOC: remove whatsnew note for xref #17131 --- doc/source/whatsnew/v0.21.0.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index f17e5b5e8fa488..d69a5c22acc035 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -668,7 +668,6 @@ Indexing - Bug in ``IntervalIndex`` where performing a scalar lookup fails for included right endpoints of non-overlapping monotonic decreasing indexes (:issue:`16417`, :issue:`17271`) - Bug in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` when no valid entry (:issue:`17400`) - Bug in :func:`Series.rename` when called with a `callable`, incorrectly alters the name of the `Series`, rather than the name of the `Index`. 
(:issue:`17407`) -- Regression in ``.loc`` accepting a boolean ``Index`` as an indexer (:issue:`17131`) I/O ^^^ From a3d538ab72380471f5de7b8e4a3f811aa4de84af Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 2 Oct 2017 05:43:39 -0700 Subject: [PATCH 188/188] Separate out _convert_datetime_to_tsobject (#17715) --- pandas/_libs/tslib.pyx | 145 +++++++++++++++++++++++++++-------------- 1 file changed, 95 insertions(+), 50 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 096ebe9a5627b0..ff20ea287bd9d1 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -708,7 +708,7 @@ class Timestamp(_Timestamp): # reconstruct & check bounds ts_input = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tzinfo=_tzinfo) - ts = convert_to_tsobject(ts_input, _tzinfo, None, 0, 0) + ts = convert_datetime_to_tsobject(ts_input, _tzinfo) value = ts.value + (dts.ps // 1000) if value != NPY_NAT: _check_dts_bounds(&dts) @@ -1455,52 +1455,11 @@ cdef convert_to_tsobject(object ts, object tz, object unit, obj.value = ts pandas_datetime_to_datetimestruct(ts, PANDAS_FR_ns, &obj.dts) elif PyDateTime_Check(ts): - if tz is not None: - # sort of a temporary hack - if ts.tzinfo is not None: - if (hasattr(tz, 'normalize') and - hasattr(ts.tzinfo, '_utcoffset')): - ts = tz.normalize(ts) - obj.value = _pydatetime_to_dts(ts, &obj.dts) - obj.tzinfo = ts.tzinfo - else: #tzoffset - try: - tz = ts.astimezone(tz).tzinfo - except: - pass - obj.value = _pydatetime_to_dts(ts, &obj.dts) - ts_offset = get_utcoffset(ts.tzinfo, ts) - obj.value -= _delta_to_nanoseconds(ts_offset) - tz_offset = get_utcoffset(tz, ts) - obj.value += _delta_to_nanoseconds(tz_offset) - pandas_datetime_to_datetimestruct(obj.value, - PANDAS_FR_ns, &obj.dts) - obj.tzinfo = tz - elif not is_utc(tz): - ts = _localize_pydatetime(ts, tz) - obj.value = _pydatetime_to_dts(ts, &obj.dts) - obj.tzinfo = ts.tzinfo - else: - # UTC - obj.value = _pydatetime_to_dts(ts, &obj.dts) - obj.tzinfo = pytz.utc - else: - obj.value = _pydatetime_to_dts(ts, &obj.dts) - obj.tzinfo = ts.tzinfo - - if obj.tzinfo is not None and not is_utc(obj.tzinfo): - offset = get_utcoffset(obj.tzinfo, ts) - obj.value -= _delta_to_nanoseconds(offset) - - if is_timestamp(ts): - obj.value += ts.nanosecond - obj.dts.ps = ts.nanosecond * 1000 - _check_dts_bounds(&obj.dts) - return obj + return convert_datetime_to_tsobject(ts, tz) elif PyDate_Check(ts): # Keep the converter same as PyDateTime's ts = datetime.combine(ts, datetime_time()) - return convert_to_tsobject(ts, tz, None, 0, 0) + return convert_datetime_to_tsobject(ts, tz) elif getattr(ts, '_typ', None) == 'period': raise ValueError( "Cannot convert Period to Timestamp " @@ -1518,6 +1477,83 @@ cdef convert_to_tsobject(object ts, object tz, object unit, return obj +cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, + int32_t nanos=0): + """ + Convert a datetime (or Timestamp) input `ts`, along with optional timezone + object `tz` to a _TSObject. + + The optional argument `nanos` allows for cases where datetime input + needs to be supplemented with higher-precision information. 
+ + Parameters + ---------- + ts : datetime or Timestamp + Value to be converted to _TSObject + tz : tzinfo or None + timezone for the timezone-aware output + nanos : int32_t, default is 0 + nanoseconds supplement the precision of the datetime input ts + + Returns + ------- + obj : _TSObject + """ + cdef: + _TSObject obj = _TSObject() + + if tz is not None: + tz = maybe_get_tz(tz) + + # sort of a temporary hack + if ts.tzinfo is not None: + if (hasattr(tz, 'normalize') and + hasattr(ts.tzinfo, '_utcoffset')): + ts = tz.normalize(ts) + obj.value = _pydatetime_to_dts(ts, &obj.dts) + obj.tzinfo = ts.tzinfo + else: + # tzoffset + try: + tz = ts.astimezone(tz).tzinfo + except: + pass + obj.value = _pydatetime_to_dts(ts, &obj.dts) + ts_offset = get_utcoffset(ts.tzinfo, ts) + obj.value -= int(ts_offset.total_seconds() * 1e9) + tz_offset = get_utcoffset(tz, ts) + obj.value += int(tz_offset.total_seconds() * 1e9) + pandas_datetime_to_datetimestruct(obj.value, + PANDAS_FR_ns, &obj.dts) + obj.tzinfo = tz + elif not is_utc(tz): + ts = _localize_pydatetime(ts, tz) + obj.value = _pydatetime_to_dts(ts, &obj.dts) + obj.tzinfo = ts.tzinfo + else: + # UTC + obj.value = _pydatetime_to_dts(ts, &obj.dts) + obj.tzinfo = pytz.utc + else: + obj.value = _pydatetime_to_dts(ts, &obj.dts) + obj.tzinfo = ts.tzinfo + + if obj.tzinfo is not None and not is_utc(obj.tzinfo): + offset = get_utcoffset(obj.tzinfo, ts) + obj.value -= int(offset.total_seconds() * 1e9) + + if is_timestamp(ts): + obj.value += ts.nanosecond + obj.dts.ps = ts.nanosecond * 1000 + + if nanos: + obj.value += nanos + obj.dts.ps = nanos * 1000 + + _check_dts_bounds(&obj.dts) + return obj + + cpdef convert_str_to_tsobject(object ts, object tz, object unit, dayfirst=False, yearfirst=False): """ ts must be a string """ @@ -1538,11 +1574,12 @@ cpdef convert_str_to_tsobject(object ts, object tz, object unit, elif ts == 'now': # Issue 9000, we short-circuit rather than going # into np_datetime_strings which returns utc - ts = Timestamp.now(tz) + ts = datetime.now(tz) elif ts == 'today': # Issue 9000, we short-circuit rather than going # into np_datetime_strings which returns a normalized datetime - ts = Timestamp.today(tz) + ts = datetime.now(tz) + # equiv: datetime.today().replace(tzinfo=tz) else: try: _string_to_dts(ts, &obj.dts, &out_local, &out_tzoffset) @@ -1557,7 +1594,15 @@ cpdef convert_str_to_tsobject(object ts, object tz, object unit, return obj else: # Keep the converter same as PyDateTime's - ts = Timestamp(obj.value, tz=obj.tzinfo) + obj = convert_to_tsobject(obj.value, obj.tzinfo, + None, 0, 0) + dtime = datetime(obj.dts.year, obj.dts.month, obj.dts.day, + obj.dts.hour, obj.dts.min, obj.dts.sec, + obj.dts.us, obj.tzinfo) + obj = convert_datetime_to_tsobject(dtime, tz, + nanos=obj.dts.ps / 1000) + return obj + else: ts = obj.value if tz is not None: @@ -1706,7 +1751,7 @@ def datetime_to_datetime64(ndarray[object] values): else: inferred_tz = get_timezone(val.tzinfo) - _ts = convert_to_tsobject(val, None, None, 0, 0) + _ts = convert_datetime_to_tsobject(val, None) iresult[i] = _ts.value _check_dts_bounds(&_ts.dts) else: @@ -2026,7 +2071,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', seen_datetime=1 if val.tzinfo is not None: if utc_convert: - _ts = convert_to_tsobject(val, None, 'ns', 0, 0) + _ts = convert_datetime_to_tsobject(val, None) iresult[i] = _ts.value try: _check_dts_bounds(&_ts.dts) @@ -2135,7 +2180,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', raise TypeError("invalid string coercion to 
datetime") try: - _ts = convert_to_tsobject(py_dt, None, None, 0, 0) + _ts = convert_datetime_to_tsobject(py_dt, None) iresult[i] = _ts.value except ValueError: if is_coerce: