From 1bf7688d5bb4d70bf5e8b7e74abe055472c9adfe Mon Sep 17 00:00:00 2001 From: jschendel Date: Sun, 4 Feb 2018 08:54:14 -0700 Subject: [PATCH 01/20] TST: Remove duplicate TimedeltaIndex tests (#19509) --- .../tests/indexes/timedeltas/test_astype.py | 49 +++---------------- 1 file changed, 6 insertions(+), 43 deletions(-) diff --git a/pandas/tests/indexes/timedeltas/test_astype.py b/pandas/tests/indexes/timedeltas/test_astype.py index af16fe71edcf3..c3bd857036efc 100644 --- a/pandas/tests/indexes/timedeltas/test_astype.py +++ b/pandas/tests/indexes/timedeltas/test_astype.py @@ -2,36 +2,20 @@ import numpy as np -import pandas as pd import pandas.util.testing as tm from pandas import (TimedeltaIndex, timedelta_range, Int64Index, Float64Index, - Index, Timedelta) + Index, Timedelta, NaT) -from ..datetimelike import DatetimeLike - -class TestTimedeltaIndex(DatetimeLike): - _holder = TimedeltaIndex +class TestTimedeltaIndex(object): _multiprocess_can_split_ = True - def test_numeric_compat(self): - # Dummy method to override super's version; this test is now done - # in test_arithmetic.py - pass - - def setup_method(self, method): - self.indices = dict(index=tm.makeTimedeltaIndex(10)) - self.setup_indices() - - def create_index(self): - return pd.to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) - def test_astype(self): # GH 13149, GH 13209 - idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN]) + idx = TimedeltaIndex([1e14, 'NaT', NaT, np.NaN]) result = idx.astype(object) - expected = Index([Timedelta('1 days 03:46:40')] + [pd.NaT] * 3, + expected = Index([Timedelta('1 days 03:46:40')] + [NaT] * 3, dtype=object) tm.assert_index_equal(result, expected) @@ -51,7 +35,7 @@ def test_astype(self): def test_astype_timedelta64(self): # GH 13149, GH 13209 - idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN]) + idx = TimedeltaIndex([1e14, 'NaT', NaT, np.NaN]) result = idx.astype('timedelta64') expected = Float64Index([1e+14] + [np.NaN] * 3, dtype='float64') @@ -69,28 +53,7 @@ def test_astype_timedelta64(self): float, 'datetime64', 'datetime64[ns]']) def test_astype_raises(self, dtype): # GH 13149, GH 13209 - idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN]) + idx = TimedeltaIndex([1e14, 'NaT', NaT, np.NaN]) msg = 'Cannot cast TimedeltaIndex to dtype' with tm.assert_raises_regex(TypeError, msg): idx.astype(dtype) - - def test_pickle_compat_construction(self): - pass - - def test_shift(self): - # test shift for TimedeltaIndex - # err8083 - - drange = self.create_index() - result = drange.shift(1) - expected = TimedeltaIndex(['1 days 01:00:00', '2 days 01:00:00', - '3 days 01:00:00', - '4 days 01:00:00', '5 days 01:00:00'], - freq='D') - tm.assert_index_equal(result, expected) - - result = drange.shift(3, freq='2D 1s') - expected = TimedeltaIndex(['6 days 01:00:03', '7 days 01:00:03', - '8 days 01:00:03', '9 days 01:00:03', - '10 days 01:00:03'], freq='D') - tm.assert_index_equal(result, expected) From 58f2a4c99a358cd172aded6a59bba8fb6333e7a2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 4 Feb 2018 08:05:30 -0800 Subject: [PATCH 02/20] Frame specific parts of #19504 (#19512) --- pandas/tests/frame/test_timezones.py | 135 +++++++++++++++++++++++++ pandas/tests/tseries/test_timezones.py | 123 ++-------------------- 2 files changed, 144 insertions(+), 114 deletions(-) create mode 100644 pandas/tests/frame/test_timezones.py diff --git a/pandas/tests/frame/test_timezones.py b/pandas/tests/frame/test_timezones.py new file mode 100644 index 0000000000000..fa589a0aa4817 --- /dev/null +++ 
b/pandas/tests/frame/test_timezones.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- +""" +Tests for DataFrame timezone-related methods +""" +from datetime import datetime + +import pytest +import pytz +import numpy as np + +import pandas.util.testing as tm +from pandas.compat import lrange +from pandas.core.indexes.datetimes import date_range +from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas import Series, DataFrame + + +class TestDataFrameTimezones(object): + def test_frame_from_records_utc(self): + rec = {'datum': 1.5, + 'begin_time': datetime(2006, 4, 27, tzinfo=pytz.utc)} + + # it works + DataFrame.from_records([rec], index='begin_time') + + def test_frame_tz_localize(self): + rng = date_range('1/1/2011', periods=100, freq='H') + + df = DataFrame({'a': 1}, index=rng) + result = df.tz_localize('utc') + expected = DataFrame({'a': 1}, rng.tz_localize('UTC')) + assert result.index.tz.zone == 'UTC' + tm.assert_frame_equal(result, expected) + + df = df.T + result = df.tz_localize('utc', axis=1) + assert result.columns.tz.zone == 'UTC' + tm.assert_frame_equal(result, expected.T) + + def test_frame_tz_convert(self): + rng = date_range('1/1/2011', periods=200, freq='D', tz='US/Eastern') + + df = DataFrame({'a': 1}, index=rng) + result = df.tz_convert('Europe/Berlin') + expected = DataFrame({'a': 1}, rng.tz_convert('Europe/Berlin')) + assert result.index.tz.zone == 'Europe/Berlin' + tm.assert_frame_equal(result, expected) + + df = df.T + result = df.tz_convert('Europe/Berlin', axis=1) + assert result.columns.tz.zone == 'Europe/Berlin' + tm.assert_frame_equal(result, expected.T) + + def test_frame_join_tzaware(self): + test1 = DataFrame(np.zeros((6, 3)), + index=date_range("2012-11-15 00:00:00", periods=6, + freq="100L", tz="US/Central")) + test2 = DataFrame(np.zeros((3, 3)), + index=date_range("2012-11-15 00:00:00", periods=3, + freq="250L", tz="US/Central"), + columns=lrange(3, 6)) + + result = test1.join(test2, how='outer') + ex_index = test1.index.union(test2.index) + + tm.assert_index_equal(result.index, ex_index) + assert result.index.tz.zone == 'US/Central' + + def test_frame_add_tz_mismatch_converts_to_utc(self): + rng = date_range('1/1/2011', periods=10, freq='H', tz='US/Eastern') + df = DataFrame(np.random.randn(len(rng)), index=rng, columns=['a']) + + df_moscow = df.tz_convert('Europe/Moscow') + result = df + df_moscow + assert result.index.tz is pytz.utc + + result = df_moscow + df + assert result.index.tz is pytz.utc + + def test_frame_align_aware(self): + idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern') + idx2 = date_range('2001', periods=5, freq='2H', tz='US/Eastern') + df1 = DataFrame(np.random.randn(len(idx1), 3), idx1) + df2 = DataFrame(np.random.randn(len(idx2), 3), idx2) + new1, new2 = df1.align(df2) + assert df1.index.tz == new1.index.tz + assert df2.index.tz == new2.index.tz + + # different timezones convert to UTC + + # frame with frame + df1_central = df1.tz_convert('US/Central') + new1, new2 = df1.align(df1_central) + assert new1.index.tz == pytz.UTC + assert new2.index.tz == pytz.UTC + + # frame with Series + new1, new2 = df1.align(df1_central[0], axis=0) + assert new1.index.tz == pytz.UTC + assert new2.index.tz == pytz.UTC + + df1[0].align(df1_central, axis=0) + assert new1.index.tz == pytz.UTC + assert new2.index.tz == pytz.UTC + + @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern']) + def test_frame_no_datetime64_dtype(self, tz): + # after GH#7822 + # these retain the timezones on dict construction + dr = 
date_range('2011/1/1', '2012/1/1', freq='W-FRI') + dr_tz = dr.tz_localize(tz) + df = DataFrame({'A': 'foo', 'B': dr_tz}, index=dr) + tz_expected = DatetimeTZDtype('ns', dr_tz.tzinfo) + assert df['B'].dtype == tz_expected + + # GH#2810 (with timezones) + datetimes_naive = [ts.to_pydatetime() for ts in dr] + datetimes_with_tz = [ts.to_pydatetime() for ts in dr_tz] + df = DataFrame({'dr': dr, + 'dr_tz': dr_tz, + 'datetimes_naive': datetimes_naive, + 'datetimes_with_tz': datetimes_with_tz}) + result = df.get_dtype_counts().sort_index() + expected = Series({'datetime64[ns]': 2, + str(tz_expected): 2}).sort_index() + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern']) + def test_frame_reset_index(self, tz): + dr = date_range('2012-06-02', periods=10, tz=tz) + df = DataFrame(np.random.randn(len(dr)), dr) + roundtripped = df.reset_index().set_index('index') + xp = df.index.tz + rs = roundtripped.index.tz + assert xp == rs diff --git a/pandas/tests/tseries/test_timezones.py b/pandas/tests/tseries/test_timezones.py index cc5f4d30f9aaf..e47be69b79feb 100644 --- a/pandas/tests/tseries/test_timezones.py +++ b/pandas/tests/tseries/test_timezones.py @@ -16,13 +16,11 @@ import pandas.tseries.offsets as offsets from pandas.compat import lrange, zip from pandas.core.indexes.datetimes import bdate_range, date_range -from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas._libs import tslib from pandas._libs.tslibs import timezones, conversion -from pandas import (Index, Series, DataFrame, isna, Timestamp, NaT, +from pandas import (Index, Series, isna, Timestamp, NaT, DatetimeIndex, to_datetime) -from pandas.util.testing import (assert_frame_equal, assert_series_equal, - set_timezone) +from pandas.util.testing import assert_series_equal, set_timezone class FixedOffset(tzinfo): @@ -786,29 +784,6 @@ def test_to_datetime_tzlocal(self): result = to_datetime(arr, utc=True) assert result.tz is pytz.utc - def test_frame_no_datetime64_dtype(self): - - # after 7822 - # these retain the timezones on dict construction - - dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI') - dr_tz = dr.tz_localize(self.tzstr('US/Eastern')) - e = DataFrame({'A': 'foo', 'B': dr_tz}, index=dr) - tz_expected = DatetimeTZDtype('ns', dr_tz.tzinfo) - assert e['B'].dtype == tz_expected - - # GH 2810 (with timezones) - datetimes_naive = [ts.to_pydatetime() for ts in dr] - datetimes_with_tz = [ts.to_pydatetime() for ts in dr_tz] - df = DataFrame({'dr': dr, - 'dr_tz': dr_tz, - 'datetimes_naive': datetimes_naive, - 'datetimes_with_tz': datetimes_with_tz}) - result = df.get_dtype_counts().sort_index() - expected = Series({'datetime64[ns]': 2, - str(tz_expected): 2}).sort_index() - assert_series_equal(result, expected) - def test_hongkong_tz_convert(self): # #1673 dr = date_range('2012-01-01', '2012-01-10', freq='D', tz='Hongkong') @@ -872,21 +847,6 @@ def test_convert_datetime_list(self): assert dr.tz == dr2.tz assert dr2.name == 'foo' - def test_frame_from_records_utc(self): - rec = {'datum': 1.5, - 'begin_time': datetime(2006, 4, 27, tzinfo=pytz.utc)} - - # it works - DataFrame.from_records([rec], index='begin_time') - - def test_frame_reset_index(self): - dr = date_range('2012-06-02', periods=10, tz=self.tzstr('US/Eastern')) - df = DataFrame(np.random.randn(len(dr)), dr) - roundtripped = df.reset_index().set_index('index') - xp = df.index.tz - rs = roundtripped.index.tz - assert xp == rs - def test_dateutil_tzoffset_support(self): values = [188.5, 328.25] tzinfo = 
tzoffset(None, 7200) @@ -1289,7 +1249,7 @@ def test_tz_localize_roundtrip(self): tm.assert_index_equal(reset, idx) assert reset.tzinfo is None - def test_series_frame_tz_localize(self): + def test_series_tz_localize(self): rng = date_range('1/1/2011', periods=100, freq='H') ts = Series(1, index=rng) @@ -1297,41 +1257,19 @@ def test_series_frame_tz_localize(self): result = ts.tz_localize('utc') assert result.index.tz.zone == 'UTC' - df = DataFrame({'a': 1}, index=rng) - result = df.tz_localize('utc') - expected = DataFrame({'a': 1}, rng.tz_localize('UTC')) - assert result.index.tz.zone == 'UTC' - assert_frame_equal(result, expected) - - df = df.T - result = df.tz_localize('utc', axis=1) - assert result.columns.tz.zone == 'UTC' - assert_frame_equal(result, expected.T) - # Can't localize if already tz-aware rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') ts = Series(1, index=rng) tm.assert_raises_regex(TypeError, 'Already tz-aware', ts.tz_localize, 'US/Eastern') - def test_series_frame_tz_convert(self): + def test_series_tz_convert(self): rng = date_range('1/1/2011', periods=200, freq='D', tz='US/Eastern') ts = Series(1, index=rng) result = ts.tz_convert('Europe/Berlin') assert result.index.tz.zone == 'Europe/Berlin' - df = DataFrame({'a': 1}, index=rng) - result = df.tz_convert('Europe/Berlin') - expected = DataFrame({'a': 1}, rng.tz_convert('Europe/Berlin')) - assert result.index.tz.zone == 'Europe/Berlin' - assert_frame_equal(result, expected) - - df = df.T - result = df.tz_convert('Europe/Berlin', axis=1) - assert result.columns.tz.zone == 'Europe/Berlin' - assert_frame_equal(result, expected.T) - # can't convert tz-naive rng = date_range('1/1/2011', periods=200, freq='D') ts = Series(1, index=rng) @@ -1389,20 +1327,6 @@ def test_join_aware(self): pytest.raises(Exception, ts.__add__, ts_utc) pytest.raises(Exception, ts_utc.__add__, ts) - test1 = DataFrame(np.zeros((6, 3)), - index=date_range("2012-11-15 00:00:00", periods=6, - freq="100L", tz="US/Central")) - test2 = DataFrame(np.zeros((3, 3)), - index=date_range("2012-11-15 00:00:00", periods=3, - freq="250L", tz="US/Central"), - columns=lrange(3, 6)) - - result = test1.join(test2, how='outer') - ex_index = test1.index.union(test2.index) - - tm.assert_index_equal(result.index, ex_index) - assert result.index.tz.zone == 'US/Central' - # non-overlapping rng = date_range("2012-11-15 00:00:00", periods=6, freq="H", tz="US/Central") @@ -1413,34 +1337,13 @@ def test_join_aware(self): result = rng.union(rng2) assert result.tz.zone == 'UTC' - def test_align_aware(self): + def test_series_align_aware(self): idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern') - idx2 = date_range('2001', periods=5, freq='2H', tz='US/Eastern') - df1 = DataFrame(np.random.randn(len(idx1), 3), idx1) - df2 = DataFrame(np.random.randn(len(idx2), 3), idx2) - new1, new2 = df1.align(df2) - assert df1.index.tz == new1.index.tz - assert df2.index.tz == new2.index.tz - + ser = Series(np.random.randn(len(idx1)), index=idx1) + ser_central = ser.tz_convert('US/Central') # # different timezones convert to UTC - # frame - df1_central = df1.tz_convert('US/Central') - new1, new2 = df1.align(df1_central) - assert new1.index.tz == pytz.UTC - assert new2.index.tz == pytz.UTC - - # series - new1, new2 = df1[0].align(df1_central[0]) - assert new1.index.tz == pytz.UTC - assert new2.index.tz == pytz.UTC - - # combination - new1, new2 = df1.align(df1_central[0], axis=0) - assert new1.index.tz == pytz.UTC - assert new2.index.tz == pytz.UTC - - 
df1[0].align(df1_central, axis=0) + new1, new2 = ser.align(ser_central) assert new1.index.tz == pytz.UTC assert new2.index.tz == pytz.UTC @@ -1523,7 +1426,7 @@ def test_append_aware_naive(self): assert ts_result.index.equals(ts1.index.astype(object).append( ts2.index)) - def test_equal_join_ensure_utc(self): + def test_series_add_tz_mismatch_converts_to_utc(self): rng = date_range('1/1/2011', periods=10, freq='H', tz='US/Eastern') ts = Series(np.random.randn(len(rng)), index=rng) @@ -1535,14 +1438,6 @@ def test_equal_join_ensure_utc(self): result = ts_moscow + ts assert result.index.tz is pytz.utc - df = DataFrame({'a': ts}) - df_moscow = df.tz_convert('Europe/Moscow') - result = df + df_moscow - assert result.index.tz is pytz.utc - - result = df_moscow + df - assert result.index.tz is pytz.utc - def test_arith_utc_convert(self): rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') From a44f1c1b1d5944946c3fa6b15c7f962e015f2444 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 4 Feb 2018 08:06:51 -0800 Subject: [PATCH 03/20] split Timestamp tests off of 19504 (#19511) --- .../tests/scalar/timestamp/test_timezones.py | 189 +++++++++++++++++ pandas/tests/tseries/test_timezones.py | 195 +----------------- 2 files changed, 190 insertions(+), 194 deletions(-) diff --git a/pandas/tests/scalar/timestamp/test_timezones.py b/pandas/tests/scalar/timestamp/test_timezones.py index eeec70cc234f5..7a5c6feb8b651 100644 --- a/pandas/tests/scalar/timestamp/test_timezones.py +++ b/pandas/tests/scalar/timestamp/test_timezones.py @@ -2,11 +2,18 @@ """ Tests for Timestamp timezone-related methods """ +from datetime import date, timedelta +from distutils.version import LooseVersion import pytest +import pytz from pytz.exceptions import AmbiguousTimeError, NonExistentTimeError +import dateutil +from dateutil.tz import gettz, tzoffset import pandas.util.testing as tm +import pandas.util._test_decorators as td + from pandas import Timestamp, NaT @@ -14,6 +21,22 @@ class TestTimestampTZOperations(object): # -------------------------------------------------------------- # Timestamp.tz_localize + def test_tz_localize_ambiguous_bool(self): + # make sure that we are correctly accepting bool values as ambiguous + # GH#14402 + ts = Timestamp('2015-11-01 01:00:03') + expected0 = Timestamp('2015-11-01 01:00:03-0500', tz='US/Central') + expected1 = Timestamp('2015-11-01 01:00:03-0600', tz='US/Central') + + with pytest.raises(pytz.AmbiguousTimeError): + ts.tz_localize('US/Central') + + result = ts.tz_localize('US/Central', ambiguous=True) + assert result == expected0 + + result = ts.tz_localize('US/Central', ambiguous=False) + assert result == expected1 + def test_tz_localize_ambiguous(self): ts = Timestamp('2014-11-02 01:00') ts_dst = ts.tz_localize('US/Eastern', ambiguous=True) @@ -70,6 +93,55 @@ def test_tz_localize_roundtrip(self, stamp, tz): assert reset == ts assert reset.tzinfo is None + def test_tz_localize_ambiguous_compat(self): + # validate that pytz and dateutil are compat for dst + # when the transition happens + naive = Timestamp('2013-10-27 01:00:00') + + pytz_zone = 'Europe/London' + dateutil_zone = 'dateutil/Europe/London' + result_pytz = naive.tz_localize(pytz_zone, ambiguous=0) + result_dateutil = naive.tz_localize(dateutil_zone, ambiguous=0) + assert result_pytz.value == result_dateutil.value + assert result_pytz.value == 1382835600000000000 + + if LooseVersion(dateutil.__version__) < LooseVersion('2.6.0'): + # dateutil 2.6 buggy w.r.t. 
ambiguous=0 + # see gh-14621 + # see https://github.com/dateutil/dateutil/issues/321 + assert (result_pytz.to_pydatetime().tzname() == + result_dateutil.to_pydatetime().tzname()) + assert str(result_pytz) == str(result_dateutil) + elif LooseVersion(dateutil.__version__) > LooseVersion('2.6.0'): + # fixed ambiguous behavior + assert result_pytz.to_pydatetime().tzname() == 'GMT' + assert result_dateutil.to_pydatetime().tzname() == 'BST' + assert str(result_pytz) != str(result_dateutil) + + # 1 hour difference + result_pytz = naive.tz_localize(pytz_zone, ambiguous=1) + result_dateutil = naive.tz_localize(dateutil_zone, ambiguous=1) + assert result_pytz.value == result_dateutil.value + assert result_pytz.value == 1382832000000000000 + + # dateutil < 2.6 is buggy w.r.t. ambiguous timezones + if LooseVersion(dateutil.__version__) > LooseVersion('2.5.3'): + # see gh-14621 + assert str(result_pytz) == str(result_dateutil) + assert (result_pytz.to_pydatetime().tzname() == + result_dateutil.to_pydatetime().tzname()) + + @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), + gettz('US/Eastern'), + 'US/Eastern', 'dateutil/US/Eastern']) + def test_timestamp_tz_localize(self, tz): + stamp = Timestamp('3/11/2012 04:00') + + result = stamp.tz_localize(tz) + expected = Timestamp('3/11/2012 04:00', tz=tz) + assert result.hour == expected.hour + assert result == expected + # ------------------------------------------------------------------ # Timestamp.tz_convert @@ -85,3 +157,120 @@ def test_tz_convert_roundtrip(self, stamp, tz): assert reset == Timestamp(stamp) assert reset.tzinfo is None assert reset == converted.tz_convert('UTC').tz_localize(None) + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_astimezone(self, tzstr): + # astimezone is an alias for tz_convert, so keep it with + # the tz_convert tests + utcdate = Timestamp('3/11/2012 22:00', tz='UTC') + expected = utcdate.tz_convert(tzstr) + result = utcdate.astimezone(tzstr) + assert expected == result + assert isinstance(result, Timestamp) + + @td.skip_if_windows + def test_tz_convert_utc_with_system_utc(self): + from pandas._libs.tslibs.timezones import maybe_get_tz + + # from system utc to real utc + ts = Timestamp('2001-01-05 11:56', tz=maybe_get_tz('dateutil/UTC')) + # check that the time hasn't changed. + assert ts == ts.tz_convert(dateutil.tz.tzutc()) + + # from system utc to real utc + ts = Timestamp('2001-01-05 11:56', tz=maybe_get_tz('dateutil/UTC')) + # check that the time hasn't changed. 
+ assert ts == ts.tz_convert(dateutil.tz.tzutc()) + + # ------------------------------------------------------------------ + # Timestamp.__init__ with tz str or tzinfo + + def test_timestamp_constructor_tz_utc(self): + utc_stamp = Timestamp('3/11/2012 05:00', tz='utc') + assert utc_stamp.tzinfo is pytz.utc + assert utc_stamp.hour == 5 + + utc_stamp = Timestamp('3/11/2012 05:00').tz_localize('utc') + assert utc_stamp.hour == 5 + + def test_timestamp_to_datetime_tzoffset(self): + tzinfo = tzoffset(None, 7200) + expected = Timestamp('3/11/2012 04:00', tz=tzinfo) + result = Timestamp(expected.to_pydatetime()) + assert expected == result + + def test_timestamp_constructor_near_dst_boundary(self): + # GH#11481 & GH#15777 + # Naive string timestamps were being localized incorrectly + # with tz_convert_single instead of tz_localize_to_utc + + for tz in ['Europe/Brussels', 'Europe/Prague']: + result = Timestamp('2015-10-25 01:00', tz=tz) + expected = Timestamp('2015-10-25 01:00').tz_localize(tz) + assert result == expected + + with pytest.raises(pytz.AmbiguousTimeError): + Timestamp('2015-10-25 02:00', tz=tz) + + result = Timestamp('2017-03-26 01:00', tz='Europe/Paris') + expected = Timestamp('2017-03-26 01:00').tz_localize('Europe/Paris') + assert result == expected + + with pytest.raises(pytz.NonExistentTimeError): + Timestamp('2017-03-26 02:00', tz='Europe/Paris') + + # GH#11708 + naive = Timestamp('2015-11-18 10:00:00') + result = naive.tz_localize('UTC').tz_convert('Asia/Kolkata') + expected = Timestamp('2015-11-18 15:30:00+0530', tz='Asia/Kolkata') + assert result == expected + + # GH#15823 + result = Timestamp('2017-03-26 00:00', tz='Europe/Paris') + expected = Timestamp('2017-03-26 00:00:00+0100', tz='Europe/Paris') + assert result == expected + + result = Timestamp('2017-03-26 01:00', tz='Europe/Paris') + expected = Timestamp('2017-03-26 01:00:00+0100', tz='Europe/Paris') + assert result == expected + + with pytest.raises(pytz.NonExistentTimeError): + Timestamp('2017-03-26 02:00', tz='Europe/Paris') + + result = Timestamp('2017-03-26 02:00:00+0100', tz='Europe/Paris') + naive = Timestamp(result.value) + expected = naive.tz_localize('UTC').tz_convert('Europe/Paris') + assert result == expected + + result = Timestamp('2017-03-26 03:00', tz='Europe/Paris') + expected = Timestamp('2017-03-26 03:00:00+0200', tz='Europe/Paris') + assert result == expected + + @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), + gettz('US/Eastern'), + 'US/Eastern', 'dateutil/US/Eastern']) + def test_timestamp_constructed_by_date_and_tz(self, tz): + # GH#2993, Timestamp cannot be constructed by datetime.date + # and tz correctly + + result = Timestamp(date(2012, 3, 11), tz=tz) + + expected = Timestamp('3/11/2012', tz=tz) + assert result.hour == expected.hour + assert result == expected + + @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), + gettz('US/Eastern'), + 'US/Eastern', 'dateutil/US/Eastern']) + def test_timestamp_add_timedelta_push_over_dst_boundary(self, tz): + # GH#1389 + + # 4 hours before DST transition + stamp = Timestamp('3/10/2012 22:00', tz=tz) + + result = stamp + timedelta(hours=6) + + # spring forward, + "7" hours + expected = Timestamp('3/11/2012 05:00', tz=tz) + + assert result == expected diff --git a/pandas/tests/tseries/test_timezones.py b/pandas/tests/tseries/test_timezones.py index e47be69b79feb..2630984a70807 100644 --- a/pandas/tests/tseries/test_timezones.py +++ b/pandas/tests/tseries/test_timezones.py @@ -9,7 +9,7 @@ from pytz import NonExistentTimeError from 
distutils.version import LooseVersion from dateutil.tz import tzlocal, tzoffset -from datetime import datetime, timedelta, tzinfo, date +from datetime import datetime, timedelta, tzinfo import pandas.util.testing as tm import pandas.util._test_decorators as td @@ -119,120 +119,6 @@ def test_localize_utc_conversion_explicit(self): pytest.raises(NonExistentTimeError, rng.tz_localize, self.tz('US/Eastern')) - def test_timestamp_tz_localize(self): - stamp = Timestamp('3/11/2012 04:00') - - result = stamp.tz_localize(self.tzstr('US/Eastern')) - expected = Timestamp('3/11/2012 04:00', tz=self.tzstr('US/Eastern')) - assert result.hour == expected.hour - assert result == expected - - def test_timestamp_tz_localize_explicit(self): - stamp = Timestamp('3/11/2012 04:00') - - result = stamp.tz_localize(self.tz('US/Eastern')) - expected = Timestamp('3/11/2012 04:00', tz=self.tz('US/Eastern')) - assert result.hour == expected.hour - assert result == expected - - def test_timestamp_constructed_by_date_and_tz(self): - # Fix Issue 2993, Timestamp cannot be constructed by datetime.date - # and tz correctly - - result = Timestamp(date(2012, 3, 11), tz=self.tzstr('US/Eastern')) - - expected = Timestamp('3/11/2012', tz=self.tzstr('US/Eastern')) - assert result.hour == expected.hour - assert result == expected - - def test_timestamp_constructed_by_date_and_tz_explicit(self): - # Fix Issue 2993, Timestamp cannot be constructed by datetime.date - # and tz correctly - - result = Timestamp(date(2012, 3, 11), tz=self.tz('US/Eastern')) - - expected = Timestamp('3/11/2012', tz=self.tz('US/Eastern')) - assert result.hour == expected.hour - assert result == expected - - def test_timestamp_constructor_near_dst_boundary(self): - # GH 11481 & 15777 - # Naive string timestamps were being localized incorrectly - # with tz_convert_single instead of tz_localize_to_utc - - for tz in ['Europe/Brussels', 'Europe/Prague']: - result = Timestamp('2015-10-25 01:00', tz=tz) - expected = Timestamp('2015-10-25 01:00').tz_localize(tz) - assert result == expected - - with pytest.raises(pytz.AmbiguousTimeError): - Timestamp('2015-10-25 02:00', tz=tz) - - result = Timestamp('2017-03-26 01:00', tz='Europe/Paris') - expected = Timestamp('2017-03-26 01:00').tz_localize('Europe/Paris') - assert result == expected - - with pytest.raises(pytz.NonExistentTimeError): - Timestamp('2017-03-26 02:00', tz='Europe/Paris') - - # GH 11708 - result = to_datetime("2015-11-18 15:30:00+05:30").tz_localize( - 'UTC').tz_convert('Asia/Kolkata') - expected = Timestamp('2015-11-18 15:30:00+0530', tz='Asia/Kolkata') - assert result == expected - - # GH 15823 - result = Timestamp('2017-03-26 00:00', tz='Europe/Paris') - expected = Timestamp('2017-03-26 00:00:00+0100', tz='Europe/Paris') - assert result == expected - - result = Timestamp('2017-03-26 01:00', tz='Europe/Paris') - expected = Timestamp('2017-03-26 01:00:00+0100', tz='Europe/Paris') - assert result == expected - - with pytest.raises(pytz.NonExistentTimeError): - Timestamp('2017-03-26 02:00', tz='Europe/Paris') - result = Timestamp('2017-03-26 02:00:00+0100', tz='Europe/Paris') - expected = Timestamp(result.value).tz_localize( - 'UTC').tz_convert('Europe/Paris') - assert result == expected - - result = Timestamp('2017-03-26 03:00', tz='Europe/Paris') - expected = Timestamp('2017-03-26 03:00:00+0200', tz='Europe/Paris') - assert result == expected - - def test_timestamp_to_datetime_tzoffset(self): - tzinfo = tzoffset(None, 7200) - expected = Timestamp('3/11/2012 04:00', tz=tzinfo) - result = 
Timestamp(expected.to_pydatetime()) - assert expected == result - - def test_timedelta_push_over_dst_boundary(self): - # #1389 - - # 4 hours before DST transition - stamp = Timestamp('3/10/2012 22:00', tz=self.tzstr('US/Eastern')) - - result = stamp + timedelta(hours=6) - - # spring forward, + "7" hours - expected = Timestamp('3/11/2012 05:00', tz=self.tzstr('US/Eastern')) - - assert result == expected - - def test_timedelta_push_over_dst_boundary_explicit(self): - # #1389 - - # 4 hours before DST transition - stamp = Timestamp('3/10/2012 22:00', tz=self.tz('US/Eastern')) - - result = stamp + timedelta(hours=6) - - # spring forward, + "7" hours - expected = Timestamp('3/11/2012 05:00', tz=self.tz('US/Eastern')) - - assert result == expected - def test_tz_localize_dti(self): dti = DatetimeIndex(start='1/1/2005', end='1/1/2005 0:00:30.256', freq='L') @@ -267,13 +153,6 @@ def test_tz_localize_empty_series(self): ts2 = ts.tz_localize(self.tzstr('US/Eastern')) assert self.cmptz(ts2.index.tz, self.tz('US/Eastern')) - def test_astimezone(self): - utc = Timestamp('3/11/2012 22:00', tz='UTC') - expected = utc.tz_convert(self.tzstr('US/Eastern')) - result = utc.astimezone(self.tzstr('US/Eastern')) - assert expected == result - assert isinstance(result, Timestamp) - def test_create_with_tz(self): stamp = Timestamp('3/11/2012 05:00', tz=self.tzstr('US/Eastern')) assert stamp.hour == 5 @@ -283,13 +162,6 @@ def test_create_with_tz(self): assert stamp == rng[1] - utc_stamp = Timestamp('3/11/2012 05:00', tz='utc') - assert utc_stamp.tzinfo is pytz.utc - assert utc_stamp.hour == 5 - - utc_stamp = Timestamp('3/11/2012 05:00').tz_localize('utc') - assert utc_stamp.hour == 5 - def test_create_with_fixed_tz(self): off = FixedOffset(420, '+07:00') start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off) @@ -591,16 +463,6 @@ def test_ambiguous_bool(self): expected0 = Timestamp('2015-11-01 01:00:03-0500', tz='US/Central') expected1 = Timestamp('2015-11-01 01:00:03-0600', tz='US/Central') - def f(): - t.tz_localize('US/Central') - pytest.raises(pytz.AmbiguousTimeError, f) - - result = t.tz_localize('US/Central', ambiguous=True) - assert result == expected0 - - result = t.tz_localize('US/Central', ambiguous=False) - assert result == expected1 - s = Series([t]) expected0 = Series([expected0]) expected1 = Series([expected1]) @@ -948,20 +810,6 @@ def normalize(self, ts): # no-op for dateutil return ts - @td.skip_if_windows - def test_utc_with_system_utc(self): - from pandas._libs.tslibs.timezones import maybe_get_tz - - # from system utc to real utc - ts = Timestamp('2001-01-05 11:56', tz=maybe_get_tz('dateutil/UTC')) - # check that the time hasn't changed. - assert ts == ts.tz_convert(dateutil.tz.tzutc()) - - # from system utc to real utc - ts = Timestamp('2001-01-05 11:56', tz=maybe_get_tz('dateutil/UTC')) - # check that the time hasn't changed. 
- assert ts == ts.tz_convert(dateutil.tz.tzutc()) - def test_tz_convert_hour_overflow_dst(self): # Regression test for: # https://github.com/pandas-dev/pandas/issues/13306 @@ -1175,47 +1023,6 @@ def test_cache_keys_are_distinct_for_pytz_vs_dateutil(self, tz_name): class TestTimeZones(object): timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific'] - def test_ambiguous_compat(self): - # validate that pytz and dateutil are compat for dst - # when the transition happens - - pytz_zone = 'Europe/London' - dateutil_zone = 'dateutil/Europe/London' - result_pytz = (Timestamp('2013-10-27 01:00:00') - .tz_localize(pytz_zone, ambiguous=0)) - result_dateutil = (Timestamp('2013-10-27 01:00:00') - .tz_localize(dateutil_zone, ambiguous=0)) - assert result_pytz.value == result_dateutil.value - assert result_pytz.value == 1382835600000000000 - - if LooseVersion(dateutil.__version__) < LooseVersion('2.6.0'): - # dateutil 2.6 buggy w.r.t. ambiguous=0 - # see gh-14621 - # see https://github.com/dateutil/dateutil/issues/321 - assert (result_pytz.to_pydatetime().tzname() == - result_dateutil.to_pydatetime().tzname()) - assert str(result_pytz) == str(result_dateutil) - elif LooseVersion(dateutil.__version__) > LooseVersion('2.6.0'): - # fixed ambiguous behavior - assert result_pytz.to_pydatetime().tzname() == 'GMT' - assert result_dateutil.to_pydatetime().tzname() == 'BST' - assert str(result_pytz) != str(result_dateutil) - - # 1 hour difference - result_pytz = (Timestamp('2013-10-27 01:00:00') - .tz_localize(pytz_zone, ambiguous=1)) - result_dateutil = (Timestamp('2013-10-27 01:00:00') - .tz_localize(dateutil_zone, ambiguous=1)) - assert result_pytz.value == result_dateutil.value - assert result_pytz.value == 1382832000000000000 - - # dateutil < 2.6 is buggy w.r.t. ambiguous timezones - if LooseVersion(dateutil.__version__) > LooseVersion('2.5.3'): - # see gh-14621 - assert str(result_pytz) == str(result_dateutil) - assert (result_pytz.to_pydatetime().tzname() == - result_dateutil.to_pydatetime().tzname()) - def test_index_equals_with_tz(self): left = date_range('1/1/2011', periods=100, freq='H', tz='utc') right = date_range('1/1/2011', periods=100, freq='H', tz='US/Eastern') From 56dbaaef4fdd61974c447c124df2331acbbc7d27 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 4 Feb 2018 08:18:10 -0800 Subject: [PATCH 04/20] ops cleanup, named functions instead of lambdas (#19515) --- pandas/core/ops.py | 92 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 64 insertions(+), 28 deletions(-) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 6ea4a81cb52a1..6db84aedce7e7 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -42,6 +42,60 @@ ABCSparseSeries, ABCSparseArray) +# ----------------------------------------------------------------------------- +# Reversed Operations not available in the stdlib operator module. +# Defining these instead of using lambdas allows us to reference them by name. 
+ +def radd(left, right): + return right + left + + +def rsub(left, right): + return right - left + + +def rmul(left, right): + return right * left + + +def rdiv(left, right): + return right / left + + +def rtruediv(left, right): + return right / left + + +def rfloordiv(left, right): + return right // left + + +def rmod(left, right): + return right % left + + +def rdivmod(left, right): + return divmod(right, left) + + +def rpow(left, right): + return right ** left + + +def rand_(left, right): + return operator.and_(right, left) + + +def ror_(left, right): + return operator.or_(right, left) + + +def rxor(left, right): + return operator.xor(right, left) + + +# ----------------------------------------------------------------------------- + def _gen_eval_kwargs(name): """ Find the keyword arguments to pass to numexpr for the given operation. @@ -140,64 +194,51 @@ def _get_frame_op_default_axis(name): _op_descriptions = { 'add': {'op': '+', 'desc': 'Addition', - 'reversed': False, 'reverse': 'radd'}, 'sub': {'op': '-', 'desc': 'Subtraction', - 'reversed': False, 'reverse': 'rsub'}, 'mul': {'op': '*', 'desc': 'Multiplication', - 'reversed': False, 'reverse': 'rmul'}, 'mod': {'op': '%', 'desc': 'Modulo', - 'reversed': False, 'reverse': 'rmod'}, 'pow': {'op': '**', 'desc': 'Exponential power', - 'reversed': False, 'reverse': 'rpow'}, 'truediv': {'op': '/', 'desc': 'Floating division', - 'reversed': False, 'reverse': 'rtruediv'}, 'floordiv': {'op': '//', 'desc': 'Integer division', - 'reversed': False, 'reverse': 'rfloordiv'}, 'divmod': {'op': 'divmod', 'desc': 'Integer division and modulo', - 'reversed': False, 'reverse': None}, 'eq': {'op': '==', 'desc': 'Equal to', - 'reversed': False, 'reverse': None}, 'ne': {'op': '!=', 'desc': 'Not equal to', - 'reversed': False, 'reverse': None}, 'lt': {'op': '<', 'desc': 'Less than', - 'reversed': False, 'reverse': None}, 'le': {'op': '<=', 'desc': 'Less than or equal to', - 'reversed': False, 'reverse': None}, 'gt': {'op': '>', 'desc': 'Greater than', - 'reversed': False, 'reverse': None}, 'ge': {'op': '>=', 'desc': 'Greater than or equal to', - 'reversed': False, 'reverse': None}} _op_names = list(_op_descriptions.keys()) for key in _op_names: + _op_descriptions[key]['reversed'] = False reverse_op = _op_descriptions[key]['reverse'] if reverse_op is not None: _op_descriptions[reverse_op] = _op_descriptions[key].copy() @@ -392,7 +433,7 @@ def names(x): # yapf: disable new_methods = dict( add=arith_method(operator.add, names('add'), op('+')), - radd=arith_method(lambda x, y: y + x, names('radd'), op('+')), + radd=arith_method(radd, names('radd'), op('+')), sub=arith_method(operator.sub, names('sub'), op('-')), mul=arith_method(operator.mul, names('mul'), op('*')), truediv=arith_method(operator.truediv, names('truediv'), op('/')), @@ -404,13 +445,11 @@ def names(x): # not entirely sure why this is necessary, but previously was included # so it's here to maintain compatibility rmul=arith_method(operator.mul, names('rmul'), op('*')), - rsub=arith_method(lambda x, y: y - x, names('rsub'), op('-')), - rtruediv=arith_method(lambda x, y: operator.truediv(y, x), - names('rtruediv'), op('/')), - rfloordiv=arith_method(lambda x, y: operator.floordiv(y, x), - names('rfloordiv'), op('//')), - rpow=arith_method(lambda x, y: y**x, names('rpow'), op('**')), - rmod=arith_method(lambda x, y: y % x, names('rmod'), op('%'))) + rsub=arith_method(rsub, names('rsub'), op('-')), + rtruediv=arith_method(rtruediv, names('rtruediv'), op('/')), + rfloordiv=arith_method(rfloordiv, 
names('rfloordiv'), op('//')), + rpow=arith_method(rpow, names('rpow'), op('**')), + rmod=arith_method(rmod, names('rmod'), op('%'))) # yapf: enable new_methods['div'] = new_methods['truediv'] new_methods['rdiv'] = new_methods['rtruediv'] @@ -430,12 +469,9 @@ def names(x): or_=bool_method(operator.or_, names('or_'), op('|')), # For some reason ``^`` wasn't used in original. xor=bool_method(operator.xor, names('xor'), op('^')), - rand_=bool_method(lambda x, y: operator.and_(y, x), - names('rand_'), op('&')), - ror_=bool_method(lambda x, y: operator.or_(y, x), - names('ror_'), op('|')), - rxor=bool_method(lambda x, y: operator.xor(y, x), - names('rxor'), op('^')))) + rand_=bool_method(rand_, names('rand_'), op('&')), + ror_=bool_method(ror_, names('ror_'), op('|')), + rxor=bool_method(rxor, names('rxor'), op('^')))) if have_divmod: # divmod doesn't have an op that is supported by numexpr new_methods['divmod'] = arith_method(divmod, names('divmod'), None) From bc1d0273cd4b7ddf348feff5f46b6eb114ea04c9 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Sun, 4 Feb 2018 16:32:52 +0000 Subject: [PATCH 05/20] DOC: Improve replace docstring (#18100) --- pandas/core/frame.py | 8 ++ pandas/core/generic.py | 212 +++++++++++++++++++++++++++++++++-------- pandas/core/series.py | 8 ++ 3 files changed, 187 insertions(+), 41 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 96d28581cfdd9..201d8ba427c8a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3080,6 +3080,14 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, inplace=inplace, limit=limit, downcast=downcast, **kwargs) + @Appender(_shared_docs['replace'] % _shared_doc_kwargs) + def replace(self, to_replace=None, value=None, inplace=False, limit=None, + regex=False, method='pad', axis=None): + return super(DataFrame, self).replace(to_replace=to_replace, + value=value, inplace=inplace, + limit=limit, regex=regex, + method=method, axis=axis) + @Appender(_shared_docs['shift'] % _shared_doc_kwargs) def shift(self, periods=1, freq=None, axis=0): return super(DataFrame, self).shift(periods=periods, freq=freq, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d34a85b5b4388..0f038cd687dfd 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -69,6 +69,10 @@ def _single_replace(self, to_replace, method, inplace, limit): + """ + Replaces values in a Series using the fill method specified when no + replacement value is given in the replace method + """ if self.ndim != 1: raise TypeError('cannot replace {0} with method {1} on a {2}' .format(to_replace, method, type(self).__name__)) @@ -4787,94 +4791,111 @@ def bfill(self, axis=None, inplace=False, limit=None, downcast=None): return self.fillna(method='bfill', axis=axis, inplace=inplace, limit=limit, downcast=downcast) - def replace(self, to_replace=None, value=None, inplace=False, limit=None, - regex=False, method='pad', axis=None): - """ + _shared_docs['replace'] = (""" Replace values given in 'to_replace' with 'value'. 
Parameters ---------- to_replace : str, regex, list, dict, Series, numeric, or None - * str or regex: + * numeric, str or regex: - - str: string exactly matching `to_replace` will be replaced - with `value` - - regex: regexs matching `to_replace` will be replaced with - `value` + - numeric: numeric values equal to ``to_replace`` will be + replaced with ``value`` + - str: string exactly matching ``to_replace`` will be replaced + with ``value`` + - regex: regexs matching ``to_replace`` will be replaced with + ``value`` * list of str, regex, or numeric: - - First, if `to_replace` and `value` are both lists, they + - First, if ``to_replace`` and ``value`` are both lists, they **must** be the same length. - Second, if ``regex=True`` then all of the strings in **both** lists will be interpreted as regexs otherwise they will match - directly. This doesn't matter much for `value` since there + directly. This doesn't matter much for ``value`` since there are only a few possible substitution regexes you can use. - - str and regex rules apply as above. + - str, regex and numeric rules apply as above. * dict: - - Nested dictionaries, e.g., {'a': {'b': nan}}, are read as - follows: look in column 'a' for the value 'b' and replace it - with nan. You can nest regular expressions as well. Note that + - Dicts can be used to specify different replacement values + for different existing values. For example, + {'a': 'b', 'y': 'z'} replaces the value 'a' with 'b' and + 'y' with 'z'. To use a dict in this way the ``value`` + parameter should be ``None``. + - For a DataFrame a dict can specify that different values + should be replaced in different columns. For example, + {'a': 1, 'b': 'z'} looks for the value 1 in column 'a' and + the value 'z' in column 'b' and replaces these values with + whatever is specified in ``value``. The ``value`` parameter + should not be ``None`` in this case. You can treat this as a + special case of passing two lists except that you are + specifying the column to search in. + - For a DataFrame nested dictionaries, e.g., + {'a': {'b': np.nan}}, are read as follows: look in column 'a' + for the value 'b' and replace it with NaN. The ``value`` + parameter should be ``None`` to use a nested dict in this + way. You can nest regular expressions as well. Note that column names (the top-level dictionary keys in a nested dictionary) **cannot** be regular expressions. - - Keys map to column names and values map to substitution - values. You can treat this as a special case of passing two - lists except that you are specifying the column to search in. * None: - This means that the ``regex`` argument must be a string, compiled regular expression, or list, dict, ndarray or Series - of such elements. If `value` is also ``None`` then this + of such elements. If ``value`` is also ``None`` then this **must** be a nested dictionary or ``Series``. See the examples section for examples of each of these. value : scalar, dict, list, str, regex, default None - Value to use to fill holes (e.g. 0), alternately a dict of values - specifying which value to use for each column (columns not in the - dict will not be filled). Regular expressions, strings and lists or - dicts of such objects are also allowed. + Value to replace any values matching ``to_replace`` with. + For a DataFrame a dict of values can be used to specify which + value to use for each column (columns not in the dict will not be + filled). Regular expressions, strings and lists or dicts of such + objects are also allowed. 
inplace : boolean, default False If True, in place. Note: this will modify any other views on this object (e.g. a column from a DataFrame). Returns the caller if this is True. limit : int, default None Maximum size gap to forward or backward fill - regex : bool or same types as `to_replace`, default False - Whether to interpret `to_replace` and/or `value` as regular - expressions. If this is ``True`` then `to_replace` *must* be a - string. Otherwise, `to_replace` must be ``None`` because this - parameter will be interpreted as a regular expression or a list, - dict, or array of regular expressions. + regex : bool or same types as ``to_replace``, default False + Whether to interpret ``to_replace`` and/or ``value`` as regular + expressions. If this is ``True`` then ``to_replace`` *must* be a + string. Alternatively, this could be a regular expression or a + list, dict, or array of regular expressions in which case + ``to_replace`` must be ``None``. method : string, optional, {'pad', 'ffill', 'bfill'} The method to use when for replacement, when ``to_replace`` is a ``list``. See Also -------- - NDFrame.reindex - NDFrame.asfreq - NDFrame.fillna + %(klass)s.fillna : Fill NA/NaN values + %(klass)s.where : Replace values based on boolean condition Returns ------- - filled : NDFrame + filled : %(klass)s Raises ------ AssertionError - * If `regex` is not a ``bool`` and `to_replace` is not ``None``. + * If ``regex`` is not a ``bool`` and ``to_replace`` is not + ``None``. TypeError - * If `to_replace` is a ``dict`` and `value` is not a ``list``, + * If ``to_replace`` is a ``dict`` and ``value`` is not a ``list``, ``dict``, ``ndarray``, or ``Series`` - * If `to_replace` is ``None`` and `regex` is not compilable into a - regular expression or is a list, dict, ndarray, or Series. + * If ``to_replace`` is ``None`` and ``regex`` is not compilable + into a regular expression or is a list, dict, ndarray, or + Series. + * When replacing multiple ``bool`` or ``datetime64`` objects and + the arguments to ``to_replace`` does not match the type of the + value being replaced ValueError - * If `to_replace` and `value` are ``list`` s or ``ndarray`` s, but - they are not the same length. + * If a ``list`` or an ``ndarray`` is passed to ``to_replace`` and + `value` but they are not the same length. Notes ----- @@ -4883,12 +4904,121 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, * Regular expressions will only substitute on strings, meaning you cannot provide, for example, a regular expression matching floating point numbers and expect the columns in your frame that have a - numeric dtype to be matched. However, if those floating point numbers - *are* strings, then you can do this. + numeric dtype to be matched. However, if those floating point + numbers *are* strings, then you can do this. * This method has *a lot* of options. You are encouraged to experiment and play with this method to gain intuition about how it works. - """ + Examples + -------- + + >>> s = pd.Series([0, 1, 2, 3, 4]) + >>> s.replace(0, 5) + 0 5 + 1 1 + 2 2 + 3 3 + 4 4 + dtype: int64 + >>> df = pd.DataFrame({'A': [0, 1, 2, 3, 4], + ... 'B': [5, 6, 7, 8, 9], + ... 
'C': ['a', 'b', 'c', 'd', 'e']}) + >>> df.replace(0, 5) + A B C + 0 5 5 a + 1 1 6 b + 2 2 7 c + 3 3 8 d + 4 4 9 e + + >>> df.replace([0, 1, 2, 3], 4) + A B C + 0 4 5 a + 1 4 6 b + 2 4 7 c + 3 4 8 d + 4 4 9 e + >>> df.replace([0, 1, 2, 3], [4, 3, 2, 1]) + A B C + 0 4 5 a + 1 3 6 b + 2 2 7 c + 3 1 8 d + 4 4 9 e + >>> s.replace([1, 2], method='bfill') + 0 0 + 1 3 + 2 3 + 3 3 + 4 4 + dtype: int64 + + >>> df.replace({0: 10, 1: 100}) + A B C + 0 10 5 a + 1 100 6 b + 2 2 7 c + 3 3 8 d + 4 4 9 e + >>> df.replace({'A': 0, 'B': 5}, 100) + A B C + 0 100 100 a + 1 1 6 b + 2 2 7 c + 3 3 8 d + 4 4 9 e + >>> df.replace({'A': {0: 100, 4: 400}}) + A B C + 0 100 5 a + 1 1 6 b + 2 2 7 c + 3 3 8 d + 4 400 9 e + + >>> df = pd.DataFrame({'A': ['bat', 'foo', 'bait'], + ... 'B': ['abc', 'bar', 'xyz']}) + >>> df.replace(to_replace=r'^ba.$', value='new', regex=True) + A B + 0 new abc + 1 foo new + 2 bait xyz + >>> df.replace({'A': r'^ba.$'}, {'A': 'new'}, regex=True) + A B + 0 new abc + 1 foo bar + 2 bait xyz + >>> df.replace(regex=r'^ba.$', value='new') + A B + 0 new abc + 1 foo new + 2 bait xyz + >>> df.replace(regex={r'^ba.$':'new', 'foo':'xyz'}) + A B + 0 new abc + 1 xyz new + 2 bait xyz + >>> df.replace(regex=[r'^ba.$', 'foo'], value='new') + A B + 0 new abc + 1 new new + 2 bait xyz + + Note that when replacing multiple ``bool`` or ``datetime64`` objects, + the data types in the ``to_replace`` parameter must match the data + type of the value being replaced: + + >>> df = pd.DataFrame({'A': [True, False, True], + ... 'B': [False, True, False]}) + >>> df.replace({'a string': 'new value', True: False}) # raises + TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str' + + This raises a ``TypeError`` because one of the ``dict`` keys is not of + the correct type for replacement. + """) + + @Appender(_shared_docs['replace'] % _shared_doc_kwargs) + def replace(self, to_replace=None, value=None, inplace=False, limit=None, + regex=False, method='pad', axis=None): inplace = validate_bool_kwarg(inplace, 'inplace') if not is_bool(regex) and to_replace is not None: raise AssertionError("'to_replace' must be 'None' if 'regex' is " diff --git a/pandas/core/series.py b/pandas/core/series.py index 78b4c3a70a519..e4b8979d6393a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2671,6 +2671,14 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, limit=limit, downcast=downcast, **kwargs) + @Appender(generic._shared_docs['replace'] % _shared_doc_kwargs) + def replace(self, to_replace=None, value=None, inplace=False, limit=None, + regex=False, method='pad', axis=None): + return super(Series, self).replace(to_replace=to_replace, value=value, + inplace=inplace, limit=limit, + regex=regex, method=method, + axis=axis) + @Appender(generic._shared_docs['shift'] % _shared_doc_kwargs) def shift(self, periods=1, freq=None, axis=0): return super(Series, self).shift(periods=periods, freq=freq, axis=axis) From de39a1572fcf82071f3c0b5f22be1611222bdf41 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Sun, 4 Feb 2018 21:44:39 +0000 Subject: [PATCH 06/20] DOC: minor groupby and resampler improvements (#19514) --- doc/source/groupby.rst | 7 ++++--- pandas/core/generic.py | 27 ++++++++++++++++++++++++--- pandas/core/groupby.py | 2 +- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 413138b1e52fc..407fad39ba232 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -1219,8 +1219,8 @@ see :ref:`here `. 
Combining ``.groupby`` and ``.pipe`` is often useful when you need to reuse GroupBy objects. -For an example, imagine having a DataFrame with columns for stores, products, -revenue and sold quantity. We'd like to do a groupwise calculation of *prices* +As an example, imagine having a DataFrame with columns for stores, products, +revenue and quantity sold. We'd like to do a groupwise calculation of *prices* (i.e. revenue/quantity) per store and per product. We could do this in a multi-step operation, but expressing it in terms of piping can make the code more readable. First we set the data: @@ -1230,7 +1230,8 @@ code more readable. First we set the data: import numpy as np n = 1000 df = pd.DataFrame({'Store': np.random.choice(['Store_1', 'Store_2'], n), - 'Product': np.random.choice(['Product_1', 'Product_2', 'Product_3'], n), + 'Product': np.random.choice(['Product_1', + 'Product_2'], n), 'Revenue': (np.random.random(n)*50+10).round(2), 'Quantity': np.random.randint(1, 10, size=n)}) df.head(2) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0f038cd687dfd..cb4bbb7b27c42 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5691,6 +5691,10 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, reduce the dimensionality of the return type if possible, otherwise return a consistent type + Returns + ------- + GroupBy object + Examples -------- DataFrame results @@ -5702,10 +5706,15 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, >>> data.groupby(['col1', 'col2']).mean() - Returns - ------- - GroupBy object + Notes + ----- + See the `user guide + `_ for more. + See also + -------- + resample : Convenience method for frequency conversion and resampling + of time series. """ from pandas.core.groupby import groupby @@ -5904,8 +5913,16 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, .. versionadded:: 0.19.0 + Returns + ------- + Resampler object + Notes ----- + See the `user guide + `_ + for more. + To learn more about the offset strings, please see `this link `__. @@ -6071,6 +6088,10 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, a b c d 2000-01-01 00:00:00 0 6 12 18 2000-01-01 00:03:00 0 4 8 12 + + See also + -------- + groupby : Group by mapping, function, label, or list of labels. """ from pandas.core.resample import (resample, _maybe_process_deprecations) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 2c1deb9db7bba..88af80e295d74 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -230,7 +230,7 @@ Notes ----- See more `here -`_ +`_ Examples -------- From ce435dfefaec4582fbd435ceb6127f14ca8d6975 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Mon, 5 Feb 2018 08:39:43 +0000 Subject: [PATCH 07/20] DEPR: Changing default of str.extract(expand=False) to str.extract(expand=True) (#19118) --- doc/source/text.rst | 3 ++- doc/source/whatsnew/v0.23.0.txt | 47 +++++++++++++++++++++++++++++++++ pandas/core/strings.py | 15 +++-------- pandas/tests/test_strings.py | 9 ++++--- 4 files changed, 58 insertions(+), 16 deletions(-) diff --git a/doc/source/text.rst b/doc/source/text.rst index 2b6459b581c1e..1e620acb1f88a 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -218,7 +218,8 @@ Extract first match in each subject (extract) ``DataFrame``, depending on the subject and regular expression pattern (same behavior as pre-0.18.0). 
When ``expand=True`` it always returns a ``DataFrame``, which is more consistent and less - confusing from the perspective of a user. + confusing from the perspective of a user. ``expand=True`` is the + default since version 0.23.0. The ``extract`` method accepts a `regular expression `__ with at least one diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 69965f44d87a8..0ac27a2f23386 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -296,6 +296,53 @@ Build Changes - Building from source now explicitly requires ``setuptools`` in ``setup.py`` (:issue:`18113`) - Updated conda recipe to be in compliance with conda-build 3.0+ (:issue:`18002`) +Extraction of matching patterns from strings +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +By default, extracting matching patterns from strings with :func:`str.extract` used to return a +``Series`` if a single group was being extracted (a ``DataFrame`` if more than one group was +extracted``). As of Pandas 0.23.0 :func:`str.extract` always returns a ``DataFrame``, unless +``expand`` is set to ``False`` (:issue:`11386`). + +Also, ``None`` was an accepted value for the ``expand`` parameter (which was equivalent to +``False``), but now raises a ``ValueError``. + +Previous Behavior: + +.. code-block:: ipython + + In [1]: s = pd.Series(['number 10', '12 eggs']) + + In [2]: extracted = s.str.extract('.*(\d\d).*') + + In [3]: extracted + Out [3]: + 0 10 + 1 12 + dtype: object + + In [4]: type(extracted) + Out [4]: + pandas.core.series.Series + +New Behavior: + +.. ipython:: python + + s = pd.Series(['number 10', '12 eggs']) + extracted = s.str.extract('.*(\d\d).*') + extracted + type(extracted) + +To restore previous behavior, simply set ``expand`` to ``False``: + +.. ipython:: python + + s = pd.Series(['number 10', '12 eggs']) + extracted = s.str.extract('.*(\d\d).*', expand=False) + extracted + type(extracted) + .. _whatsnew_0230.api: Other API Changes diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 12c7feb5f2b15..b1c1ede66236c 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -598,7 +598,7 @@ def _str_extract_frame(arr, pat, flags=0): dtype=object) -def str_extract(arr, pat, flags=0, expand=None): +def str_extract(arr, pat, flags=0, expand=True): r""" For each subject string in the Series, extract groups from the first match of regular expression pat. @@ -610,7 +610,7 @@ def str_extract(arr, pat, flags=0, expand=None): flags : int, default 0 (no flags) re module flags, e.g. re.IGNORECASE - expand : bool, default False + expand : bool, default True * If True, return DataFrame. * If False, return Series/Index/DataFrame. 
@@ -676,15 +676,6 @@ def str_extract(arr, pat, flags=0, expand=None): dtype: object """ - if expand is None: - warnings.warn( - "currently extract(expand=None) " + - "means expand=False (return Index/Series/DataFrame) " + - "but in a future version of pandas this will be changed " + - "to expand=True (return DataFrame)", - FutureWarning, - stacklevel=3) - expand = False if not isinstance(expand, bool): raise ValueError("expand must be True or False") if expand: @@ -1739,7 +1730,7 @@ def translate(self, table, deletechars=None): findall = _pat_wrapper(str_findall, flags=True) @copy(str_extract) - def extract(self, pat, flags=0, expand=None): + def extract(self, pat, flags=0, expand=True): return str_extract(self, pat, flags=flags, expand=expand) @copy(str_extractall) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 973fe74429551..178c5ff655b04 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -612,13 +612,16 @@ def test_match(self): def test_extract_expand_None(self): values = Series(['fooBAD__barBAD', NA, 'foo']) - with tm.assert_produces_warning(FutureWarning): + with tm.assert_raises_regex(ValueError, + 'expand must be True or False'): values.str.extract('.*(BAD[_]+).*(BAD)', expand=None) def test_extract_expand_unspecified(self): values = Series(['fooBAD__barBAD', NA, 'foo']) - with tm.assert_produces_warning(FutureWarning): - values.str.extract('.*(BAD[_]+).*(BAD)') + result_unspecified = values.str.extract('.*(BAD[_]+).*') + assert isinstance(result_unspecified, DataFrame) + result_true = values.str.extract('.*(BAD[_]+).*', expand=True) + tm.assert_frame_equal(result_unspecified, result_true) def test_extract_expand_False(self): # Contains tests like those in test_match and some others. 
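To make the behavior change in PATCH 07 concrete, here is a short standalone snippet exercising the new ``str.extract`` default as described by the diff above. This is a minimal sketch written for this digest, not part of the patch series itself; it assumes pandas with the post-0.23.0 semantics that the patch introduces.

    import pandas as pd

    s = pd.Series(['number 10', '12 eggs'])

    # A single capture group now yields a one-column DataFrame by default,
    # because expand defaults to True after this patch.
    as_frame = s.str.extract(r'.*(\d\d).*')
    assert isinstance(as_frame, pd.DataFrame)

    # expand=False restores the pre-0.23.0 return type: a Series when the
    # pattern has a single capture group.
    as_series = s.str.extract(r'.*(\d\d).*', expand=False)
    assert isinstance(as_series, pd.Series)

    # expand=None is no longer a silent alias for False; it now raises.
    try:
        s.str.extract(r'.*(\d\d).*', expand=None)
    except ValueError as err:
        assert 'expand must be True or False' in str(err)
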
From 074d88159667c33319f1c5ab848870b4bd1e7e6e Mon Sep 17 00:00:00 2001 From: jschendel Date: Mon, 5 Feb 2018 04:05:20 -0700 Subject: [PATCH 08/20] TST: Remove legacy instances of _multiprocess_can_split_ (#19536) --- pandas/tests/frame/test_apply.py | 2 -- pandas/tests/indexes/period/test_period.py | 1 - pandas/tests/indexes/timedeltas/test_astype.py | 1 - pandas/tests/indexes/timedeltas/test_construction.py | 1 - pandas/tests/indexes/timedeltas/test_indexing.py | 1 - pandas/tests/indexes/timedeltas/test_ops.py | 1 - pandas/tests/indexes/timedeltas/test_setops.py | 1 - pandas/tests/indexes/timedeltas/test_timedelta.py | 2 -- pandas/tests/indexes/timedeltas/test_timedelta_range.py | 1 - pandas/tests/indexes/timedeltas/test_tools.py | 1 - pandas/tests/scalar/test_timedelta.py | 2 -- pandas/tests/series/test_apply.py | 2 -- 12 files changed, 16 deletions(-) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index e0fc6c470fe57..d69ddcd8f14d4 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -496,8 +496,6 @@ def zip_frames(*frames): class TestDataFrameAggregate(TestData): - _multiprocess_can_split_ = True - def test_agg_transform(self): with np.errstate(all='ignore'): diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index ab341b70dfe91..6fc7fa5486f82 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -14,7 +14,6 @@ class TestPeriodIndex(DatetimeLike): _holder = PeriodIndex - _multiprocess_can_split_ = True def setup_method(self, method): self.indices = dict(index=tm.makePeriodIndex(10), diff --git a/pandas/tests/indexes/timedeltas/test_astype.py b/pandas/tests/indexes/timedeltas/test_astype.py index c3bd857036efc..6c644d239069a 100644 --- a/pandas/tests/indexes/timedeltas/test_astype.py +++ b/pandas/tests/indexes/timedeltas/test_astype.py @@ -8,7 +8,6 @@ class TestTimedeltaIndex(object): - _multiprocess_can_split_ = True def test_astype(self): # GH 13149, GH 13209 diff --git a/pandas/tests/indexes/timedeltas/test_construction.py b/pandas/tests/indexes/timedeltas/test_construction.py index 70aadd9f57174..68dc0003e2312 100644 --- a/pandas/tests/indexes/timedeltas/test_construction.py +++ b/pandas/tests/indexes/timedeltas/test_construction.py @@ -9,7 +9,6 @@ class TestTimedeltaIndex(object): - _multiprocess_can_split_ = True def test_construction_base_constructor(self): arr = [pd.Timedelta('1 days'), pd.NaT, pd.Timedelta('3 days')] diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index e64c4e6ac54a5..59e38c2e738b0 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -9,7 +9,6 @@ class TestTimedeltaIndex(object): - _multiprocess_can_split_ = True def test_insert(self): diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index e944aad13f8d5..86d7dd4e1b117 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -420,7 +420,6 @@ def test_equals(self): class TestTimedeltas(object): - _multiprocess_can_split_ = True def test_timedelta_ops(self): # GH4984 diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index 22546d25273a7..020e9079b3436 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ 
b/pandas/tests/indexes/timedeltas/test_setops.py @@ -6,7 +6,6 @@ class TestTimedeltaIndex(object): - _multiprocess_can_split_ = True def test_union(self): diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 32157a9a44e04..ce0f3b89b753e 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -18,7 +18,6 @@ class TestTimedeltaIndex(DatetimeLike): _holder = TimedeltaIndex - _multiprocess_can_split_ = True def setup_method(self, method): self.indices = dict(index=tm.makeTimedeltaIndex(10)) @@ -300,7 +299,6 @@ def test_freq_conversion(self): class TestTimeSeries(object): - _multiprocess_can_split_ = True def test_series_box_timedelta(self): rng = timedelta_range('1 day 1 s', periods=5, freq='h') diff --git a/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/pandas/tests/indexes/timedeltas/test_timedelta_range.py index 7624e1f79af15..784ef845fea10 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta_range.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta_range.py @@ -7,7 +7,6 @@ class TestTimedeltas(object): - _multiprocess_can_split_ = True def test_timedelta_range(self): diff --git a/pandas/tests/indexes/timedeltas/test_tools.py b/pandas/tests/indexes/timedeltas/test_tools.py index b4ad28eeacb69..daa9739132d9e 100644 --- a/pandas/tests/indexes/timedeltas/test_tools.py +++ b/pandas/tests/indexes/timedeltas/test_tools.py @@ -11,7 +11,6 @@ class TestTimedeltas(object): - _multiprocess_can_split_ = True def test_to_timedelta(self): def conv(v): diff --git a/pandas/tests/scalar/test_timedelta.py b/pandas/tests/scalar/test_timedelta.py index 64d4940082978..667266be2a89b 100644 --- a/pandas/tests/scalar/test_timedelta.py +++ b/pandas/tests/scalar/test_timedelta.py @@ -13,7 +13,6 @@ class TestTimedeltaArithmetic(object): - _multiprocess_can_split_ = True def test_arithmetic_overflow(self): with pytest.raises(OverflowError): @@ -286,7 +285,6 @@ def test_compare_timedelta_ndarray(self): class TestTimedeltas(object): - _multiprocess_can_split_ = True def setup_method(self, method): pass diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 3822ecd0a1b0e..0780c846a6c19 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -164,8 +164,6 @@ def test_apply_dict_depr(self): class TestSeriesAggregate(TestData): - _multiprocess_can_split_ = True - def test_transform(self): # transforming functions From 98f3937c3bec176fe0fe8e08bfa9d689a7fc45ce Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 5 Feb 2018 03:06:42 -0800 Subject: [PATCH 09/20] remove unused calendar options from period_helper (#19534) --- pandas/_libs/src/period_helper.c | 119 +++++++++++-------------------- pandas/_libs/src/period_helper.h | 4 -- pandas/_libs/tslibs/period.pyx | 1 - 3 files changed, 43 insertions(+), 81 deletions(-) diff --git a/pandas/_libs/src/period_helper.c b/pandas/_libs/src/period_helper.c index f1367978bd6c9..8f1c527a68455 100644 --- a/pandas/_libs/src/period_helper.c +++ b/pandas/_libs/src/period_helper.c @@ -47,13 +47,10 @@ static int days_in_month[2][12] = { {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}}; -/* Return 1/0 iff year points to a leap year in calendar. 
*/
-static int dInfoCalc_Leapyear(npy_int64 year, int calendar) {
-    if (calendar == GREGORIAN_CALENDAR) {
-        return (year % 4 == 0) && ((year % 100 != 0) || (year % 400 == 0));
-    } else {
-        return (year % 4 == 0);
-    }
+/* Return 1/0 iff year points to a leap year.
+ * Assumes GREGORIAN_CALENDAR */
+static int dInfoCalc_Leapyear(npy_int64 year) {
+    return (year % 4 == 0) && ((year % 100 != 0) || (year % 400 == 0));
 }
 
 /* Return the day of the week for the given absolute date. */
@@ -71,40 +68,33 @@ static int dInfoCalc_DayOfWeek(npy_int64 absdate) {
 
 static int monthToQuarter(int month) { return ((month - 1) / 3) + 1; }
 
 /* Return the year offset, that is the absolute date of the day
-   31.12.(year-1) in the given calendar.
+   31.12.(year-1)
+
+   Assumes GREGORIAN_CALENDAR
+
+   This is equivalent to:
+
+   (datetime(year, 1, 1) - datetime(1, 1, 1)).days
 
    Note: For the Julian calendar we shift the absdate (which is measured
    using the Gregorian Epoch) value by two days because the Epoch
    (0001-01-01) in the Julian calendar lies 2 days before the Epoch in
    the Gregorian calendar. */
-static int dInfoCalc_YearOffset(npy_int64 year, int calendar) {
+static int dInfoCalc_YearOffset(npy_int64 year) {
     year--;
-    if (calendar == GREGORIAN_CALENDAR) {
-        if (year >= 0 || -1 / 4 == -1)
-            return year * 365 + year / 4 - year / 100 + year / 400;
-        else
-            return year * 365 + (year - 3) / 4 - (year - 99) / 100 +
+    if (year >= 0 || -1 / 4 == -1)
+        return year * 365 + year / 4 - year / 100 + year / 400;
+    else
+        return year * 365 + (year - 3) / 4 - (year - 99) / 100 +
                (year - 399) / 400;
-    } else if (calendar == JULIAN_CALENDAR) {
-        if (year >= 0 || -1 / 4 == -1)
-            return year * 365 + year / 4 - 2;
-        else
-            return year * 365 + (year - 3) / 4 - 2;
-    }
-    Py_Error(PyExc_ValueError, "unknown calendar");
-
-onError:
-    return INT_ERR_CODE;
 }
 
-/* Set the instance's value using the given date and time. calendar may be set
- * to the flags: GREGORIAN_CALENDAR, JULIAN_CALENDAR to indicate the calendar
- * to be used. */
-
+/* Set the instance's value using the given date and time.
+ * Assumes GREGORIAN_CALENDAR */
 static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, int year,
                                         int month, int day, int hour,
-                                        int minute, double second,
-                                        int calendar) {
+                                        int minute, double second) {
     /* Calculate the absolute date */
     {
         int leap;
@@ -116,7 +106,7 @@ static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, int year,
                          PyExc_ValueError, "year out of range: %i", year);
 
         /* Is it a leap year ?
*/ - leap = dInfoCalc_Leapyear(year, calendar); + leap = dInfoCalc_Leapyear(year); /* Negative month values indicate months relative to the years end */ if (month < 0) month += 13; @@ -128,7 +118,7 @@ static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, int year, Py_AssertWithArg(day >= 1 && day <= days_in_month[leap][month - 1], PyExc_ValueError, "day out of range: %i", day); - yearoffset = dInfoCalc_YearOffset(year, calendar); + yearoffset = dInfoCalc_YearOffset(year); if (yearoffset == INT_ERR_CODE) goto onError; absdate = day + month_offset[leap][month - 1] + yearoffset; @@ -142,8 +132,6 @@ static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, int year, dinfo->day_of_week = dInfoCalc_DayOfWeek(absdate); dinfo->day_of_year = (short)(absdate - yearoffset); - - dinfo->calendar = calendar; } /* Calculate the absolute time */ @@ -171,33 +159,27 @@ static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, int year, return INT_ERR_CODE; } -/* Sets the date part of the date_info struct using the indicated - calendar. +/* Sets the date part of the date_info struct + Assumes GREGORIAN_CALENDAR XXX This could also be done using some integer arithmetics rather than with this iterative approach... */ static int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, - npy_int64 absdate, int calendar) { + npy_int64 absdate) { register npy_int64 year; npy_int64 yearoffset; int leap, dayoffset; int *monthoffset; /* Approximate year */ - if (calendar == GREGORIAN_CALENDAR) { - year = (npy_int64)(((double)absdate) / 365.2425); - } else if (calendar == JULIAN_CALENDAR) { - year = (npy_int64)(((double)absdate) / 365.25); - } else { - Py_Error(PyExc_ValueError, "unknown calendar"); - } + year = (npy_int64)(((double)absdate) / 365.2425); if (absdate > 0) year++; /* Apply corrections to reach the correct year */ while (1) { /* Calculate the year offset */ - yearoffset = dInfoCalc_YearOffset(year, calendar); + yearoffset = dInfoCalc_YearOffset(year); if (yearoffset == INT_ERR_CODE) goto onError; /* Backward correction: absdate must be greater than the @@ -208,7 +190,7 @@ static int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, } dayoffset = absdate - yearoffset; - leap = dInfoCalc_Leapyear(year, calendar); + leap = dInfoCalc_Leapyear(year); /* Forward correction: non leap years only have 365 days */ if (dayoffset > 365 && !leap) { @@ -219,7 +201,6 @@ static int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, } dinfo->year = year; - dinfo->calendar = calendar; /* Now iterate to find the month */ monthoffset = month_offset[leap]; @@ -410,8 +391,7 @@ static npy_int64 DtoB_WeekendToFriday(npy_int64 absdate, int day_of_week) { static npy_int64 absdate_from_ymd(int y, int m, int d) { struct date_info tempDate; - if (dInfoCalc_SetFromDateAndTime(&tempDate, y, m, d, 0, 0, 0, - GREGORIAN_CALENDAR)) { + if (dInfoCalc_SetFromDateAndTime(&tempDate, y, m, d, 0, 0, 0)) { return INT_ERR_CODE; } return tempDate.absdate; @@ -423,8 +403,7 @@ static npy_int64 asfreq_DTtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) { struct date_info dinfo; ordinal = downsample_daytime(ordinal, af_info, 0); - if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, - GREGORIAN_CALENDAR)) + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET)) return INT_ERR_CODE; if (dinfo.month > af_info->to_a_year_end) { return (npy_int64)(dinfo.year + 1 - BASE_YEAR); @@ -436,8 +415,7 @@ static npy_int64 asfreq_DTtoA(npy_int64 ordinal, char relation, static npy_int64 
DtoQ_yq(npy_int64 ordinal, asfreq_info *af_info, int *year, int *quarter) { struct date_info dinfo; - if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, - GREGORIAN_CALENDAR)) + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET)) return INT_ERR_CODE; if (af_info->to_q_year_end != 12) { dinfo.month -= af_info->to_q_year_end; @@ -474,8 +452,7 @@ static npy_int64 asfreq_DTtoM(npy_int64 ordinal, char relation, ordinal = downsample_daytime(ordinal, af_info, 0); - if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, - GREGORIAN_CALENDAR)) + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET)) return INT_ERR_CODE; return (npy_int64)((dinfo.year - BASE_YEAR) * 12 + dinfo.month - 1); } @@ -493,8 +470,7 @@ static npy_int64 asfreq_DTtoB(npy_int64 ordinal, char relation, ordinal = downsample_daytime(ordinal, af_info, 0); - if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, - GREGORIAN_CALENDAR)) + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET)) return INT_ERR_CODE; if (relation == 'S') { @@ -595,8 +571,7 @@ static npy_int64 asfreq_WtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { struct date_info dinfo; if (dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_WtoDT(ordinal, relation, af_info) + ORD_OFFSET, - GREGORIAN_CALENDAR)) + &dinfo, asfreq_WtoDT(ordinal, relation, af_info) + ORD_OFFSET)) return INT_ERR_CODE; if (relation == 'S') { @@ -655,8 +630,7 @@ static npy_int64 asfreq_MtoB(npy_int64 ordinal, char relation, struct date_info dinfo; if (dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_MtoDT(ordinal, relation, af_info) + ORD_OFFSET, - GREGORIAN_CALENDAR)) + &dinfo, asfreq_MtoDT(ordinal, relation, af_info) + ORD_OFFSET)) return INT_ERR_CODE; if (relation == 'S') { @@ -731,8 +705,7 @@ static npy_int64 asfreq_QtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { struct date_info dinfo; if (dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_QtoDT(ordinal, relation, af_info) + ORD_OFFSET, - GREGORIAN_CALENDAR)) + &dinfo, asfreq_QtoDT(ordinal, relation, af_info) + ORD_OFFSET)) return INT_ERR_CODE; if (relation == 'S') { @@ -803,8 +776,7 @@ static npy_int64 asfreq_AtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { struct date_info dinfo; if (dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_AtoDT(ordinal, relation, af_info) + ORD_OFFSET, - GREGORIAN_CALENDAR)) + &dinfo, asfreq_AtoDT(ordinal, relation, af_info) + ORD_OFFSET)) return INT_ERR_CODE; if (relation == 'S') { @@ -1096,19 +1068,17 @@ static int dInfoCalc_SetFromAbsTime(struct date_info *dinfo, double abstime) { return 0; } -/* Set the instance's value using the given date and time. calendar - may be set to the flags: GREGORIAN_CALENDAR, JULIAN_CALENDAR to - indicate the calendar to be used. */ +/* Set the instance's value using the given date and time. + Assumes GREGORIAN_CALENDAR. 
*/ static int dInfoCalc_SetFromAbsDateTime(struct date_info *dinfo, - npy_int64 absdate, double abstime, - int calendar) { + npy_int64 absdate, double abstime) { /* Bounds check */ Py_AssertWithArg(abstime >= 0.0 && abstime <= SECONDS_PER_DAY, PyExc_ValueError, "abstime out of range (0.0 - 86400.0): %f", abstime); /* Calculate the date */ - if (dInfoCalc_SetFromAbsDate(dinfo, absdate, calendar)) goto onError; + if (dInfoCalc_SetFromAbsDate(dinfo, absdate)) goto onError; /* Calculate the time */ if (dInfoCalc_SetFromAbsTime(dinfo, abstime)) goto onError; @@ -1356,8 +1326,7 @@ static int _ISOWeek(struct date_info *dinfo) { /* Verify */ if (week < 0) { /* The day lies in last week of the previous year */ - if ((week > -2) || (week == -2 && dInfoCalc_Leapyear(dinfo->year - 1, - dinfo->calendar))) + if ((week > -2) || (week == -2 && dInfoCalc_Leapyear(dinfo->year - 1))) week = 53; else week = 52; @@ -1384,8 +1353,7 @@ int get_date_info(npy_int64 ordinal, int freq, struct date_info *dinfo) { absdate += 1; } - if (dInfoCalc_SetFromAbsDateTime(dinfo, absdate, abstime, - GREGORIAN_CALENDAR)) + if (dInfoCalc_SetFromAbsDateTime(dinfo, absdate, abstime)) return INT_ERR_CODE; return 0; @@ -1480,7 +1448,6 @@ int pdays_in_month(npy_int64 ordinal, int freq) { if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) return INT_ERR_CODE; - days = days_in_month[dInfoCalc_Leapyear(dinfo.year, dinfo.calendar)] - [dinfo.month - 1]; + days = days_in_month[dInfoCalc_Leapyear(dinfo.year)][dinfo.month - 1]; return days; } diff --git a/pandas/_libs/src/period_helper.h b/pandas/_libs/src/period_helper.h index 35dd20848a2ec..d3d32f81d1f66 100644 --- a/pandas/_libs/src/period_helper.h +++ b/pandas/_libs/src/period_helper.h @@ -24,9 +24,6 @@ frequency conversion routines. * declarations from period here */ -#define GREGORIAN_CALENDAR 0 -#define JULIAN_CALENDAR 1 - #define SECONDS_PER_DAY ((double)86400.0) #define Py_AssertWithArg(x, errortype, errorstr, a1) \ @@ -138,7 +135,6 @@ typedef struct date_info { int year; int day_of_week; int day_of_year; - int calendar; } date_info; typedef npy_int64 (*freq_conv_func)(npy_int64, char, asfreq_info *); diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index e2caebe4c4afc..5098e5c9100ff 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -59,7 +59,6 @@ cdef extern from "period_helper.h": int year int day_of_week int day_of_year - int calendar ctypedef struct asfreq_info: int from_week_end From 5b58a20504aeb3efe8858164377edc0e4f02ae02 Mon Sep 17 00:00:00 2001 From: discort Date: Mon, 5 Feb 2018 06:12:02 -0500 Subject: [PATCH 10/20] BUG: groupby with resample using on parameter errors when selecting column to apply function closes #17813 Author: discort Closes #19433 from discort/fix_17813 and squashes the following commits: 2f25d40a0 [discort] Fixed bug in df.resample using 'on' parameter --- doc/source/whatsnew/v0.23.0.txt | 8 ++++++-- pandas/core/groupby.py | 18 +++++++++++++++--- pandas/tests/test_resample.py | 9 +++++++++ 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 0ac27a2f23386..b3905824f7e44 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -289,6 +289,8 @@ Convert to an xarray DataArray p.to_xarray() +.. 
_whatsnew_0230.api_breaking.build_changes: + Build Changes ^^^^^^^^^^^^^ @@ -296,6 +298,8 @@ Build Changes - Building from source now explicitly requires ``setuptools`` in ``setup.py`` (:issue:`18113`) - Updated conda recipe to be in compliance with conda-build 3.0+ (:issue:`18002`) +.. _whatsnew_0230.api_breaking.extract: + Extraction of matching patterns from strings ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -594,8 +598,8 @@ Groupby/Resample/Rolling - Fixed regression in :func:`DataFrame.groupby` which would not emit an error when called with a tuple key not in the index (:issue:`18798`) - Bug in :func:`DataFrame.resample` which silently ignored unsupported (or mistyped) options for ``label``, ``closed`` and ``convention`` (:issue:`19303`) - Bug in :func:`DataFrame.groupby` where tuples were interpreted as lists of keys rather than as keys (:issue:`17979`, :issue:`18249`) -- Bug in ``transform`` where particular aggregation functions were being incorrectly cast to match the dtype(s) of the grouped data (:issue:`19200`) -- +- Bug in :func:`DataFrame.transform` where particular aggregation functions were being incorrectly cast to match the dtype(s) of the grouped data (:issue:`19200`) +- Bug in :func:`DataFrame.groupby` passing the `on=` kwarg, and subsequently using ``.apply()`` (:issue:`17813`) Sparse ^^^^^^ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 88af80e295d74..ab0070777c190 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -37,6 +37,7 @@ _ensure_categorical, _ensure_float) from pandas.core.dtypes.cast import maybe_downcast_to_dtype +from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.missing import isna, notna, _maybe_fill from pandas.core.base import (PandasObject, SelectionMixin, GroupByError, @@ -423,6 +424,7 @@ def __init__(self, key=None, level=None, freq=None, axis=0, sort=False): self.obj = None self.indexer = None self.binner = None + self._grouper = None @property def ax(self): @@ -465,12 +467,22 @@ def _set_grouper(self, obj, sort=False): raise ValueError( "The Grouper cannot specify both a key and a level!") + # Keep self.grouper value before overriding + if self._grouper is None: + self._grouper = self.grouper + # the key must be a valid info item if self.key is not None: key = self.key - if key not in obj._info_axis: - raise KeyError("The grouper name {0} is not found".format(key)) - ax = Index(obj[key], name=key) + # The 'on' is already defined + if getattr(self.grouper, 'name', None) == key and \ + isinstance(obj, ABCSeries): + ax = self._grouper.take(obj.index) + else: + if key not in obj._info_axis: + raise KeyError( + "The grouper name {0} is not found".format(key)) + ax = Index(obj[key], name=key) else: ax = obj._get_axis(self.axis) diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index a5aaa328a8e06..2de890ea459f0 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -3077,6 +3077,15 @@ def test_getitem_multiple(self): result = r['buyer'].count() assert_series_equal(result, expected) + def test_groupby_resample_on_api_with_getitem(self): + # GH 17813 + df = pd.DataFrame({'id': list('aabbb'), + 'date': pd.date_range('1-1-2016', periods=5), + 'data': 1}) + exp = df.set_index('date').groupby('id').resample('2D')['data'].sum() + result = df.groupby('id').resample('2D', on='date')['data'].sum() + assert_series_equal(result, exp) + def test_nearest(self): # GH 17496 From d5a7e7c947325554d4ee3c4e3755c878610d354c Mon Sep 17 00:00:00 2001 From: Pietro 
Battiston Date: Mon, 5 Feb 2018 06:35:03 -0500 Subject: [PATCH 11/20] TST: Fix makeIntIndex, benchmark get loc Author: Pietro Battiston Closes #19483 from toobaz/test_get_loc and squashes the following commits: 51d691106 [Pietro Battiston] TST: benchmark get_loc in various cases d424f63df [Pietro Battiston] TST: produce unsorted integer index (consistently with other types) --- asv_bench/benchmarks/index_object.py | 17 +++++++++++++++++ pandas/tests/indexes/test_base.py | 16 +++++++++------- pandas/tests/indexing/test_floats.py | 15 +++++++-------- 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 970760373632a..f1703e163917a 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -147,6 +147,11 @@ def setup(self, dtype): self.idx = getattr(tm, 'make{}Index'.format(dtype))(N) self.array_mask = (np.arange(N) % 3) == 0 self.series_mask = Series(self.array_mask) + self.sorted = self.idx.sort_values() + half = N // 2 + self.non_unique = self.idx[:half].append(self.idx[:half]) + self.non_unique_sorted = self.sorted[:half].append(self.sorted[:half]) + self.key = self.sorted[N // 4] def time_boolean_array(self, dtype): self.idx[self.array_mask] @@ -163,6 +168,18 @@ def time_slice(self, dtype): def time_slice_step(self, dtype): self.idx[::2] + def time_get_loc(self, dtype): + self.idx.get_loc(self.key) + + def time_get_loc_sorted(self, dtype): + self.sorted.get_loc(self.key) + + def time_get_loc_non_unique(self, dtype): + self.non_unique.get_loc(self.key) + + def time_get_loc_non_unique_sorted(self, dtype): + self.non_unique_sorted.get_loc(self.key) + class Float64IndexMethod(object): # GH 13166 diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 974099f1fbbe9..90edcb526bb2e 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -830,15 +830,16 @@ def test_map_with_tuples(self): # Test that returning a single tuple from an Index # returns an Index. - boolean_index = tm.makeIntIndex(3).map(lambda x: (x,)) - expected = Index([(0,), (1,), (2,)]) - tm.assert_index_equal(boolean_index, expected) + idx = tm.makeIntIndex(3) + result = tm.makeIntIndex(3).map(lambda x: (x,)) + expected = Index([(i,) for i in idx]) + tm.assert_index_equal(result, expected) # Test that returning a tuple from a map of a single index # returns a MultiIndex object. - boolean_index = tm.makeIntIndex(3).map(lambda x: (x, x == 1)) - expected = MultiIndex.from_tuples([(0, False), (1, True), (2, False)]) - tm.assert_index_equal(boolean_index, expected) + result = idx.map(lambda x: (x, x == 1)) + expected = MultiIndex.from_tuples([(i, i == 1) for i in idx]) + tm.assert_index_equal(result, expected) # Test that returning a single object from a MultiIndex # returns an Index. 
@@ -870,7 +871,8 @@ def test_map_tseries_indices_return_index(self): def test_map_dictlike(self, mapper): # GH 12756 expected = Index(['foo', 'bar', 'baz']) - result = tm.makeIntIndex(3).map(mapper(expected.values, [0, 1, 2])) + idx = tm.makeIntIndex(3) + result = idx.map(mapper(expected.values, idx)) tm.assert_index_equal(result, expected) for name in self.indices.keys(): diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index d2692c7dc302e..e3f93924aca0d 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -4,7 +4,8 @@ from warnings import catch_warnings import numpy as np -from pandas import Series, DataFrame, Index, Float64Index +from pandas import (Series, DataFrame, Index, Float64Index, Int64Index, + RangeIndex) from pandas.util.testing import assert_series_equal, assert_almost_equal import pandas.util.testing as tm @@ -206,9 +207,8 @@ def test_scalar_integer(self): # test how scalar float indexers work on int indexes # integer index - for index in [tm.makeIntIndex, tm.makeRangeIndex]: + for i in [Int64Index(range(5)), RangeIndex(5)]: - i = index(5) for s in [Series(np.arange(len(i))), DataFrame(np.random.randn(len(i), len(i)), index=i, columns=i)]: @@ -362,9 +362,9 @@ def test_slice_integer(self): # these coerce to a like integer # oob indicates if we are out of bounds # of positional indexing - for index, oob in [(tm.makeIntIndex(5), False), - (tm.makeRangeIndex(5), False), - (tm.makeIntIndex(5) + 10, True)]: + for index, oob in [(Int64Index(range(5)), False), + (RangeIndex(5), False), + (Int64Index(range(5)) + 10, True)]: # s is an in-range index s = Series(range(5), index=index) @@ -486,9 +486,8 @@ def f(): def test_slice_integer_frame_getitem(self): # similar to above, but on the getitem dim (of a DataFrame) - for index in [tm.makeIntIndex, tm.makeRangeIndex]: + for index in [Int64Index(range(5)), RangeIndex(5)]: - index = index(5) s = DataFrame(np.random.randn(5, 2), index=index) def f(idxr): From f391cbfe57fb4e334e9f06d49073dc1ca25eb1e1 Mon Sep 17 00:00:00 2001 From: Pepe Flores Date: Mon, 5 Feb 2018 20:43:02 +0200 Subject: [PATCH 12/20] DOC: Fix typo in example (#19537) Fix typo in the example for pandas.io.formats.style.Styler.format --- pandas/io/formats/style.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 58796aa30f0bf..20e72dd6bde91 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -364,7 +364,7 @@ def format(self, formatter, subset=None): >>> df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b']) >>> df.style.format("{:.2%}") >>> df['c'] = ['a', 'b', 'c', 'd'] - >>> df.style.format({'C': str.upper}) + >>> df.style.format({'c': str.upper}) """ if subset is None: row_locs = range(len(self.data)) From a01f74cf27314817acff6289f36b6eba9c49fb6c Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Mon, 5 Feb 2018 20:24:00 -0500 Subject: [PATCH 13/20] BUG: don't assume series is length > 0 closes #19368 Author: Matthew Kirk Closes #19438 from hexgnu/segfault_memory_usage and squashes the following commits: f9433d844 [Matthew Kirk] Use shared docstring and get rid of if condition 4ead141c0 [Matthew Kirk] Move whatsnew doc to Sparse ae9f74d58 [Matthew Kirk] Revert base.py cdd4141e4 [Matthew Kirk] Fix linting error 93a0c3daa [Matthew Kirk] Merge remote-tracking branch 'upstream/master' into segfault_memory_usage 207bc74d2 [Matthew Kirk] Define memory_usage on SparseArray 21ae14707 [Matthew 
Kirk] FIX: revert change to lib.pyx 3f52a44f6 [Matthew Kirk] Ah ha I think I got it 5e59e9cbc [Matthew Kirk] Use range over 0 <= for loops e25158713 [Matthew Kirk] Fix failing test with indexing 27df317be [Matthew Kirk] Merge remote-tracking branch 'upstream/master' into segfault_memory_usage 7fdd03e94 [Matthew Kirk] Take out comment and use product 6bd6ddd02 [Matthew Kirk] BUG: don't assume series is length > 0 --- doc/source/whatsnew/v0.23.0.txt | 2 +- pandas/core/base.py | 2 +- pandas/core/sparse/array.py | 16 ++++++++++++++-- pandas/tests/sparse/series/test_series.py | 13 +++++++++++++ 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index b3905824f7e44..e4f00990d28c0 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -606,7 +606,7 @@ Sparse - Bug in which creating a ``SparseDataFrame`` from a dense ``Series`` or an unsupported type raised an uncontrolled exception (:issue:`19374`) - Bug in :class:`SparseDataFrame.to_csv` causing exception (:issue:`19384`) -- +- Bug in :class:`SparseSeries.memory_usage` which caused segfault by accessing non sparse elements (:issue:`19368`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/base.py b/pandas/core/base.py index 54d25a16a10a3..d5b204dba063e 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1048,7 +1048,7 @@ def is_monotonic_decreasing(self): def memory_usage(self, deep=False): """ - Memory usage of my values + Memory usage of the values Parameters ---------- diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index fa07400a0706e..65aefd9fb8c0a 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -8,10 +8,10 @@ import warnings import pandas as pd -from pandas.core.base import PandasObject +from pandas.core.base import PandasObject, IndexOpsMixin from pandas import compat -from pandas.compat import range +from pandas.compat import range, PYPY from pandas.compat.numpy import function as nv from pandas.core.dtypes.generic import ABCSparseSeries @@ -30,6 +30,7 @@ from pandas.core.dtypes.missing import isna, notna, na_value_for_dtype import pandas._libs.sparse as splib +import pandas._libs.lib as lib from pandas._libs.sparse import SparseIndex, BlockIndex, IntIndex from pandas._libs import index as libindex import pandas.core.algorithms as algos @@ -238,6 +239,17 @@ def kind(self): elif isinstance(self.sp_index, IntIndex): return 'integer' + @Appender(IndexOpsMixin.memory_usage.__doc__) + def memory_usage(self, deep=False): + values = self.sp_values + + v = values.nbytes + + if deep and is_object_dtype(self) and not PYPY: + v += lib.memory_usage_of_objects(values) + + return v + def __array_wrap__(self, out_arr, context=None): """ NumPy calls this method when ufunc is applied diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index 2ea1e63433520..3f5d5a59cc540 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -23,6 +23,8 @@ from pandas.core.sparse.api import SparseSeries from pandas.tests.series.test_api import SharedWithSparse +from itertools import product + def _test_data1(): # nan-based @@ -971,6 +973,17 @@ def test_combine_first(self): tm.assert_sp_series_equal(result, result2) tm.assert_sp_series_equal(result, expected) + @pytest.mark.parametrize('deep,fill_values', [([True, False], + [0, 1, np.nan, None])]) + def test_memory_usage_deep(self, deep, fill_values): + for deep, 
fill_value in product(deep, fill_values): + sparse_series = SparseSeries(fill_values, fill_value=fill_value) + dense_series = Series(fill_values) + sparse_usage = sparse_series.memory_usage(deep=deep) + dense_usage = dense_series.memory_usage(deep=deep) + + assert sparse_usage < dense_usage + class TestSparseHandlingMultiIndexes(object): From ed10bf618b93726c61ed9b3ebbc3031416bc1263 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Mon, 5 Feb 2018 20:29:15 -0500 Subject: [PATCH 14/20] TST: fix and test index division by zero Related: #19336 Author: Brock Mendel Closes #19347 from jbrockmendel/div_zero2 and squashes the following commits: be1e2e1b8 [Brock Mendel] move fixture to conftest 64b0c0853 [Brock Mendel] Merge branch 'master' of https://github.com/pandas-dev/pandas into div_zero2 aa969f8d2 [Brock Mendel] Merge branch 'master' of https://github.com/pandas-dev/pandas into div_zero2 000aefde0 [Brock Mendel] fix long again 9de356ab0 [Brock Mendel] revert fixture to fix test_range failures b8cf21d3e [Brock Mendel] flake8 remove unused import afedba98b [Brock Mendel] whatsnew clarification b51c2e14c [Brock Mendel] fixturize 37efd5108 [Brock Mendel] make zero a fixture 965f7214e [Brock Mendel] Merge branch 'master' of https://github.com/pandas-dev/pandas into div_zero2 d648ef698 [Brock Mendel] requested edits 1ef3a6c74 [Brock Mendel] Merge branch 'master' of https://github.com/pandas-dev/pandas into div_zero2 78de1a4df [Brock Mendel] Merge branch 'master' of https://github.com/pandas-dev/pandas into div_zero2 0277d9fca [Brock Mendel] add ipython output to whatsnew 5d7e3ea0c [Brock Mendel] Merge branch 'master' of https://github.com/pandas-dev/pandas into div_zero2 ea75c3ca0 [Brock Mendel] ipython block 6fc61bd99 [Brock Mendel] elaborate docstring ca3bf4241 [Brock Mendel] Whatsnew section cd543497c [Brock Mendel] move dispatch_missing to core.missing 06df02a89 [Brock Mendel] py3 fix 84c74c54a [Brock Mendel] remove operator.div for py3 6acc2f78a [Brock Mendel] fix missing import e0e89b978 [Brock Mendel] fix and and tests for divmod 969f342e1 [Brock Mendel] fix and test index division by zero --- doc/source/whatsnew/v0.23.0.txt | 44 +++++++++++++++ pandas/core/indexes/base.py | 2 + pandas/core/indexes/range.py | 31 +++++------ pandas/core/missing.py | 82 ++++++++++++++++++++++++++++ pandas/tests/indexes/conftest.py | 18 +++++- pandas/tests/indexes/test_numeric.py | 42 ++++++++++++++ 6 files changed, 200 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index e4f00990d28c0..ea56ebad7d782 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -204,6 +204,50 @@ Please note that the string `index` is not supported with the round trip format, new_df print(new_df.index.name) +.. _whatsnew_0230.enhancements.index_division_by_zero: + +Index Division By Zero Fills Correctly +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Division operations on ``Index`` and subclasses will now fill division of positive numbers by zero with ``np.inf``, division of negative numbers by zero with ``-np.inf`` and `0 / 0` with ``np.nan``. This matches existing ``Series`` behavior. (:issue:`19322`, :issue:`19347`) + +Previous Behavior: + +.. 
code-block:: ipython + + In [6]: index = pd.Int64Index([-1, 0, 1]) + + In [7]: index / 0 + Out[7]: Int64Index([0, 0, 0], dtype='int64') + + # Previous behavior yielded different results depending on the type of zero in the divisor + In [8]: index / 0.0 + Out[8]: Float64Index([-inf, nan, inf], dtype='float64') + + In [9]: index = pd.UInt64Index([0, 1]) + + In [10]: index / np.array([0, 0], dtype=np.uint64) + Out[10]: UInt64Index([0, 0], dtype='uint64') + + In [11]: pd.RangeIndex(1, 5) / 0 + ZeroDivisionError: integer division or modulo by zero + +Current Behavior: + +.. ipython:: python + + index = pd.Int64Index([-1, 0, 1]) + # division by zero gives -infinity where negative, +infinity where positive, and NaN for 0 / 0 + index / 0 + + # The result of division by zero should not depend on whether the zero is int or float + index / 0.0 + + index = pd.UInt64Index([0, 1]) + index / np.array([0, 0], dtype=np.uint64) + + pd.RangeIndex(1, 5) / 0 + .. _whatsnew_0230.enhancements.other: Other Enhancements diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 626f3dc86556a..1e1bb0d49b3df 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4040,6 +4040,8 @@ def _evaluate_numeric_binop(self, other): attrs = self._maybe_update_attributes(attrs) with np.errstate(all='ignore'): result = op(values, other) + + result = missing.dispatch_missing(op, values, other, result) return constructor(result, **attrs) return _evaluate_numeric_binop diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index a82ee6b2b44af..0ed92a67c7e14 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -550,7 +550,7 @@ def __getitem__(self, key): return super_getitem(key) def __floordiv__(self, other): - if is_integer(other): + if is_integer(other) and other != 0: if (len(self) == 0 or self._start % other == 0 and self._step % other == 0): @@ -592,14 +592,15 @@ def _evaluate_numeric_binop(self, other): attrs = self._get_attributes_dict() attrs = self._maybe_update_attributes(attrs) + left, right = self, other if reversed: - self, other = other, self + left, right = right, left try: # apply if we have an override if step: with np.errstate(all='ignore'): - rstep = step(self._step, other) + rstep = step(left._step, right) # we don't have a representable op # so return a base index @@ -607,11 +608,11 @@ def _evaluate_numeric_binop(self, other): raise ValueError else: - rstep = self._step + rstep = left._step with np.errstate(all='ignore'): - rstart = op(self._start, other) - rstop = op(self._stop, other) + rstart = op(left._start, right) + rstop = op(left._stop, right) result = RangeIndex(rstart, rstop, @@ -627,18 +628,12 @@ def _evaluate_numeric_binop(self, other): return result - except (ValueError, TypeError, AttributeError): - pass - - # convert to Int64Index ops - if isinstance(self, RangeIndex): - self = self.values - if isinstance(other, RangeIndex): - other = other.values - - with np.errstate(all='ignore'): - results = op(self, other) - return Index(results, **attrs) + except (ValueError, TypeError, AttributeError, + ZeroDivisionError): + # Defer to Int64Index implementation + if reversed: + return op(other, self._int64index) + return op(self._int64index, other) return _evaluate_numeric_binop diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 2eccc5777bca6..31c489e2f8941 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -1,6 +1,7 @@ """ Routines for filling missing data """ +import operator import numpy 
as np
from distutils.version import LooseVersion
@@ -650,6 +651,87 @@ def fill_zeros(result, x, y, name, fill):
     return result
 
 
+def mask_zero_div_zero(x, y, result, copy=False):
+    """
+    Set results of 0 / 0 or 0 // 0 to np.nan, regardless of the dtypes
+    of the numerator or the denominator.
+
+    Parameters
+    ----------
+    x : ndarray
+    y : ndarray
+    result : ndarray
+    copy : bool (default False)
+        Whether to always create a new array or try to fill in the existing
+        array if possible.
+
+    Returns
+    -------
+    filled_result : ndarray
+
+    Examples
+    --------
+    >>> x = np.array([1, 0, -1], dtype=np.int64)
+    >>> y = 0  # int 0; numpy behavior is different with float
+    >>> result = x / y
+    >>> result  # raw numpy result does not fill division by zero
+    array([0, 0, 0])
+    >>> mask_zero_div_zero(x, y, result)
+    array([ inf,  nan, -inf])
+    """
+    if is_scalar(y):
+        y = np.array(y)
+
+    zmask = y == 0
+    if zmask.any():
+        shape = result.shape
+
+        nan_mask = (zmask & (x == 0)).ravel()
+        neginf_mask = (zmask & (x < 0)).ravel()
+        posinf_mask = (zmask & (x > 0)).ravel()
+
+        if nan_mask.any() or neginf_mask.any() or posinf_mask.any():
+            # Fill negative/0 with -inf, positive/0 with +inf, 0/0 with NaN
+            result = result.astype('float64', copy=copy).ravel()
+
+            np.putmask(result, nan_mask, np.nan)
+            np.putmask(result, posinf_mask, np.inf)
+            np.putmask(result, neginf_mask, -np.inf)
+
+            result = result.reshape(shape)
+
+    return result
+
+
+def dispatch_missing(op, left, right, result):
+    """
+    Fill nulls caused by division by zero, casting to a different dtype
+    if necessary.
+
+    Parameters
+    ----------
+    op : function (operator.add, operator.div, ...)
+    left : object (Index for non-reversed ops)
+    right : object (Index for reversed ops)
+    result : ndarray
+
+    Returns
+    -------
+    result : ndarray
+    """
+    opstr = '__{opname}__'.format(opname=op.__name__).replace('____', '__')
+    if op in [operator.truediv, operator.floordiv,
+              getattr(operator, 'div', None)]:
+        result = mask_zero_div_zero(left, right, result)
+    elif op is operator.mod:
+        result = fill_zeros(result, left, right, opstr, np.nan)
+    elif op is divmod:
+        res0 = mask_zero_div_zero(left, right, result[0])
+        res1 = fill_zeros(result[1], left, right, opstr, np.nan)
+        result = (res0, res1)
+    return result
+
+
 def _interp_limit(invalid, fw_limit, bw_limit):
     """
     Get indexers of values that won't be filled
diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py
index 217ee07affa84..6d88ef0cfa6c5 100644
--- a/pandas/tests/indexes/conftest.py
+++ b/pandas/tests/indexes/conftest.py
@@ -1,9 +1,10 @@
 import pytest
 import numpy as np
 
+import pandas as pd
 import pandas.util.testing as tm
 from pandas.core.indexes.api import Index, MultiIndex
-from pandas.compat import lzip
+from pandas.compat import lzip, long
 
 
 @pytest.fixture(params=[tm.makeUnicodeIndex(100),
@@ -29,3 +30,18 @@ def indices(request):
 def one(request):
     # zero-dim integer array behaves like an integer
     return request.param
+
+
+zeros = [box([0] * 5, dtype=dtype)
+         for box in [pd.Index, np.array]
+         for dtype in [np.int64, np.uint64, np.float64]]
+zeros.extend([np.array(0, dtype=dtype)
+              for dtype in [np.int64, np.uint64, np.float64]])
+zeros.extend([0, 0.0, long(0)])
+
+
+@pytest.fixture(params=zeros)
+def zero(request):
+    # For testing division by (or of) zero for Index with length 5, this
+    # gives several scalar-zeros and length-5 vector-zeros
+    return request.param
diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py
index 
0c1bec7a6f1a9..c6883df7ee91a 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -157,6 +157,48 @@ def test_divmod_series(self): for r, e in zip(result, expected): tm.assert_series_equal(r, e) + def test_div_zero(self, zero): + idx = self.create_index() + + expected = Index([np.nan, np.inf, np.inf, np.inf, np.inf], + dtype=np.float64) + result = idx / zero + tm.assert_index_equal(result, expected) + ser_compat = Series(idx).astype('i8') / np.array(zero).astype('i8') + tm.assert_series_equal(ser_compat, Series(result)) + + def test_floordiv_zero(self, zero): + idx = self.create_index() + expected = Index([np.nan, np.inf, np.inf, np.inf, np.inf], + dtype=np.float64) + + result = idx // zero + tm.assert_index_equal(result, expected) + ser_compat = Series(idx).astype('i8') // np.array(zero).astype('i8') + tm.assert_series_equal(ser_compat, Series(result)) + + def test_mod_zero(self, zero): + idx = self.create_index() + + expected = Index([np.nan, np.nan, np.nan, np.nan, np.nan], + dtype=np.float64) + result = idx % zero + tm.assert_index_equal(result, expected) + ser_compat = Series(idx).astype('i8') % np.array(zero).astype('i8') + tm.assert_series_equal(ser_compat, Series(result)) + + def test_divmod_zero(self, zero): + idx = self.create_index() + + exleft = Index([np.nan, np.inf, np.inf, np.inf, np.inf], + dtype=np.float64) + exright = Index([np.nan, np.nan, np.nan, np.nan, np.nan], + dtype=np.float64) + + result = divmod(idx, zero) + tm.assert_index_equal(result[0], exleft) + tm.assert_index_equal(result[1], exright) + def test_explicit_conversions(self): # GH 8608 From 672f5a151886a8bf457ac499d9a9a471689ee9ff Mon Sep 17 00:00:00 2001 From: Sam Foo Date: Tue, 6 Feb 2018 05:15:50 -0500 Subject: [PATCH 15/20] DOC: Remove repeated duplicated word (#19546) --- doc/source/advanced.rst | 2 +- doc/source/comparison_with_sas.rst | 4 ++-- doc/source/computation.rst | 2 +- doc/source/io.rst | 2 +- doc/source/release.rst | 10 +++++----- doc/source/tutorials.rst | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 25f7c5a3ad948..ca903dadc6eb1 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -672,7 +672,7 @@ The ``CategoricalIndex`` is **preserved** after indexing: df2.loc['a'].index Sorting the index will sort by the order of the categories (Recall that we -created the index with with ``CategoricalDtype(list('cab'))``, so the sorted +created the index with ``CategoricalDtype(list('cab'))``, so the sorted order is ``cab``.). .. ipython:: python diff --git a/doc/source/comparison_with_sas.rst b/doc/source/comparison_with_sas.rst index e9e0d7716af3a..214667119f7e0 100644 --- a/doc/source/comparison_with_sas.rst +++ b/doc/source/comparison_with_sas.rst @@ -279,7 +279,7 @@ date/datetime columns. The equivalent pandas operations are shown below. In addition to these functions pandas supports other Time Series features -not available in Base SAS (such as resampling and and custom offsets) - +not available in Base SAS (such as resampling and custom offsets) - see the :ref:`timeseries documentation` for more details. .. ipython:: python @@ -584,7 +584,7 @@ For example, in SAS you could do this to filter missing values. if value_x ^= .; run; -Which doesn't work in in pandas. Instead, the ``pd.isna`` or ``pd.notna`` functions +Which doesn't work in pandas. Instead, the ``pd.isna`` or ``pd.notna`` functions should be used for comparisons. .. 
ipython:: python diff --git a/doc/source/computation.rst b/doc/source/computation.rst index a64542fa71705..4285767654e25 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -512,7 +512,7 @@ a same sized result as the input. When using ``.resample()`` with an offset. Construct a new index that is the frequency of the offset. For each frequency bin, aggregate points from the input within a backwards-in-time looking window that fall in that bin. The result of this -aggregation is the output for that frequency point. The windows are fixed size size in the frequency space. Your result +aggregation is the output for that frequency point. The windows are fixed size in the frequency space. Your result will have the shape of a regular frequency between the min and the max of the original input object. To summarize, ``.rolling()`` is a time-based window operation, while ``.resample()`` is a frequency-based window operation. diff --git a/doc/source/io.rst b/doc/source/io.rst index 60dc89f8fd495..1785de54b7dd6 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4529,7 +4529,7 @@ Several caveats. on an attempt at serialization. You can specify an ``engine`` to direct the serialization. This can be one of ``pyarrow``, or ``fastparquet``, or ``auto``. -If the engine is NOT specified, then the ``pd.options.io.parquet.engine`` option is checked; if this is also ``auto``, then +If the engine is NOT specified, then the ``pd.options.io.parquet.engine`` option is checked; if this is also ``auto``, then ``pyarrow`` is tried, and falling back to ``fastparquet``. See the documentation for `pyarrow `__ and `fastparquet `__ diff --git a/doc/source/release.rst b/doc/source/release.rst index cd763de42d162..8e063116cbf07 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -406,7 +406,7 @@ of all enhancements and bugs that have been fixed in 0.20.1. .. note:: - This is a combined release for 0.20.0 and and 0.20.1. + This is a combined release for 0.20.0 and 0.20.1. Version 0.20.1 contains one additional change for backwards-compatibility with downstream projects using pandas' ``utils`` routines. (:issue:`16250`) Thanks @@ -2918,7 +2918,7 @@ Improvements to existing features - clipboard functions use pyperclip (no dependencies on Windows, alternative dependencies offered for Linux) (:issue:`3837`). - Plotting functions now raise a ``TypeError`` before trying to plot anything - if the associated objects have have a dtype of ``object`` (:issue:`1818`, + if the associated objects have a dtype of ``object`` (:issue:`1818`, :issue:`3572`, :issue:`3911`, :issue:`3912`), but they will try to convert object arrays to numeric arrays if possible so that you can still plot, for example, an object array with floats. This happens before any drawing takes place which @@ -4082,7 +4082,7 @@ Bug Fixes columns (:issue:`1943`) - Fix time zone localization bug causing improper fields (e.g. 
hours) in time zones that have not had a UTC transition in a long time (:issue:`1946`) -- Fix errors when parsing and working with with fixed offset timezones +- Fix errors when parsing and working with fixed offset timezones (:issue:`1922`, :issue:`1928`) - Fix text parser bug when handling UTC datetime objects generated by dateutil (:issue:`1693`) @@ -4383,7 +4383,7 @@ Bug Fixes error (:issue:`1090`) - Consistently set name on groupby pieces (:issue:`184`) - Treat dict return values as Series in GroupBy.apply (:issue:`823`) -- Respect column selection for DataFrame in in GroupBy.transform (:issue:`1365`) +- Respect column selection for DataFrame in GroupBy.transform (:issue:`1365`) - Fix MultiIndex partial indexing bug (:issue:`1352`) - Enable assignment of rows in mixed-type DataFrame via .ix (:issue:`1432`) - Reset index mapping when grouping Series in Cython (:issue:`1423`) @@ -5040,7 +5040,7 @@ New Features - Add `melt` function to `pandas.core.reshape` - Add `level` parameter to group by level in Series and DataFrame descriptive statistics (:issue:`313`) -- Add `head` and `tail` methods to Series, analogous to to DataFrame (PR +- Add `head` and `tail` methods to Series, analogous to DataFrame (PR :issue:`296`) - Add `Series.isin` function which checks if each value is contained in a passed sequence (:issue:`289`) diff --git a/doc/source/tutorials.rst b/doc/source/tutorials.rst index 43ccd372d9d5b..710212bc237cd 100644 --- a/doc/source/tutorials.rst +++ b/doc/source/tutorials.rst @@ -19,7 +19,7 @@ pandas Cookbook The goal of this cookbook (by `Julia Evans `_) is to give you some concrete examples for getting started with pandas. These are examples with real-world data, and all the bugs and weirdness that -that entails. +entails. Here are links to the v0.1 release. For an up-to-date table of contents, see the `pandas-cookbook GitHub repository `_. 
To run the examples in this tutorial, you'll need to From a22acc2961bc6719f11a2900e004982e55007401 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 6 Feb 2018 03:20:35 -0800 Subject: [PATCH 16/20] centralize and split frame division tests (#19527) --- pandas/tests/frame/test_arithmetic.py | 122 +++++++++++++++++++++++++- pandas/tests/frame/test_operators.py | 70 --------------- pandas/tests/frame/test_timeseries.py | 9 -- 3 files changed, 121 insertions(+), 80 deletions(-) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 9b99a7b73b82b..1bb8e8edffc6e 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1,8 +1,9 @@ # -*- coding: utf-8 -*- - import pytest import numpy as np +from pandas.compat import range + import pandas as pd import pandas.util.testing as tm @@ -58,10 +59,129 @@ def test_df_flex_cmp_constant_return_types_empty(self, opname): result = getattr(empty, opname)(const).get_dtype_counts() tm.assert_series_equal(result, pd.Series([2], ['bool'])) + @pytest.mark.parametrize('timestamps', [ + [pd.Timestamp('2012-01-01 13:00:00+00:00')] * 2, + [pd.Timestamp('2012-01-01 13:00:00')] * 2]) + def test_tz_aware_scalar_comparison(self, timestamps): + # Test for issue #15966 + df = pd.DataFrame({'test': timestamps}) + expected = pd.DataFrame({'test': [False, False]}) + tm.assert_frame_equal(df == -1, expected) + # ------------------------------------------------------------------- # Arithmetic +class TestFrameMulDiv(object): + """Tests for DataFrame multiplication and division""" + # ------------------------------------------------------------------ + # Mod By Zero + + def test_df_mod_zero_df(self): + # GH#3590, modulo as ints + df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + + # this is technically wrong, as the integer portion is coerced to float + # ### + first = pd.Series([0, 0, 0, 0], dtype='float64') + second = pd.Series([np.nan, np.nan, np.nan, 0]) + expected = pd.DataFrame({'first': first, 'second': second}) + result = df % df + tm.assert_frame_equal(result, expected) + + def test_df_mod_zero_array(self): + # GH#3590, modulo as ints + df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + + # this is technically wrong, as the integer portion is coerced to float + # ### + first = pd.Series([0, 0, 0, 0], dtype='float64') + second = pd.Series([np.nan, np.nan, np.nan, 0]) + expected = pd.DataFrame({'first': first, 'second': second}) + + # numpy has a slightly different (wrong) treatment + with np.errstate(all='ignore'): + arr = df.values % df.values + result2 = pd.DataFrame(arr, index=df.index, + columns=df.columns, dtype='float64') + result2.iloc[0:3, 1] = np.nan + tm.assert_frame_equal(result2, expected) + + def test_df_mod_zero_int(self): + # GH#3590, modulo as ints + df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + + result = df % 0 + expected = pd.DataFrame(np.nan, index=df.index, columns=df.columns) + tm.assert_frame_equal(result, expected) + + # numpy has a slightly different (wrong) treatment + with np.errstate(all='ignore'): + arr = df.values.astype('float64') % 0 + result2 = pd.DataFrame(arr, index=df.index, columns=df.columns) + tm.assert_frame_equal(result2, expected) + + def test_df_mod_zero_series_does_not_commute(self): + # GH#3590, modulo as ints + # not commutative with series + df = pd.DataFrame(np.random.randn(10, 5)) + ser = df[0] + res = ser % df + res2 = df % ser + assert not res.fillna(0).equals(res2.fillna(0)) + 
+ # ------------------------------------------------------------------ + # Division By Zero + + def test_df_div_zero_df(self): + # integer div, but deal with the 0's (GH#9144) + df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + result = df / df + + first = pd.Series([1.0, 1.0, 1.0, 1.0]) + second = pd.Series([np.nan, np.nan, np.nan, 1]) + expected = pd.DataFrame({'first': first, 'second': second}) + tm.assert_frame_equal(result, expected) + + def test_df_div_zero_array(self): + # integer div, but deal with the 0's (GH#9144) + df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + + first = pd.Series([1.0, 1.0, 1.0, 1.0]) + second = pd.Series([np.nan, np.nan, np.nan, 1]) + expected = pd.DataFrame({'first': first, 'second': second}) + + with np.errstate(all='ignore'): + arr = df.values.astype('float') / df.values + result = pd.DataFrame(arr, index=df.index, + columns=df.columns) + tm.assert_frame_equal(result, expected) + + def test_df_div_zero_int(self): + # integer div, but deal with the 0's (GH#9144) + df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + + result = df / 0 + expected = pd.DataFrame(np.inf, index=df.index, columns=df.columns) + expected.iloc[0:3, 1] = np.nan + tm.assert_frame_equal(result, expected) + + # numpy has a slightly different (wrong) treatment + with np.errstate(all='ignore'): + arr = df.values.astype('float64') / 0 + result2 = pd.DataFrame(arr, index=df.index, + columns=df.columns) + tm.assert_frame_equal(result2, expected) + + def test_df_div_zero_series_does_not_commute(self): + # integer div, but deal with the 0's (GH#9144) + df = pd.DataFrame(np.random.randn(10, 5)) + ser = df[0] + res = ser / df + res2 = df / ser + assert not res.fillna(0).equals(res2.fillna(0)) + + class TestFrameArithmetic(object): @pytest.mark.xfail(reason='GH#7996 datetime64 units not converted to nano') diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index bdccbec6111d3..bf895be8bc813 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -203,76 +203,6 @@ def test_timestamp_compare(self): result = right_f(Timestamp('nat'), df) assert_frame_equal(result, expected) - def test_modulo(self): - # GH3590, modulo as ints - p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) - - # this is technically wrong as the integer portion is coerced to float - # ### - expected = DataFrame({'first': Series([0, 0, 0, 0], dtype='float64'), - 'second': Series([np.nan, np.nan, np.nan, 0])}) - result = p % p - assert_frame_equal(result, expected) - - # numpy has a slightly different (wrong) treatement - with np.errstate(all='ignore'): - arr = p.values % p.values - result2 = DataFrame(arr, index=p.index, - columns=p.columns, dtype='float64') - result2.iloc[0:3, 1] = np.nan - assert_frame_equal(result2, expected) - - result = p % 0 - expected = DataFrame(np.nan, index=p.index, columns=p.columns) - assert_frame_equal(result, expected) - - # numpy has a slightly different (wrong) treatement - with np.errstate(all='ignore'): - arr = p.values.astype('float64') % 0 - result2 = DataFrame(arr, index=p.index, columns=p.columns) - assert_frame_equal(result2, expected) - - # not commutative with series - p = DataFrame(np.random.randn(10, 5)) - s = p[0] - res = s % p - res2 = p % s - assert not res.fillna(0).equals(res2.fillna(0)) - - def test_div(self): - - # integer div, but deal with the 0's (GH 9144) - p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) - result = p / 
p - - expected = DataFrame({'first': Series([1.0, 1.0, 1.0, 1.0]), - 'second': Series([nan, nan, nan, 1])}) - assert_frame_equal(result, expected) - - with np.errstate(all='ignore'): - arr = p.values.astype('float') / p.values - result2 = DataFrame(arr, index=p.index, - columns=p.columns) - assert_frame_equal(result2, expected) - - result = p / 0 - expected = DataFrame(np.inf, index=p.index, columns=p.columns) - expected.iloc[0:3, 1] = nan - assert_frame_equal(result, expected) - - # numpy has a slightly different (wrong) treatement - with np.errstate(all='ignore'): - arr = p.values.astype('float64') / 0 - result2 = DataFrame(arr, index=p.index, - columns=p.columns) - assert_frame_equal(result2, expected) - - p = DataFrame(np.random.randn(10, 5)) - s = p[0] - res = s / p - res2 = p / s - assert not res.fillna(0).equals(res2.fillna(0)) - def test_logical_operators(self): def _check_bin_op(op): diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index e6b47fd69cb05..25dd285e883a0 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -738,12 +738,3 @@ def test_tz_convert_and_localize(self, fn): with assert_raises_regex(ValueError, 'not valid'): df = DataFrame(index=l0) df = getattr(df, fn)('US/Pacific', level=1) - - @pytest.mark.parametrize('timestamps', [ - [Timestamp('2012-01-01 13:00:00+00:00')] * 2, - [Timestamp('2012-01-01 13:00:00')] * 2]) - def test_tz_aware_scalar_comparison(self, timestamps): - # Test for issue #15966 - df = DataFrame({'test': timestamps}) - expected = DataFrame({'test': [False, False]}) - assert_frame_equal(df == -1, expected) From 84522a0f5e033ab631d83808d02cbb07ec8dfec3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 6 Feb 2018 03:27:16 -0800 Subject: [PATCH 17/20] Fix parsing corner case closes #19382 (#19529) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/_libs/tslib.pyx | 30 ++++++++++++++++---- pandas/_libs/tslibs/conversion.pyx | 8 ++++++ pandas/tests/indexes/datetimes/test_tools.py | 16 ++++++++++- pandas/tests/scalar/test_timestamp.py | 8 ++++++ 5 files changed, 56 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index ea56ebad7d782..ca625f492b61f 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -550,6 +550,7 @@ Datetimelike - Bug in :func:`Series.truncate` which raises ``TypeError`` with a monotonic ``PeriodIndex`` (:issue:`17717`) - Bug in :func:`~DataFrame.pct_change` using ``periods`` and ``freq`` returned different length outputs (:issue:`7292`) - Bug in comparison of :class:`DatetimeIndex` against ``None`` or ``datetime.date`` objects raising ``TypeError`` for ``==`` and ``!=`` comparisons instead of all-``False`` and all-``True``, respectively (:issue:`19301`) +- Bug in :class:`Timestamp` and :func:`to_datetime` where a string representing a barely out-of-bounds timestamp would be incorrectly rounded down instead of raising ``OutOfBoundsDatetime`` (:issue:`19382`) - Timezones diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 81df7981096ba..877d7deff6ff4 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -609,20 +609,38 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', value = tz_convert_single(value, tz, 'UTC') iresult[i] = value check_dts_bounds(&dts) + except OutOfBoundsDatetime: + # GH#19382 for just-barely-OutOfBounds falling back to + # dateutil parser will return incorrect result because + # it will ignore 
nanoseconds + if require_iso8601: + if _parse_today_now(val, &iresult[i]): + continue + elif is_coerce: + iresult[i] = NPY_NAT + continue + elif is_raise: + raise ValueError("time data {val} doesn't match " + "format specified" + .format(val=val)) + return values + elif is_coerce: + iresult[i] = NPY_NAT + continue + raise except ValueError: # if requiring iso8601 strings, skip trying other formats if require_iso8601: if _parse_today_now(val, &iresult[i]): continue - if is_coerce: + elif is_coerce: iresult[i] = NPY_NAT continue elif is_raise: - raise ValueError( - "time data %r doesn't match format " - "specified" % (val,)) - else: - return values + raise ValueError("time data {val} doesn't match " + "format specified" + .format(val=val)) + return values try: py_dt = parse_datetime_string(val, dayfirst=dayfirst, diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index a32bfc1f6836c..4f1a053da6f1d 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -26,6 +26,7 @@ from np_datetime cimport (check_dts_bounds, dt64_to_dtstruct, dtstruct_to_dt64, get_datetime64_unit, get_datetime64_value, pydatetime_to_dt64) +from np_datetime import OutOfBoundsDatetime from util cimport (is_string_object, is_datetime64_object, @@ -472,6 +473,13 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, ts = tz_localize_to_utc(np.array([ts], dtype='i8'), tz, ambiguous='raise', errors='raise')[0] + + except OutOfBoundsDatetime: + # GH#19382 for just-barely-OutOfBounds falling back to dateutil + # parser will return incorrect result because it will ignore + # nanoseconds + raise + except ValueError: try: ts = parse_datetime_string(ts, dayfirst=dayfirst, diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 44f3c21d23e62..f8b1f68ba33ce 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -17,6 +17,7 @@ from pandas._libs.tslibs import parsing from pandas.core.tools import datetimes as tools +from pandas.errors import OutOfBoundsDatetime from pandas.compat import lmap from pandas.compat.numpy import np_array_datetime64_compat from pandas.core.dtypes.common import is_datetime64_ns_dtype @@ -783,7 +784,6 @@ def test_dataframe_dtypes(self, cache): class TestToDatetimeMisc(object): - @pytest.mark.parametrize('cache', [True, False]) def test_to_datetime_iso8601(self, cache): result = to_datetime(["2012-01-01 00:00:00"], cache=cache) @@ -1596,6 +1596,20 @@ def test_coerce_of_invalid_datetimes(self): ) ) + def test_to_datetime_barely_out_of_bounds(self): + # GH#19529 + # GH#19382 close enough to bounds that dropping nanos would result + # in an in-bounds datetime + arr = np.array(['2262-04-11 23:47:16.854775808'], dtype=object) + + with pytest.raises(OutOfBoundsDatetime): + to_datetime(arr) + + with pytest.raises(OutOfBoundsDatetime): + # Essentially the same as above, but more directly calling + # the relevant function + tslib.array_to_datetime(arr) + def test_normalize_date(): value = date(2012, 9, 7) diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py index 301f6da140866..7695c94409232 100644 --- a/pandas/tests/scalar/test_timestamp.py +++ b/pandas/tests/scalar/test_timestamp.py @@ -18,6 +18,7 @@ from pandas._libs.tslibs import conversion from pandas._libs.tslibs.timezones import get_timezone, dateutil_gettz as gettz +from pandas.errors import OutOfBoundsDatetime from 
pandas.compat import long, PY3
 from pandas.compat.numpy import np_datetime64_compat
 from pandas import Timestamp, Period, Timedelta
@@ -410,6 +411,13 @@ def test_out_of_bounds_string(self):
         with pytest.raises(ValueError):
             Timestamp('2263-01-01')
 
+    def test_barely_out_of_bounds(self):
+        # GH#19529
+        # GH#19382 close enough to bounds that dropping nanos would result
+        # in an in-bounds datetime
+        with pytest.raises(OutOfBoundsDatetime):
+            Timestamp('2262-04-11 23:47:16.854775808')
+
     def test_bounds_with_different_units(self):
         out_of_bounds_dates = ('1677-09-21', '2262-04-12')
 

From 54f1b3eca094e0b98d6d2b93854f9c937394109d Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 6 Feb 2018 03:34:32 -0800
Subject: [PATCH 18/20] Collect Series timezone tests (#19541)

---
 pandas/tests/series/test_timezones.py  | 293 +++++++++++++++++++++++++
 pandas/tests/tseries/test_timezones.py | 258 +---------------------
 2 files changed, 296 insertions(+), 255 deletions(-)
 create mode 100644 pandas/tests/series/test_timezones.py

diff --git a/pandas/tests/series/test_timezones.py b/pandas/tests/series/test_timezones.py
new file mode 100644
index 0000000000000..2e15c964e4e93
--- /dev/null
+++ b/pandas/tests/series/test_timezones.py
@@ -0,0 +1,293 @@
+# -*- coding: utf-8 -*-
+"""
+Tests for Series timezone-related methods
+"""
+from datetime import datetime
+
+import pytest
+import pytz
+import numpy as np
+from dateutil.tz import tzoffset
+
+import pandas.util.testing as tm
+from pandas._libs import tslib
+from pandas._libs.tslibs import timezones
+from pandas.compat import lrange
+from pandas.core.indexes.datetimes import date_range
+from pandas import Series, Timestamp, DatetimeIndex, Index
+
+
+class TestSeriesTimezones(object):
+    # -----------------------------------------------------------------
+    # Series.tz_localize
+    def test_series_tz_localize(self):
+
+        rng = date_range('1/1/2011', periods=100, freq='H')
+        ts = Series(1, index=rng)
+
+        result = ts.tz_localize('utc')
+        assert result.index.tz.zone == 'UTC'
+
+        # Can't localize if already tz-aware
+        rng = date_range('1/1/2011', periods=100, freq='H', tz='utc')
+        ts = Series(1, index=rng)
+        tm.assert_raises_regex(TypeError, 'Already tz-aware',
+                               ts.tz_localize, 'US/Eastern')
+
+    def test_series_tz_localize_ambiguous_bool(self):
+        # make sure that we are correctly accepting bool values as ambiguous
+
+        # GH#14402
+        ts = Timestamp('2015-11-01 01:00:03')
+        expected0 = Timestamp('2015-11-01 01:00:03-0500', tz='US/Central')
+        expected1 = Timestamp('2015-11-01 01:00:03-0600', tz='US/Central')
+
+        ser = Series([ts])
+        expected0 = Series([expected0])
+        expected1 = Series([expected1])
+
+        with pytest.raises(pytz.AmbiguousTimeError):
+            ser.dt.tz_localize('US/Central')
+
+        result = ser.dt.tz_localize('US/Central', ambiguous=True)
+        tm.assert_series_equal(result, expected0)
+
+        result = ser.dt.tz_localize('US/Central', ambiguous=[True])
+        tm.assert_series_equal(result, expected0)
+
+        result = ser.dt.tz_localize('US/Central', ambiguous=False)
+        tm.assert_series_equal(result, expected1)
+
+        result = ser.dt.tz_localize('US/Central', ambiguous=[False])
+        tm.assert_series_equal(result, expected1)
+
+    @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern'])
+    def test_series_tz_localize_empty(self, tzstr):
+        # GH#2248
+        ser = Series()
+
+        ser2 = ser.tz_localize('utc')
+        assert ser2.index.tz == pytz.utc
+
+        ser2 = ser.tz_localize(tzstr)
+        assert timezones.tz_compare(ser2.index.tz, timezones.maybe_get_tz(tzstr))
+
+    # 
----------------------------------------------------------------- + # Series.tz_convert + + def test_series_tz_convert(self): + rng = date_range('1/1/2011', periods=200, freq='D', tz='US/Eastern') + ts = Series(1, index=rng) + + result = ts.tz_convert('Europe/Berlin') + assert result.index.tz.zone == 'Europe/Berlin' + + # can't convert tz-naive + rng = date_range('1/1/2011', periods=200, freq='D') + ts = Series(1, index=rng) + tm.assert_raises_regex(TypeError, "Cannot convert tz-naive", + ts.tz_convert, 'US/Eastern') + + # ----------------------------------------------------------------- + # Series.append + + def test_series_append_aware(self): + rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', + tz='US/Eastern') + rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', + tz='US/Eastern') + ser1 = Series([1], index=rng1) + ser2 = Series([2], index=rng2) + ts_result = ser1.append(ser2) + + exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], + tz='US/Eastern') + exp = Series([1, 2], index=exp_index) + tm.assert_series_equal(ts_result, exp) + assert ts_result.index.tz == rng1.tz + + rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='UTC') + rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='UTC') + ser1 = Series([1], index=rng1) + ser2 = Series([2], index=rng2) + ts_result = ser1.append(ser2) + + exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], + tz='UTC') + exp = Series([1, 2], index=exp_index) + tm.assert_series_equal(ts_result, exp) + utc = rng1.tz + assert utc == ts_result.index.tz + + # GH#7795 + # different tz coerces to object dtype, not UTC + rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', + tz='US/Eastern') + rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', + tz='US/Central') + ser1 = Series([1], index=rng1) + ser2 = Series([2], index=rng2) + ts_result = ser1.append(ser2) + exp_index = Index([Timestamp('1/1/2011 01:00', tz='US/Eastern'), + Timestamp('1/1/2011 02:00', tz='US/Central')]) + exp = Series([1, 2], index=exp_index) + tm.assert_series_equal(ts_result, exp) + + def test_series_append_aware_naive(self): + rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') + rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', + tz='US/Eastern') + ser1 = Series(np.random.randn(len(rng1)), index=rng1) + ser2 = Series(np.random.randn(len(rng2)), index=rng2) + ts_result = ser1.append(ser2) + + expected = ser1.index.astype(object).append(ser2.index.astype(object)) + assert ts_result.index.equals(expected) + + # mixed + rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') + rng2 = lrange(100) + ser1 = Series(np.random.randn(len(rng1)), index=rng1) + ser2 = Series(np.random.randn(len(rng2)), index=rng2) + ts_result = ser1.append(ser2) + + expected = ser1.index.astype(object).append(ser2.index) + assert ts_result.index.equals(expected) + + def test_series_append_dst(self): + rng1 = date_range('1/1/2016 01:00', periods=3, freq='H', + tz='US/Eastern') + rng2 = date_range('8/1/2016 01:00', periods=3, freq='H', + tz='US/Eastern') + ser1 = Series([1, 2, 3], index=rng1) + ser2 = Series([10, 11, 12], index=rng2) + ts_result = ser1.append(ser2) + + exp_index = DatetimeIndex(['2016-01-01 01:00', '2016-01-01 02:00', + '2016-01-01 03:00', '2016-08-01 01:00', + '2016-08-01 02:00', '2016-08-01 03:00'], + tz='US/Eastern') + exp = Series([1, 2, 3, 10, 11, 12], index=exp_index) + tm.assert_series_equal(ts_result, exp) + assert ts_result.index.tz == rng1.tz + + # 
----------------------------------------------------------------- + + def test_dateutil_tzoffset_support(self): + values = [188.5, 328.25] + tzinfo = tzoffset(None, 7200) + index = [datetime(2012, 5, 11, 11, tzinfo=tzinfo), + datetime(2012, 5, 11, 12, tzinfo=tzinfo)] + series = Series(data=values, index=index) + + assert series.index.tz == tzinfo + + # it works! #2443 + repr(series.index[0]) + + @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern']) + def test_tz_aware_asfreq(self, tz): + dr = date_range('2011-12-01', '2012-07-20', freq='D', tz=tz) + + ser = Series(np.random.randn(len(dr)), index=dr) + + # it works! + ser.asfreq('T') + + @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern']) + def test_string_index_alias_tz_aware(self, tz): + rng = date_range('1/1/2000', periods=10, tz=tz) + ser = Series(np.random.randn(len(rng)), index=rng) + + result = ser['1/3/2000'] + tm.assert_almost_equal(result, ser[2]) + + # TODO: De-duplicate with test below + def test_series_add_tz_mismatch_converts_to_utc_duplicate(self): + rng = date_range('1/1/2011', periods=10, freq='H', tz='US/Eastern') + ser = Series(np.random.randn(len(rng)), index=rng) + + ts_moscow = ser.tz_convert('Europe/Moscow') + + result = ser + ts_moscow + assert result.index.tz is pytz.utc + + result = ts_moscow + ser + assert result.index.tz is pytz.utc + + def test_series_add_tz_mismatch_converts_to_utc(self): + rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + + perm = np.random.permutation(100)[:90] + ser1 = Series(np.random.randn(90), + index=rng.take(perm).tz_convert('US/Eastern')) + + perm = np.random.permutation(100)[:90] + ser2 = Series(np.random.randn(90), + index=rng.take(perm).tz_convert('Europe/Berlin')) + + result = ser1 + ser2 + + uts1 = ser1.tz_convert('utc') + uts2 = ser2.tz_convert('utc') + expected = uts1 + uts2 + + assert result.index.tz == pytz.UTC + tm.assert_series_equal(result, expected) + + def test_series_add_aware_naive_raises(self): + rng = date_range('1/1/2011', periods=10, freq='H') + ser = Series(np.random.randn(len(rng)), index=rng) + + ser_utc = ser.tz_localize('utc') + + with pytest.raises(Exception): + ser + ser_utc + + with pytest.raises(Exception): + ser_utc + ser + + def test_series_align_aware(self): + idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern') + ser = Series(np.random.randn(len(idx1)), index=idx1) + ser_central = ser.tz_convert('US/Central') + # # different timezones convert to UTC + + new1, new2 = ser.align(ser_central) + assert new1.index.tz == pytz.UTC + assert new2.index.tz == pytz.UTC + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_localized_at_time_between_time(self, tzstr): + from datetime import time + tz = timezones.maybe_get_tz(tzstr) + + rng = date_range('4/16/2012', '5/1/2012', freq='H') + ts = Series(np.random.randn(len(rng)), index=rng) + + ts_local = ts.tz_localize(tzstr) + + result = ts_local.at_time(time(10, 0)) + expected = ts.at_time(time(10, 0)).tz_localize(tzstr) + tm.assert_series_equal(result, expected) + assert timezones.tz_compare(result.index.tz, tz) + + t1, t2 = time(10, 0), time(11, 0) + result = ts_local.between_time(t1, t2) + expected = ts.between_time(t1, t2).tz_localize(tzstr) + tm.assert_series_equal(result, expected) + assert timezones.tz_compare(result.index.tz, tz) + + @pytest.mark.parametrize('tzstr', ['Europe/Berlin', + 'dateutil/Europe/Berlin']) + def test_getitem_pydatetime_tz(self, tzstr): + tz = timezones.maybe_get_tz(tzstr) + + index = 
date_range(start='2012-12-24 16:00', end='2012-12-24 18:00', + freq='H', tz=tzstr) + ts = Series(index=index, data=index.hour) + time_pandas = Timestamp('2012-12-24 17:00', tz=tzstr) + + dt = datetime(2012, 12, 24, 17, 0) + time_datetime = tslib._localize_pydatetime(dt, tz) + assert ts[time_pandas] == ts[time_datetime] diff --git a/pandas/tests/tseries/test_timezones.py b/pandas/tests/tseries/test_timezones.py index 2630984a70807..8f46e0a58580e 100644 --- a/pandas/tests/tseries/test_timezones.py +++ b/pandas/tests/tseries/test_timezones.py @@ -8,7 +8,7 @@ from dateutil.parser import parse from pytz import NonExistentTimeError from distutils.version import LooseVersion -from dateutil.tz import tzlocal, tzoffset +from dateutil.tz import tzlocal from datetime import datetime, timedelta, tzinfo import pandas.util.testing as tm @@ -18,9 +18,9 @@ from pandas.core.indexes.datetimes import bdate_range, date_range from pandas._libs import tslib from pandas._libs.tslibs import timezones, conversion -from pandas import (Index, Series, isna, Timestamp, NaT, +from pandas import (Index, isna, Timestamp, NaT, DatetimeIndex, to_datetime) -from pandas.util.testing import assert_series_equal, set_timezone +from pandas.util.testing import set_timezone class FixedOffset(tzinfo): @@ -142,17 +142,6 @@ def test_tz_localize_dti(self): pytest.raises(pytz.NonExistentTimeError, dti.tz_localize, self.tzstr('US/Eastern')) - def test_tz_localize_empty_series(self): - # #2248 - - ts = Series() - - ts2 = ts.tz_localize('utc') - assert ts2.index.tz == pytz.utc - - ts2 = ts.tz_localize(self.tzstr('US/Eastern')) - assert self.cmptz(ts2.index.tz, self.tz('US/Eastern')) - def test_create_with_tz(self): stamp = Timestamp('3/11/2012 05:00', tz=self.tzstr('US/Eastern')) assert stamp.hour == 5 @@ -455,34 +444,6 @@ def test_ambiguous_nat(self): # right is datetime64[ns, tzfile('/usr/share/zoneinfo/US/Eastern')] tm.assert_numpy_array_equal(di_test.values, localized.values) - def test_ambiguous_bool(self): - # make sure that we are correctly accepting bool values as ambiguous - - # gh-14402 - t = Timestamp('2015-11-01 01:00:03') - expected0 = Timestamp('2015-11-01 01:00:03-0500', tz='US/Central') - expected1 = Timestamp('2015-11-01 01:00:03-0600', tz='US/Central') - - s = Series([t]) - expected0 = Series([expected0]) - expected1 = Series([expected1]) - - def f(): - s.dt.tz_localize('US/Central') - pytest.raises(pytz.AmbiguousTimeError, f) - - result = s.dt.tz_localize('US/Central', ambiguous=True) - assert_series_equal(result, expected0) - - result = s.dt.tz_localize('US/Central', ambiguous=[True]) - assert_series_equal(result, expected0) - - result = s.dt.tz_localize('US/Central', ambiguous=False) - assert_series_equal(result, expected1) - - result = s.dt.tz_localize('US/Central', ambiguous=[False]) - assert_series_equal(result, expected1) - def test_nonexistent_raise_coerce(self): # See issue 13057 from pytz.exceptions import NonExistentTimeError @@ -565,34 +526,6 @@ def test_index_astype_asobject_tzinfos(self): assert x == exval assert x.tzinfo == exval.tzinfo - def test_localized_at_time_between_time(self): - from datetime import time - - rng = date_range('4/16/2012', '5/1/2012', freq='H') - ts = Series(np.random.randn(len(rng)), index=rng) - - ts_local = ts.tz_localize(self.tzstr('US/Eastern')) - - result = ts_local.at_time(time(10, 0)) - expected = ts.at_time(time(10, 0)).tz_localize(self.tzstr( - 'US/Eastern')) - assert_series_equal(result, expected) - assert self.cmptz(result.index.tz, self.tz('US/Eastern')) - - t1, t2 = 
time(10, 0), time(11, 0) - result = ts_local.between_time(t1, t2) - expected = ts.between_time(t1, - t2).tz_localize(self.tzstr('US/Eastern')) - assert_series_equal(result, expected) - assert self.cmptz(result.index.tz, self.tz('US/Eastern')) - - def test_string_index_alias_tz_aware(self): - rng = date_range('1/1/2000', periods=10, tz=self.tzstr('US/Eastern')) - ts = Series(np.random.randn(len(rng)), index=rng) - - result = ts['1/3/2000'] - tm.assert_almost_equal(result, ts[2]) - def test_fixed_offset(self): dates = [datetime(2000, 1, 1, tzinfo=fixed_off), datetime(2000, 1, 2, tzinfo=fixed_off), @@ -668,15 +601,6 @@ def test_shift_localized(self): result = dr_tz.shift(1, '10T') assert result.tz == dr_tz.tz - def test_tz_aware_asfreq(self): - dr = date_range('2011-12-01', '2012-07-20', freq='D', - tz=self.tzstr('US/Eastern')) - - s = Series(np.random.randn(len(dr)), index=dr) - - # it works! - s.asfreq('T') - def test_static_tzinfo(self): # it works! index = DatetimeIndex([datetime(2012, 1, 1)], tz=self.tzstr('EST')) @@ -709,28 +633,6 @@ def test_convert_datetime_list(self): assert dr.tz == dr2.tz assert dr2.name == 'foo' - def test_dateutil_tzoffset_support(self): - values = [188.5, 328.25] - tzinfo = tzoffset(None, 7200) - index = [datetime(2012, 5, 11, 11, tzinfo=tzinfo), - datetime(2012, 5, 11, 12, tzinfo=tzinfo)] - series = Series(data=values, index=index) - - assert series.index.tz == tzinfo - - # it works! #2443 - repr(series.index[0]) - - def test_getitem_pydatetime_tz(self): - index = date_range(start='2012-12-24 16:00', end='2012-12-24 18:00', - freq='H', tz=self.tzstr('Europe/Berlin')) - ts = Series(index=index, data=index.hour) - time_pandas = Timestamp('2012-12-24 17:00', - tz=self.tzstr('Europe/Berlin')) - time_datetime = self.localize( - self.tz('Europe/Berlin'), datetime(2012, 12, 24, 17, 0)) - assert ts[time_pandas] == ts[time_datetime] - def test_index_drop_dont_lose_tz(self): # #2621 ind = date_range("2012-12-01", periods=10, tz="utc") @@ -1056,33 +958,6 @@ def test_tz_localize_roundtrip(self): tm.assert_index_equal(reset, idx) assert reset.tzinfo is None - def test_series_tz_localize(self): - - rng = date_range('1/1/2011', periods=100, freq='H') - ts = Series(1, index=rng) - - result = ts.tz_localize('utc') - assert result.index.tz.zone == 'UTC' - - # Can't localize if already tz-aware - rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') - ts = Series(1, index=rng) - tm.assert_raises_regex(TypeError, 'Already tz-aware', - ts.tz_localize, 'US/Eastern') - - def test_series_tz_convert(self): - rng = date_range('1/1/2011', periods=200, freq='D', tz='US/Eastern') - ts = Series(1, index=rng) - - result = ts.tz_convert('Europe/Berlin') - assert result.index.tz.zone == 'Europe/Berlin' - - # can't convert tz-naive - rng = date_range('1/1/2011', periods=200, freq='D') - ts = Series(1, index=rng) - tm.assert_raises_regex(TypeError, "Cannot convert tz-naive", - ts.tz_convert, 'US/Eastern') - def test_tz_convert_roundtrip(self): for tz in self.timezones: idx1 = date_range(start='2014-01-01', end='2014-12-31', freq='M', @@ -1127,12 +1002,6 @@ def test_join_utc_convert(self): def test_join_aware(self): rng = date_range('1/1/2011', periods=10, freq='H') - ts = Series(np.random.randn(len(rng)), index=rng) - - ts_utc = ts.tz_localize('utc') - - pytest.raises(Exception, ts.__add__, ts_utc) - pytest.raises(Exception, ts_utc.__add__, ts) # non-overlapping rng = date_range("2012-11-15 00:00:00", periods=6, freq="H", @@ -1144,127 +1013,6 @@ def test_join_aware(self): result = 
rng.union(rng2) assert result.tz.zone == 'UTC' - def test_series_align_aware(self): - idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern') - ser = Series(np.random.randn(len(idx1)), index=idx1) - ser_central = ser.tz_convert('US/Central') - # # different timezones convert to UTC - - new1, new2 = ser.align(ser_central) - assert new1.index.tz == pytz.UTC - assert new2.index.tz == pytz.UTC - - def test_append_aware(self): - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', - tz='US/Eastern') - rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', - tz='US/Eastern') - ts1 = Series([1], index=rng1) - ts2 = Series([2], index=rng2) - ts_result = ts1.append(ts2) - - exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], - tz='US/Eastern') - exp = Series([1, 2], index=exp_index) - assert_series_equal(ts_result, exp) - assert ts_result.index.tz == rng1.tz - - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='UTC') - rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='UTC') - ts1 = Series([1], index=rng1) - ts2 = Series([2], index=rng2) - ts_result = ts1.append(ts2) - - exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], - tz='UTC') - exp = Series([1, 2], index=exp_index) - assert_series_equal(ts_result, exp) - utc = rng1.tz - assert utc == ts_result.index.tz - - # GH 7795 - # different tz coerces to object dtype, not UTC - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', - tz='US/Eastern') - rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', - tz='US/Central') - ts1 = Series([1], index=rng1) - ts2 = Series([2], index=rng2) - ts_result = ts1.append(ts2) - exp_index = Index([Timestamp('1/1/2011 01:00', tz='US/Eastern'), - Timestamp('1/1/2011 02:00', tz='US/Central')]) - exp = Series([1, 2], index=exp_index) - assert_series_equal(ts_result, exp) - - def test_append_dst(self): - rng1 = date_range('1/1/2016 01:00', periods=3, freq='H', - tz='US/Eastern') - rng2 = date_range('8/1/2016 01:00', periods=3, freq='H', - tz='US/Eastern') - ts1 = Series([1, 2, 3], index=rng1) - ts2 = Series([10, 11, 12], index=rng2) - ts_result = ts1.append(ts2) - - exp_index = DatetimeIndex(['2016-01-01 01:00', '2016-01-01 02:00', - '2016-01-01 03:00', '2016-08-01 01:00', - '2016-08-01 02:00', '2016-08-01 03:00'], - tz='US/Eastern') - exp = Series([1, 2, 3, 10, 11, 12], index=exp_index) - assert_series_equal(ts_result, exp) - assert ts_result.index.tz == rng1.tz - - def test_append_aware_naive(self): - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') - rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', - tz='US/Eastern') - ts1 = Series(np.random.randn(len(rng1)), index=rng1) - ts2 = Series(np.random.randn(len(rng2)), index=rng2) - ts_result = ts1.append(ts2) - - assert ts_result.index.equals(ts1.index.astype(object).append( - ts2.index.astype(object))) - - # mixed - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') - rng2 = lrange(100) - ts1 = Series(np.random.randn(len(rng1)), index=rng1) - ts2 = Series(np.random.randn(len(rng2)), index=rng2) - ts_result = ts1.append(ts2) - assert ts_result.index.equals(ts1.index.astype(object).append( - ts2.index)) - - def test_series_add_tz_mismatch_converts_to_utc(self): - rng = date_range('1/1/2011', periods=10, freq='H', tz='US/Eastern') - ts = Series(np.random.randn(len(rng)), index=rng) - - ts_moscow = ts.tz_convert('Europe/Moscow') - - result = ts + ts_moscow - assert result.index.tz is pytz.utc - - result = ts_moscow + ts - assert result.index.tz is pytz.utc - - def 
test_arith_utc_convert(self): - rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') - - perm = np.random.permutation(100)[:90] - ts1 = Series(np.random.randn(90), - index=rng.take(perm).tz_convert('US/Eastern')) - - perm = np.random.permutation(100)[:90] - ts2 = Series(np.random.randn(90), - index=rng.take(perm).tz_convert('Europe/Berlin')) - - result = ts1 + ts2 - - uts1 = ts1.tz_convert('utc') - uts2 = ts2.tz_convert('utc') - expected = uts1 + uts2 - - assert result.index.tz == pytz.UTC - assert_series_equal(result, expected) - def test_intersection(self): rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') From d5eead6bdd745a98d8828e93a9e2718008af4d0a Mon Sep 17 00:00:00 2001 From: Sangwoong Yoon Date: Tue, 6 Feb 2018 23:16:13 +0900 Subject: [PATCH 19/20] DOC/ERR: better error message on no common merge keys (#19427) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/frame.py | 2 +- pandas/core/reshape/merge.py | 7 ++++++- pandas/tests/reshape/merge/test_merge.py | 8 ++++++++ 4 files changed, 16 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index ca625f492b61f..54dba831f7216 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -667,6 +667,7 @@ Reshaping - Bug in :func:`DataFrame.stack`, :func:`DataFrame.unstack`, :func:`Series.unstack` which were not returning subclasses (:issue:`15563`) - Bug in timezone comparisons, manifesting as a conversion of the index to UTC in ``.concat()`` (:issue:`18523`) - Bug in :func:`concat` when concatting sparse and dense series it returns only a ``SparseDataFrame``. Should be a ``DataFrame``. (:issue:`18914`, :issue:`18686`, and :issue:`16874`) +- Improved error message for :func:`DataFrame.merge` when there is no common merge key (:issue:`19427`) - diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 201d8ba427c8a..3d1983f65d70d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -233,7 +233,7 @@ -------- merge_ordered merge_asof - +DataFrame.join """ # ----------------------------------------------------------------------- diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 3ec78ce52c6e5..9dbb327e3d956 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1021,7 +1021,12 @@ def _validate_specification(self): common_cols = self.left.columns.intersection( self.right.columns) if len(common_cols) == 0: - raise MergeError('No common columns to perform merge on') + raise MergeError( + 'No common columns to perform merge on. ' + 'Merge options: left_on={lon}, right_on={ron}, ' + 'left_index={lidx}, right_index={ridx}' + .format(lon=self.left_on, ron=self.right_on, + lidx=self.left_index, ridx=self.right_index)) if not common_cols.is_unique: raise MergeError("Data columns not unique: {common!r}" .format(common=common_cols)) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f63c206c0c407..32f83ab972be5 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -270,6 +270,14 @@ def test_no_overlap_more_informative_error(self): df2 = DataFrame({'y': ['b', 'c']}, index=[dt, dt]) pytest.raises(MergeError, merge, df1, df2) + msg = ('No common columns to perform merge on. 
' + 'Merge options: left_on={lon}, right_on={ron}, ' + 'left_index={lidx}, right_index={ridx}' + .format(lon=None, ron=None, lidx=False, ridx=False)) + + with tm.assert_raises_regex(MergeError, msg): + merge(df1, df2) + def test_merge_non_unique_indexes(self): dt = datetime(2012, 5, 1) From 93c86aa13e1b7816c762b4ff372aef80a7830af8 Mon Sep 17 00:00:00 2001 From: miker985 Date: Tue, 6 Feb 2018 06:17:14 -0800 Subject: [PATCH 20/20] BUGFIX - AttributeError raised in StataReader.value_labels() (#19510) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/io/stata.py | 8 +++++--- pandas/tests/io/test_stata.py | 10 ++++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 54dba831f7216..b5bf7ccbda0b6 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -627,6 +627,7 @@ I/O - Bug in :func:`DataFrame.to_parquet` where an exception was raised if the write destination is S3 (:issue:`19134`) - :class:`Interval` now supported in :func:`DataFrame.to_excel` for all Excel file types (:issue:`19242`) - :class:`Timedelta` now supported in :func:`DataFrame.to_excel` for xls file type (:issue:`19242`, :issue:`9155`) +- Bug in :meth:`pandas.io.stata.StataReader.value_labels` raising an ``AttributeError`` when called on very old files. Now returns an empty dict (:issue:`19417`) Plotting ^^^^^^^^ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index adbff06364dbe..ee6975ea1d938 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1341,12 +1341,14 @@ def _null_terminate(self, s): return s def _read_value_labels(self): - if self.format_version <= 108: - # Value labels are not supported in version 108 and earlier. - return if self._value_labels_read: # Don't read twice return + if self.format_version <= 108: + # Value labels are not supported in version 108 and earlier. + self._value_labels_read = True + self.value_label_dict = dict() + return if self.format_version >= 117: self.path_or_buf.seek(self.seek_value_labels) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 89d76061329a3..4e259d0994bdb 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -589,6 +589,16 @@ def test_105(self): df0['psch_dis'] = df0["psch_dis"].astype(np.float32) tm.assert_frame_equal(df.head(3), df0) + def test_value_labels_old_format(self): + # GH 19417 + # + # Test that value_labels() returns an empty dict if the file format + # predates supporting value labels. + dpath = os.path.join(self.dirpath, 'S4_EDUC1.dta') + reader = StataReader(dpath) + assert reader.value_labels() == {} + reader.close() + def test_date_export_formats(self): columns = ['tc', 'td', 'tw', 'tm', 'tq', 'th', 'ty'] conversions = {c: c for c in columns}