From e797a2d93333ac6b48b56c3a63f0e667ad97e7e2 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Mon, 8 Aug 2016 08:44:01 +0900 Subject: [PATCH] BUG: freqstr may be parsed incorrectly --- doc/source/whatsnew/v0.19.0.txt | 1 + pandas/tseries/frequencies.py | 11 +- pandas/tseries/tests/test_frequencies.py | 386 ++++++++++++----------- 3 files changed, 215 insertions(+), 183 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index a041e175d5f1a..cbf6ff7659112 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -963,6 +963,7 @@ Bug Fixes - Bug in ``Index.union`` returns an incorrect result with a named empty index (:issue:`13432`) - Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`) - Bug in ``.to_excel()`` when DataFrame contains a MultiIndex which contains a label with a NaN value (:issue:`13511`) +- Bug in invalid frequency offset string like "D1", "-2-3H" may not raise ``ValueError`` (:issue:`13930`) - Bug in ``agg()`` function on groupby dataframe changes dtype of ``datetime64[ns]`` column to ``float64`` (:issue:`12821`) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 3011e8dc0ae3d..5da26d191980e 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -464,7 +464,14 @@ def to_offset(freq): delta = None stride_sign = None try: - for stride, name, _ in opattern.findall(freq): + splitted = re.split(opattern, freq) + if splitted[-1] != '' and not splitted[-1].isspace(): + # the last element must be blank + raise ValueError('includes garbage characters') + for sep, stride, name in zip(splitted[0::4], splitted[1::4], + splitted[2::4]): + if sep != '' and not sep.isspace(): + raise ValueError('separator must be spaces') offset = get_offset(name) if stride_sign is None: stride_sign = -1 if stride.startswith('-') else 1 @@ -486,7 +493,7 @@ def to_offset(freq): # hack 
to handle WOM-1MON -opattern = re.compile(r'([\-]?\d*)\s*([A-Za-z]+([\-@][\dA-Za-z\-]+)?)') +opattern = re.compile(r'([\-]?\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)') def _base_and_stride(freqstr): diff --git a/pandas/tseries/tests/test_frequencies.py b/pandas/tseries/tests/test_frequencies.py index 268933fada7a2..5ba98f15aed8d 100644 --- a/pandas/tseries/tests/test_frequencies.py +++ b/pandas/tseries/tests/test_frequencies.py @@ -18,187 +18,210 @@ from pandas import Timedelta -def test_to_offset_multiple(): - freqstr = '2h30min' - freqstr2 = '2h 30min' - - result = frequencies.to_offset(freqstr) - assert (result == frequencies.to_offset(freqstr2)) - expected = offsets.Minute(150) - assert (result == expected) - - freqstr = '2h30min15s' - result = frequencies.to_offset(freqstr) - expected = offsets.Second(150 * 60 + 15) - assert (result == expected) - - freqstr = '2h 60min' - result = frequencies.to_offset(freqstr) - expected = offsets.Hour(3) - assert (result == expected) - - freqstr = '15l500u' - result = frequencies.to_offset(freqstr) - expected = offsets.Micro(15500) - assert (result == expected) - - freqstr = '10s75L' - result = frequencies.to_offset(freqstr) - expected = offsets.Milli(10075) - assert (result == expected) - - freqstr = '2800N' - result = frequencies.to_offset(freqstr) - expected = offsets.Nano(2800) - assert (result == expected) - - freqstr = '2SM' - result = frequencies.to_offset(freqstr) - expected = offsets.SemiMonthEnd(2) - assert (result == expected) - - freqstr = '2SM-16' - result = frequencies.to_offset(freqstr) - expected = offsets.SemiMonthEnd(2, day_of_month=16) - assert (result == expected) - - freqstr = '2SMS-14' - result = frequencies.to_offset(freqstr) - expected = offsets.SemiMonthBegin(2, day_of_month=14) - assert (result == expected) - - freqstr = '2SMS-15' - result = frequencies.to_offset(freqstr) - expected = offsets.SemiMonthBegin(2) - assert (result == expected) - - # malformed - try: - frequencies.to_offset('2h20m') - except 
ValueError: - pass - else: - assert (False) - - -def test_to_offset_negative(): - freqstr = '-1S' - result = frequencies.to_offset(freqstr) - assert (result.n == -1) - - freqstr = '-5min10s' - result = frequencies.to_offset(freqstr) - assert (result.n == -310) - - freqstr = '-2SM' - result = frequencies.to_offset(freqstr) - assert (result.n == -2) - - freqstr = '-1SMS' - result = frequencies.to_offset(freqstr) - assert (result.n == -1) - - -def test_to_offset_leading_zero(): - freqstr = '00H 00T 01S' - result = frequencies.to_offset(freqstr) - assert (result.n == 1) - - freqstr = '-00H 03T 14S' - result = frequencies.to_offset(freqstr) - assert (result.n == -194) - - -def test_to_offset_pd_timedelta(): - # Tests for #9064 - td = Timedelta(days=1, seconds=1) - result = frequencies.to_offset(td) - expected = offsets.Second(86401) - assert (expected == result) - - td = Timedelta(days=-1, seconds=1) - result = frequencies.to_offset(td) - expected = offsets.Second(-86399) - assert (expected == result) - - td = Timedelta(hours=1, minutes=10) - result = frequencies.to_offset(td) - expected = offsets.Minute(70) - assert (expected == result) - - td = Timedelta(hours=1, minutes=-10) - result = frequencies.to_offset(td) - expected = offsets.Minute(50) - assert (expected == result) - - td = Timedelta(weeks=1) - result = frequencies.to_offset(td) - expected = offsets.Day(7) - assert (expected == result) - - td1 = Timedelta(hours=1) - result1 = frequencies.to_offset(td1) - result2 = frequencies.to_offset('60min') - assert (result1 == result2) - - td = Timedelta(microseconds=1) - result = frequencies.to_offset(td) - expected = offsets.Micro(1) - assert (expected == result) - - td = Timedelta(microseconds=0) - tm.assertRaises(ValueError, lambda: frequencies.to_offset(td)) - - -def test_anchored_shortcuts(): - result = frequencies.to_offset('W') - expected = frequencies.to_offset('W-SUN') - assert (result == expected) - - result1 = frequencies.to_offset('Q') - result2 = 
frequencies.to_offset('Q-DEC') - expected = offsets.QuarterEnd(startingMonth=12) - assert (result1 == expected) - assert (result2 == expected) - - result1 = frequencies.to_offset('Q-MAY') - expected = offsets.QuarterEnd(startingMonth=5) - assert (result1 == expected) - - result1 = frequencies.to_offset('SM') - result2 = frequencies.to_offset('SM-15') - expected = offsets.SemiMonthEnd(day_of_month=15) - assert (result1 == expected) - assert (result2 == expected) - - result = frequencies.to_offset('SM-1') - expected = offsets.SemiMonthEnd(day_of_month=1) - assert (result == expected) - - result = frequencies.to_offset('SM-27') - expected = offsets.SemiMonthEnd(day_of_month=27) - assert (result == expected) - - result = frequencies.to_offset('SMS-2') - expected = offsets.SemiMonthBegin(day_of_month=2) - assert (result == expected) - - result = frequencies.to_offset('SMS-27') - expected = offsets.SemiMonthBegin(day_of_month=27) - assert (result == expected) - - # ensure invalid cases fail as expected - invalid_anchors = ['SM-0', 'SM-28', 'SM-29', - 'SM-FOO', 'BSM', 'SM--1' - 'SMS-1', 'SMS-28', 'SMS-30', - 'SMS-BAR', 'BSMS', 'SMS--2'] - for invalid_anchor in invalid_anchors: - try: - frequencies.to_offset(invalid_anchor) - except ValueError: - pass - else: - raise AssertionError(invalid_anchor) +class TestToOffset(tm.TestCase): + + def test_to_offset_multiple(self): + freqstr = '2h30min' + freqstr2 = '2h 30min' + + result = frequencies.to_offset(freqstr) + assert (result == frequencies.to_offset(freqstr2)) + expected = offsets.Minute(150) + assert (result == expected) + + freqstr = '2h30min15s' + result = frequencies.to_offset(freqstr) + expected = offsets.Second(150 * 60 + 15) + assert (result == expected) + + freqstr = '2h 60min' + result = frequencies.to_offset(freqstr) + expected = offsets.Hour(3) + assert (result == expected) + + freqstr = '15l500u' + result = frequencies.to_offset(freqstr) + expected = offsets.Micro(15500) + assert (result == expected) + + freqstr 
= '10s75L' + result = frequencies.to_offset(freqstr) + expected = offsets.Milli(10075) + assert (result == expected) + + freqstr = '2800N' + result = frequencies.to_offset(freqstr) + expected = offsets.Nano(2800) + assert (result == expected) + + freqstr = '2SM' + result = frequencies.to_offset(freqstr) + expected = offsets.SemiMonthEnd(2) + assert (result == expected) + + freqstr = '2SM-16' + result = frequencies.to_offset(freqstr) + expected = offsets.SemiMonthEnd(2, day_of_month=16) + assert (result == expected) + + freqstr = '2SMS-14' + result = frequencies.to_offset(freqstr) + expected = offsets.SemiMonthBegin(2, day_of_month=14) + assert (result == expected) + + freqstr = '2SMS-15' + result = frequencies.to_offset(freqstr) + expected = offsets.SemiMonthBegin(2) + assert (result == expected) + + # malformed + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: 2h20m'): + frequencies.to_offset('2h20m') + + def test_to_offset_negative(self): + freqstr = '-1S' + result = frequencies.to_offset(freqstr) + assert (result.n == -1) + + freqstr = '-5min10s' + result = frequencies.to_offset(freqstr) + assert (result.n == -310) + + freqstr = '-2SM' + result = frequencies.to_offset(freqstr) + assert (result.n == -2) + + freqstr = '-1SMS' + result = frequencies.to_offset(freqstr) + assert (result.n == -1) + + def test_to_offset_invalid(self): + # GH 13930 + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: U1'): + frequencies.to_offset('U1') + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: -U'): + frequencies.to_offset('-U') + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: 3U1'): + frequencies.to_offset('3U1') + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: -2-3U'): + frequencies.to_offset('-2-3U') + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: -2D:3H'): + frequencies.to_offset('-2D:3H') + + # ToDo: Must be fixed in #8419 + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: .5S'): + 
frequencies.to_offset('.5S') + + # split offsets with spaces are valid + assert frequencies.to_offset('2D 3H') == offsets.Hour(51) + assert frequencies.to_offset('2 D3 H') == offsets.Hour(51) + assert frequencies.to_offset('2 D 3 H') == offsets.Hour(51) + assert frequencies.to_offset(' 2 D 3 H ') == offsets.Hour(51) + assert frequencies.to_offset(' H ') == offsets.Hour() + assert frequencies.to_offset(' 3 H ') == offsets.Hour(3) + + # special cases + assert frequencies.to_offset('2SMS-15') == offsets.SemiMonthBegin(2) + with tm.assertRaisesRegexp(ValueError, + 'Invalid frequency: 2SMS-15-15'): + frequencies.to_offset('2SMS-15-15') + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: 2SMS-15D'): + frequencies.to_offset('2SMS-15D') + + def test_to_offset_leading_zero(self): + freqstr = '00H 00T 01S' + result = frequencies.to_offset(freqstr) + assert (result.n == 1) + + freqstr = '-00H 03T 14S' + result = frequencies.to_offset(freqstr) + assert (result.n == -194) + + def test_to_offset_pd_timedelta(self): + # Tests for #9064 + td = Timedelta(days=1, seconds=1) + result = frequencies.to_offset(td) + expected = offsets.Second(86401) + assert (expected == result) + + td = Timedelta(days=-1, seconds=1) + result = frequencies.to_offset(td) + expected = offsets.Second(-86399) + assert (expected == result) + + td = Timedelta(hours=1, minutes=10) + result = frequencies.to_offset(td) + expected = offsets.Minute(70) + assert (expected == result) + + td = Timedelta(hours=1, minutes=-10) + result = frequencies.to_offset(td) + expected = offsets.Minute(50) + assert (expected == result) + + td = Timedelta(weeks=1) + result = frequencies.to_offset(td) + expected = offsets.Day(7) + assert (expected == result) + + td1 = Timedelta(hours=1) + result1 = frequencies.to_offset(td1) + result2 = frequencies.to_offset('60min') + assert (result1 == result2) + + td = Timedelta(microseconds=1) + result = frequencies.to_offset(td) + expected = offsets.Micro(1) + assert (expected == 
result) + + td = Timedelta(microseconds=0) + tm.assertRaises(ValueError, lambda: frequencies.to_offset(td)) + + def test_anchored_shortcuts(self): + result = frequencies.to_offset('W') + expected = frequencies.to_offset('W-SUN') + assert (result == expected) + + result1 = frequencies.to_offset('Q') + result2 = frequencies.to_offset('Q-DEC') + expected = offsets.QuarterEnd(startingMonth=12) + assert (result1 == expected) + assert (result2 == expected) + + result1 = frequencies.to_offset('Q-MAY') + expected = offsets.QuarterEnd(startingMonth=5) + assert (result1 == expected) + + result1 = frequencies.to_offset('SM') + result2 = frequencies.to_offset('SM-15') + expected = offsets.SemiMonthEnd(day_of_month=15) + assert (result1 == expected) + assert (result2 == expected) + + result = frequencies.to_offset('SM-1') + expected = offsets.SemiMonthEnd(day_of_month=1) + assert (result == expected) + + result = frequencies.to_offset('SM-27') + expected = offsets.SemiMonthEnd(day_of_month=27) + assert (result == expected) + + result = frequencies.to_offset('SMS-2') + expected = offsets.SemiMonthBegin(day_of_month=2) + assert (result == expected) + + result = frequencies.to_offset('SMS-27') + expected = offsets.SemiMonthBegin(day_of_month=27) + assert (result == expected) + + # ensure invalid cases fail as expected + invalid_anchors = ['SM-0', 'SM-28', 'SM-29', + 'SM-FOO', 'BSM', 'SM--1' + 'SMS-1', 'SMS-28', 'SMS-30', + 'SMS-BAR', 'BSMS', 'SMS--2'] + for invalid_anchor in invalid_anchors: + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: '): + frequencies.to_offset(invalid_anchor) def test_get_rule_month(): @@ -275,6 +298,7 @@ def _assert_depr(freq, expected, aliases): class TestFrequencyCode(tm.TestCase): + def test_freq_code(self): self.assertEqual(frequencies.get_freq('A'), 1000) self.assertEqual(frequencies.get_freq('3A'), 1000)