From 272cabb03c6720a922e63d97188640f29d92a24a Mon Sep 17 00:00:00 2001 From: Ming Li <14131823+minggli@users.noreply.github.com> Date: Mon, 29 Jan 2018 12:42:59 +0000 Subject: [PATCH 001/214] Regression in make_block_same_class (tests failing for new fastparquet release) (#19434) --- doc/source/io.rst | 2 +- pandas/core/internals.py | 9 +++++++-- pandas/tests/internals/test_internals.py | 7 +++++++ pandas/tests/io/test_parquet.py | 22 +++++++++++++++++----- 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index ae04996b4fddf..4199f161501ec 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4537,7 +4537,7 @@ See the documentation for `pyarrow `__ and .. note:: These engines are very similar and should read/write nearly identical parquet format files. - Currently ``pyarrow`` does not support timedelta data, and ``fastparquet`` does not support timezone aware datetimes (they are coerced to UTC). + Currently ``pyarrow`` does not support timedelta data, ``fastparquet>=0.1.4`` supports timezone aware datetimes. These libraries differ by having different underlying dependencies (``fastparquet`` by using ``numba``, while ``pyarrow`` uses a c-library). .. ipython:: python diff --git a/pandas/core/internals.py b/pandas/core/internals.py index c2d3d0852384c..ec884035fe0c4 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -224,12 +224,17 @@ def make_block_scalar(self, values): """ return ScalarBlock(values) - def make_block_same_class(self, values, placement=None, ndim=None): + def make_block_same_class(self, values, placement=None, ndim=None, + dtype=None): """ Wrap given values in a block of same type as self. """ + if dtype is not None: + # issue 19431 fastparquet is passing this + warnings.warn("dtype argument is deprecated, will be removed " + "in a future release.", FutureWarning) if placement is None: placement = self.mgr_locs return make_block(values, placement=placement, ndim=ndim, - klass=self.__class__) + klass=self.__class__, dtype=dtype) def __unicode__(self): diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 57884e9816ed3..f17306b8b52f9 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -285,6 +285,13 @@ def test_delete(self): with pytest.raises(Exception): newb.delete(3) + def test_make_block_same_class(self): + # issue 19431 + block = create_block('M8[ns, US/Eastern]', [3]) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + block.make_block_same_class(block.values, dtype=block.values.dtype) + class TestDatetimeBlock(object): diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 6c172c80514e7..11cbea8ce6331 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -71,6 +71,15 @@ def fp(): return 'fastparquet' +@pytest.fixture +def fp_lt_014(): + if not _HAVE_FASTPARQUET: + pytest.skip("fastparquet is not installed") + if LooseVersion(fastparquet.__version__) >= LooseVersion('0.1.4'): + pytest.skip("fastparquet is >= 0.1.4") + return 'fastparquet' + + @pytest.fixture def df_compat(): return pd.DataFrame({'A': [1, 2, 3], 'B': 'foo'}) @@ -435,8 +444,10 @@ def test_basic(self, fp, df_full): df = df_full # additional supported types for fastparquet + if LooseVersion(fastparquet.__version__) >= LooseVersion('0.1.4'): + df['datetime_tz'] = pd.date_range('20130101', periods=3, + tz='US/Eastern') df['timedelta'] = 
pd.timedelta_range('1 day', periods=3) - check_round_trip(df, fp) @pytest.mark.skip(reason="not supported") @@ -468,14 +479,15 @@ def test_categorical(self, fp): df = pd.DataFrame({'a': pd.Categorical(list('abc'))}) check_round_trip(df, fp) - def test_datetime_tz(self, fp): - # doesn't preserve tz + def test_datetime_tz(self, fp_lt_014): + + # fastparquet<0.1.4 doesn't preserve tz df = pd.DataFrame({'a': pd.date_range('20130101', periods=3, tz='US/Eastern')}) - # warns on the coercion with catch_warnings(record=True): - check_round_trip(df, fp, expected=df.astype('datetime64[ns]')) + check_round_trip(df, fp_lt_014, + expected=df.astype('datetime64[ns]')) def test_filter_row_groups(self, fp): d = {'a': list(range(0, 3))} From ad468e9af645a98e8e28a884dc90f98b7bf57380 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Mon, 29 Jan 2018 15:07:02 +0100 Subject: [PATCH 002/214] TST: fix test for MultiIndexPyIntEngine on 32 bit (#19440) closes #19439 --- pandas/tests/test_multilevel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 9582264a8c716..65332ae7153e2 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1611,7 +1611,7 @@ def test_pyint_engine(self): index = MultiIndex.from_tuples(keys) assert index.get_loc(keys[idx]) == idx - expected = np.arange(idx + 1, dtype='int64') + expected = np.arange(idx + 1, dtype=np.intp) result = index.get_indexer([keys[i] for i in expected]) tm.assert_numpy_array_equal(result, expected) From f4833211199aa4e340eae396572aef3c59eb2bea Mon Sep 17 00:00:00 2001 From: luzpaz Date: Mon, 29 Jan 2018 09:14:33 -0500 Subject: [PATCH 003/214] Misc typos (#19430) Found via `codespell -q 3` --- asv_bench/benchmarks/replace.py | 8 ++++---- asv_bench/benchmarks/rolling.py | 16 ++++++++-------- doc/source/api.rst | 2 +- doc/source/io.rst | 2 +- doc/sphinxext/numpydoc/tests/test_docscrape.py | 6 +++--- pandas/_libs/tslibs/timedeltas.pyx | 2 +- pandas/_libs/tslibs/timezones.pyx | 2 +- pandas/core/frame.py | 2 +- pandas/core/indexes/datetimelike.py | 2 +- pandas/core/sparse/frame.py | 2 +- pandas/core/sparse/series.py | 2 +- pandas/core/strings.py | 2 +- pandas/core/util/hashing.py | 2 +- pandas/io/formats/format.py | 2 +- pandas/io/pytables.py | 2 +- pandas/tests/categorical/test_constructors.py | 2 +- pandas/tests/frame/test_constructors.py | 2 +- pandas/tests/io/formats/test_format.py | 2 +- pandas/tests/series/test_analytics.py | 2 +- pandas/tests/series/test_operators.py | 2 +- pandas/tests/sparse/frame/test_frame.py | 2 +- pandas/util/testing.py | 2 +- 22 files changed, 34 insertions(+), 34 deletions(-) diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py index 6330a2b36c516..41208125e8f32 100644 --- a/asv_bench/benchmarks/replace.py +++ b/asv_bench/benchmarks/replace.py @@ -44,15 +44,15 @@ class Convert(object): goal_time = 0.5 params = (['DataFrame', 'Series'], ['Timestamp', 'Timedelta']) - param_names = ['contructor', 'replace_data'] + param_names = ['constructor', 'replace_data'] - def setup(self, contructor, replace_data): + def setup(self, constructor, replace_data): N = 10**3 data = {'Series': pd.Series(np.random.randint(N, size=N)), 'DataFrame': pd.DataFrame({'A': np.random.randint(N, size=N), 'B': np.random.randint(N, size=N)})} self.to_replace = {i: getattr(pd, replace_data) for i in range(N)} - self.data = data[contructor] + self.data = data[constructor] - def time_replace(self, contructor, replace_data): + def 
time_replace(self, constructor, replace_data): self.data.replace(self.to_replace) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 45142c53dcd01..59cf7d090a622 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -12,14 +12,14 @@ class Methods(object): ['int', 'float'], ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt', 'sum', 'corr', 'cov']) - param_names = ['contructor', 'window', 'dtype', 'method'] + param_names = ['constructor', 'window', 'dtype', 'method'] - def setup(self, contructor, window, dtype, method): + def setup(self, constructor, window, dtype, method): N = 10**5 arr = np.random.random(N).astype(dtype) - self.roll = getattr(pd, contructor)(arr).rolling(window) + self.roll = getattr(pd, constructor)(arr).rolling(window) - def time_rolling(self, contructor, window, dtype, method): + def time_rolling(self, constructor, window, dtype, method): getattr(self.roll, method)() @@ -30,12 +30,12 @@ class Quantile(object): [10, 1000], ['int', 'float'], [0, 0.5, 1]) - param_names = ['contructor', 'window', 'dtype', 'percentile'] + param_names = ['constructor', 'window', 'dtype', 'percentile'] - def setup(self, contructor, window, dtype, percentile): + def setup(self, constructor, window, dtype, percentile): N = 10**5 arr = np.random.random(N).astype(dtype) - self.roll = getattr(pd, contructor)(arr).rolling(window) + self.roll = getattr(pd, constructor)(arr).rolling(window) - def time_quantile(self, contructor, window, dtype, percentile): + def time_quantile(self, constructor, window, dtype, percentile): self.roll.quantile(percentile) diff --git a/doc/source/api.rst b/doc/source/api.rst index ddd09327935ce..44f87aa3e1cec 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -2500,7 +2500,7 @@ Scalar introspection Extensions ---------- -These are primarily intented for library authors looking to extend pandas +These are primarily intended for library authors looking to extend pandas objects. .. currentmodule:: pandas diff --git a/doc/source/io.rst b/doc/source/io.rst index 4199f161501ec..60dc89f8fd495 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2675,7 +2675,7 @@ file, and the ``sheet_name`` indicating which sheet to parse. +++++++++++++++++++ To facilitate working with multiple sheets from the same file, the ``ExcelFile`` -class can be used to wrap the file and can be be passed into ``read_excel`` +class can be used to wrap the file and can be passed into ``read_excel`` There will be a performance benefit for reading multiple sheets as the file is read into memory only once. diff --git a/doc/sphinxext/numpydoc/tests/test_docscrape.py b/doc/sphinxext/numpydoc/tests/test_docscrape.py index b682504e1618f..b412124d774bb 100755 --- a/doc/sphinxext/numpydoc/tests/test_docscrape.py +++ b/doc/sphinxext/numpydoc/tests/test_docscrape.py @@ -42,7 +42,7 @@ ------- out : ndarray The drawn samples, arranged according to `shape`. If the - shape given is (m,n,...), then the shape of `out` is is + shape given is (m,n,...), then the shape of `out` is (m,n,...,N). In other words, each entry ``out[i,j,...,:]`` is an N-dimensional @@ -222,7 +222,7 @@ def test_str(): ------- out : ndarray The drawn samples, arranged according to `shape`. If the - shape given is (m,n,...), then the shape of `out` is is + shape given is (m,n,...), then the shape of `out` is (m,n,...,N). 
In other words, each entry ``out[i,j,...,:]`` is an N-dimensional @@ -340,7 +340,7 @@ def test_sphinx_str(): **out** : ndarray The drawn samples, arranged according to `shape`. If the - shape given is (m,n,...), then the shape of `out` is is + shape given is (m,n,...), then the shape of `out` is (m,n,...,N). In other words, each entry ``out[i,j,...,:]`` is an N-dimensional diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 1e6ea7794dfff..37693068e0974 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -897,7 +897,7 @@ class Timedelta(_Timedelta): Represents a duration, the difference between two dates or times. Timedelta is the pandas equivalent of python's ``datetime.timedelta`` - and is interchangable with it in most cases. + and is interchangeable with it in most cases. Parameters ---------- diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index c22e0b8e555a3..215ae9ce087ee 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -295,7 +295,7 @@ cpdef bint tz_compare(object start, object end): timezones. For example `` and `` are essentially same - timezones but aren't evaluted such, but the string representation + timezones but aren't evaluated such, but the string representation for both of these is `'Europe/Paris'`. This exists only to add a notion of equality to pytz-style zones diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7328cd336babf..788b236b0ec59 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4115,7 +4115,7 @@ def combine(self, other, func, fill_value=None, overwrite=True): series[this_mask] = fill_value otherSeries[other_mask] = fill_value - # if we have different dtypes, possibily promote + # if we have different dtypes, possibly promote new_dtype = this_dtype if not is_dtype_equal(this_dtype, other_dtype): new_dtype = find_common_type([this_dtype, other_dtype]) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index f43c6dc567f69..8e77c7a7fa48c 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -332,7 +332,7 @@ def freqstr(self): @cache_readonly def inferred_freq(self): """ - Trys to return a string representing a frequency guess, + Tryies to return a string representing a frequency guess, generated by infer_freq. Returns None if it can't autodetect the frequency. 
""" diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 99bf0d5b7ac51..91dc44e3f185e 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -120,7 +120,7 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None, if dtype is not None: mgr = mgr.astype(dtype) else: - msg = ('SparseDataFrame called with unkown type "{data_type}" ' + msg = ('SparseDataFrame called with unknown type "{data_type}" ' 'for data argument') raise TypeError(msg.format(data_type=type(data).__name__)) diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 4e207f9d1838c..1c23527cf57c4 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -493,7 +493,7 @@ def _set_value(self, label, value, takeable=False): values = self.to_dense() # if the label doesn't exist, we will create a new object here - # and possibily change the index + # and possibly change the index new_values = values._set_value(label, value, takeable=takeable) if new_values is not None: values = new_values diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 5c31b9a5668ff..12c7feb5f2b15 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1395,7 +1395,7 @@ def _validate(data): elif isinstance(data, Index): # can't use ABCIndex to exclude non-str - # see scc/inferrence.pyx which can contain string values + # see src/inference.pyx which can contain string values allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer') if data.inferred_type not in allowed_types: message = ("Can only use .str accessor with string values " diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 0c82773b75c28..7edb5b16ce77a 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -210,7 +210,7 @@ def _hash_categorical(c, encoding, hash_key): # we have uint64, as we don't directly support missing values # we don't want to use take_nd which will coerce to float - # instead, directly construt the result with a + # instead, directly construct the result with a # max(np.uint64) as the missing value indicator # # TODO: GH 15362 diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 2293032ebb8a1..bca0b64cb53fe 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1961,7 +1961,7 @@ def formatter(value): def get_result_as_array(self): """ Returns the float values converted into strings using - the parameters given at initalisation, as a numpy array + the parameters given at initialisation, as a numpy array """ if self.formatter is not None: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 106823199ee93..5376473f83f22 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3763,7 +3763,7 @@ def write(self, **kwargs): class LegacyTable(Table): """ an appendable table: allow append/query/delete operations to a - (possibily) already existing appendable table this table ALLOWS + (possibly) already existing appendable table this table ALLOWS append (but doesn't require them), and stores the data in a format that can be easily searched diff --git a/pandas/tests/categorical/test_constructors.py b/pandas/tests/categorical/test_constructors.py index b29d75bed5c6f..6cc34770a65e0 100644 --- a/pandas/tests/categorical/test_constructors.py +++ b/pandas/tests/categorical/test_constructors.py @@ -382,7 +382,7 @@ def test_constructor_from_categorical_with_unknown_dtype(self): ordered=True) tm.assert_categorical_equal(result, expected) - 
def test_contructor_from_categorical_string(self): + def test_constructor_from_categorical_string(self): values = Categorical(['a', 'b', 'd']) # use categories, ordered result = Categorical(values, categories=['a', 'b', 'c'], ordered=True, diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 8b57e96e6fa06..b24ae22162a34 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -543,7 +543,7 @@ def test_nested_dict_frame_constructor(self): tm.assert_frame_equal(result, df) def _check_basic_constructor(self, empty): - # mat: 2d matrix with shpae (3, 2) to input. empty - makes sized + # mat: 2d matrix with shape (3, 2) to input. empty - makes sized # objects mat = empty((2, 3), dtype=float) # 2-D input diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index b277d8256e612..e0ce27de5c31f 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2531,7 +2531,7 @@ def test_date_tz(self): [datetime(2013, 1, 1), pd.NaT], utc=True).format() assert formatted[0] == "2013-01-01 00:00:00+00:00" - def test_date_explict_date_format(self): + def test_date_explicit_date_format(self): formatted = pd.to_datetime([datetime(2003, 2, 1), pd.NaT]).format( date_format="%m-%d-%Y", na_rep="UT") assert formatted[0] == "02-01-2003" diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index f2b7c20b774b0..0e6e44e839464 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -43,7 +43,7 @@ def test_empty(self, method, unit, use_bottleneck): result = getattr(s, method)() assert result == unit - # Explict + # Explicit result = getattr(s, method)(min_count=0) assert result == unit diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 7505e6b0cec3b..38e5753d1752d 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -1163,7 +1163,7 @@ def test_timedelta_floordiv(self, scalar_td): ('NCC1701D', 'NCC1701D', 'NCC1701D')]) def test_td64_series_with_tdi(self, names): # GH#17250 make sure result dtype is correct - # GH#19043 make sure names are propogated correctly + # GH#19043 make sure names are propagated correctly tdi = pd.TimedeltaIndex(['0 days', '1 day'], name=names[0]) ser = Series([Timedelta(hours=3), Timedelta(hours=4)], name=names[1]) expected = Series([Timedelta(hours=3), Timedelta(days=1, hours=4)], diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 2b589ebd4735e..0b7948cc32d24 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -218,7 +218,7 @@ def test_constructor_from_unknown_type(self): class Unknown: pass with pytest.raises(TypeError, - message='SparseDataFrame called with unkown type ' + message='SparseDataFrame called with unknown type ' '"Unknown" for data argument'): SparseDataFrame(Unknown()) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 941bdcbc8b064..0009e26f8b100 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -2401,7 +2401,7 @@ class for all warnings. To check that no warning is returned, into errors. 
Valid values are: - * "error" - turns matching warnings into exeptions + * "error" - turns matching warnings into exceptions * "ignore" - discard the warning * "always" - always emit a warning * "default" - print the warning the first time it is generated From e0d9651b2721b2a009e23b5597fa7549521538c8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 29 Jan 2018 22:39:09 +0100 Subject: [PATCH 004/214] Change Future to DeprecationWarning for make_block_same_class (#19442) --- pandas/core/internals.py | 2 +- pandas/tests/internals/test_internals.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index ec884035fe0c4..f3e5e4c99a899 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -230,7 +230,7 @@ def make_block_same_class(self, values, placement=None, ndim=None, if dtype is not None: # issue 19431 fastparquet is passing this warnings.warn("dtype argument is deprecated, will be removed " - "in a future release.", FutureWarning) + "in a future release.", DeprecationWarning) if placement is None: placement = self.mgr_locs return make_block(values, placement=placement, ndim=ndim, diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index f17306b8b52f9..e3490f465b24a 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -288,9 +288,10 @@ def test_delete(self): def test_make_block_same_class(self): # issue 19431 block = create_block('M8[ns, US/Eastern]', [3]) - with tm.assert_produces_warning(FutureWarning, + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): - block.make_block_same_class(block.values, dtype=block.values.dtype) + block.make_block_same_class(block.values.values, + dtype=block.values.dtype) class TestDatetimeBlock(object): From 4618a0918e1bbdb40a493d8a32d46ab8c94fd0b4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 29 Jan 2018 15:59:32 -0800 Subject: [PATCH 005/214] catch PerformanceWarning (#19446) --- pandas/tests/series/test_operators.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 38e5753d1752d..8feee6e6cff68 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -19,6 +19,7 @@ from pandas.core.indexes.timedeltas import Timedelta import pandas.core.nanops as nanops +from pandas.errors import PerformanceWarning from pandas.compat import range, zip from pandas import compat from pandas.util.testing import (assert_series_equal, assert_almost_equal, @@ -871,8 +872,9 @@ def test_timedelta64_operations_with_DateOffset(self): expected = Series([timedelta(minutes=4, seconds=3)] * 3) assert_series_equal(result, expected) - result = td + Series([pd.offsets.Minute(1), pd.offsets.Second(3), - pd.offsets.Hour(2)]) + with tm.assert_produces_warning(PerformanceWarning): + result = td + Series([pd.offsets.Minute(1), pd.offsets.Second(3), + pd.offsets.Hour(2)]) expected = Series([timedelta(minutes=6, seconds=3), timedelta( minutes=5, seconds=6), timedelta(hours=2, minutes=5, seconds=3)]) assert_series_equal(result, expected) From 44bbd5a4d33643c9270bbefd7419f45aecaa4667 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 30 Jan 2018 06:36:16 -0500 Subject: [PATCH 006/214] CI: pin pymysql<0.8.0 (#19461) --- ci/requirements-3.6.run | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/ci/requirements-3.6.run b/ci/requirements-3.6.run index 822144a80bc9a..e30461d06b8ea 100644 --- a/ci/requirements-3.6.run +++ b/ci/requirements-3.6.run @@ -13,7 +13,7 @@ lxml html5lib jinja2 sqlalchemy -pymysql +pymysql<0.8.0 feather-format pyarrow psycopg2 From 8a567750e5d56b604411808dabf7c1c700be717a Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Tue, 30 Jan 2018 12:36:36 +0100 Subject: [PATCH 007/214] TST: fix (other check of) test for MultiIndexPyIntEngine on 32 bit (#19455) --- pandas/tests/test_multilevel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 65332ae7153e2..79e05c90a21b0 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1617,7 +1617,7 @@ def test_pyint_engine(self): # With missing key: idces = range(len(keys)) - expected = np.array([-1] + list(idces), dtype='int64') + expected = np.array([-1] + list(idces), dtype=np.intp) missing = tuple([0, 1] * 5 * N) result = index.get_indexer([missing] + [keys[i] for i in idces]) tm.assert_numpy_array_equal(result, expected) From 238499ab0a48a0ad4a2011e2ce1c6a02c86124eb Mon Sep 17 00:00:00 2001 From: Tim Hoffmann <2836374+timhoffm@users.noreply.github.com> Date: Tue, 30 Jan 2018 12:37:16 +0100 Subject: [PATCH 008/214] remove reference to deprecated .ix from 10min.rst (#19452) --- doc/source/10min.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/10min.rst b/doc/source/10min.rst index da7679d8a3f54..fbbe94a72c71e 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -154,7 +154,7 @@ Selection While standard Python / Numpy expressions for selecting and setting are intuitive and come in handy for interactive work, for production code, we recommend the optimized pandas data access methods, ``.at``, ``.iat``, - ``.loc``, ``.iloc`` and ``.ix``. + ``.loc`` and ``.iloc``. See the indexing documentation :ref:`Indexing and Selecting Data ` and :ref:`MultiIndex / Advanced Indexing `. 
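As a minimal illustrative sketch of the access methods the 10min.rst change above now recommends (this example DataFrame is hypothetical and not part of any patch in this series; it only demonstrates `.at`, `.iat`, `.loc` and `.iloc`, the documented replacements for `.ix`):

    import numpy as np
    import pandas as pd

    # small frame with a DatetimeIndex, as used in 10min.rst
    df = pd.DataFrame(np.arange(6).reshape(3, 2), columns=['A', 'B'],
                      index=pd.date_range('20130101', periods=3))

    df.loc[pd.Timestamp('2013-01-02'), 'A']   # label-based scalar lookup
    df.iloc[1, 0]                             # positional equivalent
    df.at[df.index[1], 'A']                   # fast label-based scalar access
    df.iat[1, 0]                              # fast positional scalar access
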
From d37d867cbb5665ef8cae1ff580fa377eccc84253 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 30 Jan 2018 15:55:19 -0800 Subject: [PATCH 009/214] remove unused (#19466) --- pandas/_libs/src/period_helper.c | 32 ------------------------------ pandas/_libs/src/period_helper.h | 10 ---------- pandas/_libs/tslibs/period.pyx | 9 --------- pandas/tests/scalar/test_period.py | 3 --- 4 files changed, 54 deletions(-) diff --git a/pandas/_libs/src/period_helper.c b/pandas/_libs/src/period_helper.c index 01fc46481d5b4..f1367978bd6c9 100644 --- a/pandas/_libs/src/period_helper.c +++ b/pandas/_libs/src/period_helper.c @@ -1275,38 +1275,6 @@ npy_int64 get_python_ordinal(npy_int64 period_ordinal, int freq) { return toDaily(period_ordinal, 'E', &af_info) + ORD_OFFSET; } -char *str_replace(const char *s, const char *old, const char *new) { - char *ret; - int i, count = 0; - size_t newlen = strlen(new); - size_t oldlen = strlen(old); - - for (i = 0; s[i] != '\0'; i++) { - if (strstr(&s[i], old) == &s[i]) { - count++; - i += oldlen - 1; - } - } - - ret = PyArray_malloc(i + 1 + count * (newlen - oldlen)); - if (ret == NULL) { - return (char *)PyErr_NoMemory(); - } - - i = 0; - while (*s) { - if (strstr(s, old) == s) { - strncpy(&ret[i], new, sizeof(char) * newlen); - i += newlen; - s += oldlen; - } else { - ret[i++] = *s++; - } - } - ret[i] = '\0'; - - return ret; -} // function to generate a nice string representation of the period // object, originally from DateObject_strftime diff --git a/pandas/_libs/src/period_helper.h b/pandas/_libs/src/period_helper.h index 45afc074cab72..35dd20848a2ec 100644 --- a/pandas/_libs/src/period_helper.h +++ b/pandas/_libs/src/period_helper.h @@ -112,15 +112,6 @@ frequency conversion routines. #define INT_ERR_CODE INT32_MIN -#define MEM_CHECK(item) \ - if (item == NULL) { \ - return PyErr_NoMemory(); \ - } -#define ERR_CHECK(item) \ - if (item == NULL) { \ - return NULL; \ - } - typedef struct asfreq_info { int from_week_end; // day the week ends on in the "from" frequency int to_week_end; // day the week ends on in the "to" frequency @@ -182,7 +173,6 @@ int pminute(npy_int64 ordinal, int freq); int psecond(npy_int64 ordinal, int freq); int pdays_in_month(npy_int64 ordinal, int freq); -double getAbsTime(int freq, npy_int64 dailyDate, npy_int64 originalDate); char *c_strftime(struct date_info *dinfo, char *fmt); int get_yq(npy_int64 ordinal, int freq, int *quarter, int *year); diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 46365035a0b9a..e2caebe4c4afc 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -372,15 +372,6 @@ cdef object _period_strftime(int64_t value, int freq, object fmt): ctypedef int (*accessor)(int64_t ordinal, int freq) except INT32_MIN -def get_period_field(int code, int64_t value, int freq): - cdef accessor f = _get_accessor_func(code) - if f is NULL: - raise ValueError('Unrecognized period code: %d' % code) - if value == iNaT: - return np.nan - return f(value, freq) - - def get_period_field_arr(int code, ndarray[int64_t] arr, int freq): cdef: Py_ssize_t i, sz diff --git a/pandas/tests/scalar/test_period.py b/pandas/tests/scalar/test_period.py index ce733829c2315..41b3bb55bfff1 100644 --- a/pandas/tests/scalar/test_period.py +++ b/pandas/tests/scalar/test_period.py @@ -914,9 +914,6 @@ def test_round_trip(self): class TestPeriodField(object): - def test_get_period_field_raises_on_out_of_range(self): - pytest.raises(ValueError, libperiod.get_period_field, -1, 0, 0) - def 
test_get_period_field_array_raises_on_out_of_range(self): pytest.raises(ValueError, libperiod.get_period_field_arr, -1, np.empty(1), 0) From 65639e67b045a4849c47583d3b32144089a81bb4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 31 Jan 2018 02:48:53 -0800 Subject: [PATCH 010/214] setup.py fixup, closes #19467 (#19472) --- setup.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 859d50303ecb1..721e6f62bd3e4 100755 --- a/setup.py +++ b/setup.py @@ -416,7 +416,7 @@ def get_tag(self): cmdclass['build_src'] = DummyBuildSrc cmdclass['build_ext'] = CheckingBuildExt -lib_depends = ['reduce', 'inference'] +lib_depends = ['inference'] def srcpath(name=None, suffix='.pyx', subdir='src'): @@ -508,11 +508,12 @@ def pxd(name): 'sources': ['pandas/_libs/src/parser/tokenizer.c', 'pandas/_libs/src/parser/io.c']}, '_libs.reduction': { - 'pyxfile': '_libs/reduction'}, + 'pyxfile': '_libs/reduction', + 'pxdfiles': ['_libs/src/util']}, '_libs.tslibs.period': { 'pyxfile': '_libs/tslibs/period', 'pxdfiles': ['_libs/src/util', - '_libs/lib', + '_libs/missing', '_libs/tslibs/timedeltas', '_libs/tslibs/timezones', '_libs/tslibs/nattype'], From d9daec83341baa1ae660245d12e76999feeb8d2d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 31 Jan 2018 03:17:05 -0800 Subject: [PATCH 011/214] Centralize Arithmetic Tests (#19471) --- pandas/tests/frame/test_arithmetic.py | 103 ++++++++++ pandas/tests/frame/test_operators.py | 91 +-------- pandas/tests/series/test_arithmetic.py | 211 +++++++++++++++++++- pandas/tests/series/test_datetime_values.py | 11 +- pandas/tests/series/test_operators.py | 61 ------ pandas/tests/series/test_period.py | 116 +---------- 6 files changed, 316 insertions(+), 277 deletions(-) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 3f4e3877a276a..9b99a7b73b82b 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1,11 +1,114 @@ # -*- coding: utf-8 -*- +import pytest import numpy as np import pandas as pd import pandas.util.testing as tm +# ------------------------------------------------------------------- +# Comparisons + +class TestFrameComparisons(object): + def test_df_boolean_comparison_error(self): + # GH#4576 + # boolean comparisons with a tuple/list give unexpected results + df = pd.DataFrame(np.arange(6).reshape((3, 2))) + + # not shape compatible + with pytest.raises(ValueError): + df == (2, 2) + with pytest.raises(ValueError): + df == [2, 2] + + def test_df_float_none_comparison(self): + df = pd.DataFrame(np.random.randn(8, 3), index=range(8), + columns=['A', 'B', 'C']) + + with pytest.raises(TypeError): + df.__eq__(None) + + def test_df_string_comparison(self): + df = pd.DataFrame([{"a": 1, "b": "foo"}, {"a": 2, "b": "bar"}]) + mask_a = df.a > 1 + tm.assert_frame_equal(df[mask_a], df.loc[1:1, :]) + tm.assert_frame_equal(df[-mask_a], df.loc[0:0, :]) + + mask_b = df.b == "foo" + tm.assert_frame_equal(df[mask_b], df.loc[0:0, :]) + tm.assert_frame_equal(df[-mask_b], df.loc[1:1, :]) + + @pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le']) + def test_df_flex_cmp_constant_return_types(self, opname): + # GH#15077, non-empty DataFrame + df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]}) + const = 2 + + result = getattr(df, opname)(const).get_dtype_counts() + tm.assert_series_equal(result, pd.Series([2], ['bool'])) + + @pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le']) + def 
test_df_flex_cmp_constant_return_types_empty(self, opname): + # GH#15077 empty DataFrame + df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]}) + const = 2 + + empty = df.iloc[:0] + result = getattr(empty, opname)(const).get_dtype_counts() + tm.assert_series_equal(result, pd.Series([2], ['bool'])) + + +# ------------------------------------------------------------------- +# Arithmetic + +class TestFrameArithmetic(object): + + @pytest.mark.xfail(reason='GH#7996 datetime64 units not converted to nano') + def test_df_sub_datetime64_not_ns(self): + df = pd.DataFrame(pd.date_range('20130101', periods=3)) + dt64 = np.datetime64('2013-01-01') + assert dt64.dtype == 'datetime64[D]' + res = df - dt64 + expected = pd.DataFrame([pd.Timedelta(days=0), pd.Timedelta(days=1), + pd.Timedelta(days=2)]) + tm.assert_frame_equal(res, expected) + + @pytest.mark.parametrize('data', [ + [1, 2, 3], + [1.1, 2.2, 3.3], + [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02'), pd.NaT], + ['x', 'y', 1]]) + @pytest.mark.parametrize('dtype', [None, object]) + def test_df_radd_str_invalid(self, dtype, data): + df = pd.DataFrame(data, dtype=dtype) + with pytest.raises(TypeError): + 'foo_' + df + + @pytest.mark.parametrize('dtype', [None, object]) + def test_df_with_dtype_radd_int(self, dtype): + df = pd.DataFrame([1, 2, 3], dtype=dtype) + expected = pd.DataFrame([2, 3, 4], dtype=dtype) + result = 1 + df + tm.assert_frame_equal(result, expected) + result = df + 1 + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize('dtype', [None, object]) + def test_df_with_dtype_radd_nan(self, dtype): + df = pd.DataFrame([1, 2, 3], dtype=dtype) + expected = pd.DataFrame([np.nan, np.nan, np.nan], dtype=dtype) + result = np.nan + df + tm.assert_frame_equal(result, expected) + result = df + np.nan + tm.assert_frame_equal(result, expected) + + def test_df_radd_str(self): + df = pd.DataFrame(['x', np.nan, 'x']) + tm.assert_frame_equal('a' + df, pd.DataFrame(['ax', np.nan, 'ax'])) + tm.assert_frame_equal(df + 'a', pd.DataFrame(['xa', np.nan, 'xa'])) + + class TestPeriodFrameArithmetic(object): def test_ops_frame_period(self): diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 0bc4a7df6a55b..bdccbec6111d3 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -10,7 +10,7 @@ from numpy import nan, random import numpy as np -from pandas.compat import lrange, range +from pandas.compat import range from pandas import compat from pandas import (DataFrame, Series, MultiIndex, Timestamp, date_range) @@ -28,53 +28,6 @@ _check_mixed_int) -class TestDataFrameArithmetic(object): - - @pytest.mark.xfail(reason='GH#7996 datetime64 units not converted to nano') - def test_frame_sub_datetime64_not_ns(self): - df = pd.DataFrame(date_range('20130101', periods=3)) - dt64 = np.datetime64('2013-01-01') - assert dt64.dtype == 'datetime64[D]' - res = df - dt64 - expected = pd.DataFrame([pd.Timedelta(days=0), pd.Timedelta(days=1), - pd.Timedelta(days=2)]) - tm.assert_frame_equal(res, expected) - - @pytest.mark.parametrize('data', [ - [1, 2, 3], - [1.1, 2.2, 3.3], - [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02'), pd.NaT], - ['x', 'y', 1]]) - @pytest.mark.parametrize('dtype', [None, object]) - def test_frame_radd_str_invalid(self, dtype, data): - df = DataFrame(data, dtype=dtype) - with pytest.raises(TypeError): - 'foo_' + df - - @pytest.mark.parametrize('dtype', [None, object]) - def test_frame_with_dtype_radd_int(self, dtype): - df = pd.DataFrame([1, 2, 
3], dtype=dtype) - expected = pd.DataFrame([2, 3, 4], dtype=dtype) - result = 1 + df - assert_frame_equal(result, expected) - result = df + 1 - assert_frame_equal(result, expected) - - @pytest.mark.parametrize('dtype', [None, object]) - def test_frame_with_dtype_radd_nan(self, dtype): - df = pd.DataFrame([1, 2, 3], dtype=dtype) - expected = pd.DataFrame([np.nan, np.nan, np.nan], dtype=dtype) - result = np.nan + df - assert_frame_equal(result, expected) - result = df + np.nan - assert_frame_equal(result, expected) - - def test_frame_radd_str(self): - df = pd.DataFrame(['x', np.nan, 'x']) - assert_frame_equal('a' + df, pd.DataFrame(['ax', np.nan, 'ax'])) - assert_frame_equal(df + 'a', pd.DataFrame(['xa', np.nan, 'xa'])) - - class TestDataFrameOperators(TestData): def test_operators(self): @@ -714,22 +667,6 @@ def _test_seq(df, idx_ser, col_ser): exp = DataFrame({'col': [False, True, False]}) assert_frame_equal(result, exp) - def test_return_dtypes_bool_op_costant(self): - # GH15077 - df = DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]}) - const = 2 - - # not empty DataFrame - for op in ['eq', 'ne', 'gt', 'lt', 'ge', 'le']: - result = getattr(df, op)(const).get_dtype_counts() - tm.assert_series_equal(result, Series([2], ['bool'])) - - # empty DataFrame - empty = df.iloc[:0] - for op in ['eq', 'ne', 'gt', 'lt', 'ge', 'le']: - result = getattr(empty, op)(const).get_dtype_counts() - tm.assert_series_equal(result, Series([2], ['bool'])) - def test_dti_tz_convert_to_utc(self): base = pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], tz='UTC') @@ -1009,22 +946,6 @@ def test_comparison_protected_from_errstate(self): result = (missing_df < 0).values tm.assert_numpy_array_equal(result, expected) - def test_string_comparison(self): - df = DataFrame([{"a": 1, "b": "foo"}, {"a": 2, "b": "bar"}]) - mask_a = df.a > 1 - assert_frame_equal(df[mask_a], df.loc[1:1, :]) - assert_frame_equal(df[-mask_a], df.loc[0:0, :]) - - mask_b = df.b == "foo" - assert_frame_equal(df[mask_b], df.loc[0:0, :]) - assert_frame_equal(df[-mask_b], df.loc[1:1, :]) - - def test_float_none_comparison(self): - df = DataFrame(np.random.randn(8, 3), index=lrange(8), - columns=['A', 'B', 'C']) - - pytest.raises(TypeError, df.__eq__, None) - def test_boolean_comparison(self): # GH 4576 @@ -1091,16 +1012,6 @@ def test_boolean_comparison(self): result = df == tup assert_frame_equal(result, expected) - def test_boolean_comparison_error(self): - - # GH 4576 - # boolean comparisons with a tuple/list give unexpected results - df = DataFrame(np.arange(6).reshape((3, 2))) - - # not shape compatible - pytest.raises(ValueError, lambda: df == (2, 2)) - pytest.raises(ValueError, lambda: df == [2, 2]) - def test_combine_generic(self): df1 = self.frame df2 = self.frame.loc[self.frame.index[:-5], ['A', 'B', 'C']] diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index ca558dd6b7cd5..1d9fa9dc15531 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -1,13 +1,20 @@ # -*- coding: utf-8 -*- -from datetime import timedelta +from datetime import datetime, timedelta import operator import numpy as np +import pytest + +from pandas import Series, Timestamp, Period +from pandas._libs.tslibs.period import IncompatibleFrequency import pandas as pd import pandas.util.testing as tm +# ------------------------------------------------------------------ +# Comparisons + class TestSeriesComparison(object): def test_compare_invalid(self): # GH#8058 @@ -17,8 +24,39 @@ def 
test_compare_invalid(self): b.name = pd.Timestamp('2000-01-01') tm.assert_series_equal(a / b, 1 / (b / a)) + @pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le']) + def test_ser_flex_cmp_return_dtypes(self, opname): + # GH#15115 + ser = Series([1, 3, 2], index=range(3)) + const = 2 + + result = getattr(ser, opname)(const).get_dtype_counts() + tm.assert_series_equal(result, Series([1], ['bool'])) + + @pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le']) + def test_ser_flex_cmp_return_dtypes_empty(self, opname): + # GH#15115 empty Series case + ser = Series([1, 3, 2], index=range(3)) + empty = ser.iloc[:0] + const = 2 + + result = getattr(empty, opname)(const).get_dtype_counts() + tm.assert_series_equal(result, Series([1], ['bool'])) + class TestTimestampSeriesComparison(object): + def test_dt64ser_cmp_period_scalar(self): + ser = Series(pd.period_range('2000-01-01', periods=10, freq='D')) + val = Period('2000-01-04', freq='D') + result = ser > val + expected = Series([x > val for x in ser]) + tm.assert_series_equal(result, expected) + + val = ser[5] + result = ser > val + expected = Series([x > val for x in ser]) + tm.assert_series_equal(result, expected) + def test_timestamp_compare_series(self): # make sure we can compare Timestamps on the right AND left hand side # GH#4982 @@ -81,6 +119,170 @@ def test_compare_timedelta_series(self): tm.assert_series_equal(actual, expected) +class TestPeriodSeriesComparisons(object): + @pytest.mark.parametrize('freq', ['M', '2M', '3M']) + def test_cmp_series_period_scalar(self, freq): + # GH 13200 + base = Series([Period(x, freq=freq) for x in + ['2011-01', '2011-02', '2011-03', '2011-04']]) + p = Period('2011-02', freq=freq) + + exp = Series([False, True, False, False]) + tm.assert_series_equal(base == p, exp) + tm.assert_series_equal(p == base, exp) + + exp = Series([True, False, True, True]) + tm.assert_series_equal(base != p, exp) + tm.assert_series_equal(p != base, exp) + + exp = Series([False, False, True, True]) + tm.assert_series_equal(base > p, exp) + tm.assert_series_equal(p < base, exp) + + exp = Series([True, False, False, False]) + tm.assert_series_equal(base < p, exp) + tm.assert_series_equal(p > base, exp) + + exp = Series([False, True, True, True]) + tm.assert_series_equal(base >= p, exp) + tm.assert_series_equal(p <= base, exp) + + exp = Series([True, True, False, False]) + tm.assert_series_equal(base <= p, exp) + tm.assert_series_equal(p >= base, exp) + + # different base freq + msg = "Input has different freq=A-DEC from Period" + with tm.assert_raises_regex(IncompatibleFrequency, msg): + base <= Period('2011', freq='A') + + with tm.assert_raises_regex(IncompatibleFrequency, msg): + Period('2011', freq='A') >= base + + @pytest.mark.parametrize('freq', ['M', '2M', '3M']) + def test_cmp_series_period_series(self, freq): + # GH#13200 + base = Series([Period(x, freq=freq) for x in + ['2011-01', '2011-02', '2011-03', '2011-04']]) + + ser = Series([Period(x, freq=freq) for x in + ['2011-02', '2011-01', '2011-03', '2011-05']]) + + exp = Series([False, False, True, False]) + tm.assert_series_equal(base == ser, exp) + + exp = Series([True, True, False, True]) + tm.assert_series_equal(base != ser, exp) + + exp = Series([False, True, False, False]) + tm.assert_series_equal(base > ser, exp) + + exp = Series([True, False, False, True]) + tm.assert_series_equal(base < ser, exp) + + exp = Series([False, True, True, False]) + tm.assert_series_equal(base >= ser, exp) + + exp = Series([True, False, True, True]) + 
tm.assert_series_equal(base <= ser, exp) + + ser2 = Series([Period(x, freq='A') for x in + ['2011', '2011', '2011', '2011']]) + + # different base freq + msg = "Input has different freq=A-DEC from Period" + with tm.assert_raises_regex(IncompatibleFrequency, msg): + base <= ser2 + + def test_cmp_series_period_series_mixed_freq(self): + # GH#13200 + base = Series([Period('2011', freq='A'), + Period('2011-02', freq='M'), + Period('2013', freq='A'), + Period('2011-04', freq='M')]) + + ser = Series([Period('2012', freq='A'), + Period('2011-01', freq='M'), + Period('2013', freq='A'), + Period('2011-05', freq='M')]) + + exp = Series([False, False, True, False]) + tm.assert_series_equal(base == ser, exp) + + exp = Series([True, True, False, True]) + tm.assert_series_equal(base != ser, exp) + + exp = Series([False, True, False, False]) + tm.assert_series_equal(base > ser, exp) + + exp = Series([True, False, False, True]) + tm.assert_series_equal(base < ser, exp) + + exp = Series([False, True, True, False]) + tm.assert_series_equal(base >= ser, exp) + + exp = Series([True, False, True, True]) + tm.assert_series_equal(base <= ser, exp) + + +# ------------------------------------------------------------------ +# Arithmetic + +class TestSeriesArithmetic(object): + # Standard, numeric, or otherwise not-Timestamp/Timedelta/Period dtypes + @pytest.mark.parametrize('data', [ + [1, 2, 3], + [1.1, 2.2, 3.3], + [Timestamp('2011-01-01'), Timestamp('2011-01-02'), pd.NaT], + ['x', 'y', 1]]) + @pytest.mark.parametrize('dtype', [None, object]) + def test_series_radd_str_invalid(self, dtype, data): + ser = Series(data, dtype=dtype) + with pytest.raises(TypeError): + 'foo_' + ser + + # TODO: parametrize, better name + def test_object_ser_add_invalid(self): + # invalid ops + obj_ser = tm.makeObjectSeries() + obj_ser.name = 'objects' + with pytest.raises(Exception): + obj_ser + 1 + with pytest.raises(Exception): + obj_ser + np.array(1, dtype=np.int64) + with pytest.raises(Exception): + obj_ser - 1 + with pytest.raises(Exception): + obj_ser - np.array(1, dtype=np.int64) + + @pytest.mark.parametrize('dtype', [None, object]) + def test_series_with_dtype_radd_nan(self, dtype): + ser = pd.Series([1, 2, 3], dtype=dtype) + expected = pd.Series([np.nan, np.nan, np.nan], dtype=dtype) + + result = np.nan + ser + tm.assert_series_equal(result, expected) + + result = ser + np.nan + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('dtype', [None, object]) + def test_series_with_dtype_radd_int(self, dtype): + ser = pd.Series([1, 2, 3], dtype=dtype) + expected = pd.Series([2, 3, 4], dtype=dtype) + + result = 1 + ser + tm.assert_series_equal(result, expected) + + result = ser + 1 + tm.assert_series_equal(result, expected) + + def test_series_radd_str(self): + ser = pd.Series(['x', np.nan, 'x']) + tm.assert_series_equal('a' + ser, pd.Series(['ax', np.nan, 'ax'])) + tm.assert_series_equal(ser + 'a', pd.Series(['xa', np.nan, 'xa'])) + + class TestPeriodSeriesArithmetic(object): def test_ops_series_timedelta(self): # GH 13043 @@ -134,3 +336,10 @@ def test_timestamp_sub_series(self): np.timedelta64(1, 'D')]) tm.assert_series_equal(ser - ts, delta_series) tm.assert_series_equal(ts - ser, -delta_series) + + def test_dt64ser_sub_datetime_dtype(self): + ts = Timestamp(datetime(1993, 1, 7, 13, 30, 00)) + dt = datetime(1993, 6, 22, 13, 30) + ser = Series([ts]) + result = pd.to_timedelta(np.abs(ser - dt)) + assert result.dtype == 'timedelta64[ns]' diff --git a/pandas/tests/series/test_datetime_values.py 
b/pandas/tests/series/test_datetime_values.py index b79d8def905af..49b4600b10738 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -11,7 +11,7 @@ from pandas.core.dtypes.common import is_integer_dtype, is_list_like from pandas import (Index, Series, DataFrame, bdate_range, date_range, period_range, timedelta_range, - PeriodIndex, Timestamp, DatetimeIndex, TimedeltaIndex) + PeriodIndex, DatetimeIndex, TimedeltaIndex) import pandas.core.common as com from pandas.util.testing import assert_series_equal @@ -377,15 +377,6 @@ def test_dt_accessor_api(self): s.dt assert not hasattr(s, 'dt') - def test_sub_of_datetime_from_TimeSeries(self): - from pandas.core.tools.timedeltas import to_timedelta - from datetime import datetime - a = Timestamp(datetime(1993, 0o1, 0o7, 13, 30, 00)) - b = datetime(1993, 6, 22, 13, 30) - a = Series([a]) - result = to_timedelta(np.abs(a - b)) - assert result.dtype == 'timedelta64[ns]' - def test_between(self): s = Series(bdate_range('1/1/2000', periods=20).astype(object)) s[::2] = np.nan diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 8feee6e6cff68..05ccb25960b1f 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -1686,15 +1686,6 @@ def test_operators_empty_int_corner(self): s2 = Series({'x': 0.}) assert_series_equal(s1 * s2, Series([np.nan], index=['x'])) - def test_invalid_ops(self): - # invalid ops - pytest.raises(Exception, self.objSeries.__add__, 1) - pytest.raises(Exception, self.objSeries.__add__, - np.array(1, dtype=np.int64)) - pytest.raises(Exception, self.objSeries.__sub__, 1) - pytest.raises(Exception, self.objSeries.__sub__, - np.array(1, dtype=np.int64)) - @pytest.mark.parametrize("m", [1, 3, 10]) @pytest.mark.parametrize("unit", ['D', 'h', 'm', 's', 'ms', 'us', 'ns']) def test_timedelta64_conversions(self, m, unit): @@ -1817,20 +1808,6 @@ def test_ops_datetimelike_align(self): result = (dt2.to_frame() - dt.to_frame())[0] assert_series_equal(result, expected) - def test_return_dtypes_bool_op_costant(self): - # gh15115 - s = pd.Series([1, 3, 2], index=range(3)) - const = 2 - for op in ['eq', 'ne', 'gt', 'lt', 'ge', 'le']: - result = getattr(s, op)(const).get_dtype_counts() - tm.assert_series_equal(result, Series([1], ['bool'])) - - # empty Series - empty = s.iloc[:0] - for op in ['eq', 'ne', 'gt', 'lt', 'ge', 'le']: - result = getattr(empty, op)(const).get_dtype_counts() - tm.assert_series_equal(result, Series([1], ['bool'])) - def test_operators_bitwise(self): # GH 9016: support bitwise op for integer types index = list('bca') @@ -2115,11 +2092,6 @@ def test_series_frame_radd_bug(self): with pytest.raises(TypeError): self.ts + datetime.now() - def test_series_radd_str(self): - ser = pd.Series(['x', np.nan, 'x']) - assert_series_equal('a' + ser, pd.Series(['ax', np.nan, 'ax'])) - assert_series_equal(ser + 'a', pd.Series(['xa', np.nan, 'xa'])) - @pytest.mark.parametrize('dtype', [None, object]) def test_series_with_dtype_radd_timedelta(self, dtype): ser = pd.Series([pd.Timedelta('1 days'), pd.Timedelta('2 days'), @@ -2133,39 +2105,6 @@ def test_series_with_dtype_radd_timedelta(self, dtype): result = ser + pd.Timedelta('3 days') assert_series_equal(result, expected) - @pytest.mark.parametrize('dtype', [None, object]) - def test_series_with_dtype_radd_int(self, dtype): - ser = pd.Series([1, 2, 3], dtype=dtype) - expected = pd.Series([2, 3, 4], dtype=dtype) - - result = 1 + ser - assert_series_equal(result, 
expected) - - result = ser + 1 - assert_series_equal(result, expected) - - @pytest.mark.parametrize('dtype', [None, object]) - def test_series_with_dtype_radd_nan(self, dtype): - ser = pd.Series([1, 2, 3], dtype=dtype) - expected = pd.Series([np.nan, np.nan, np.nan], dtype=dtype) - - result = np.nan + ser - assert_series_equal(result, expected) - - result = ser + np.nan - assert_series_equal(result, expected) - - @pytest.mark.parametrize('data', [ - [1, 2, 3], - [1.1, 2.2, 3.3], - [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02'), pd.NaT], - ['x', 'y', 1]]) - @pytest.mark.parametrize('dtype', [None, object]) - def test_series_radd_str_invalid(self, dtype, data): - ser = Series(data, dtype=dtype) - with pytest.raises(TypeError): - 'foo_' + ser - def test_operators_frame(self): # rpow does not work with DataFrame df = DataFrame({'A': self.ts}) diff --git a/pandas/tests/series/test_period.py b/pandas/tests/series/test_period.py index 9d5ef5e51ff20..8ff2071e351d0 100644 --- a/pandas/tests/series/test_period.py +++ b/pandas/tests/series/test_period.py @@ -3,7 +3,7 @@ import pandas as pd import pandas.util.testing as tm import pandas.core.indexes.period as period -from pandas import Series, period_range, DataFrame, Period +from pandas import Series, period_range, DataFrame def _permute(obj): @@ -63,17 +63,6 @@ def test_dropna(self): tm.assert_series_equal(s.dropna(), Series([pd.Period('2011-01', freq='M')])) - def test_series_comparison_scalars(self): - val = pd.Period('2000-01-04', freq='D') - result = self.series > val - expected = pd.Series([x > val for x in self.series]) - tm.assert_series_equal(result, expected) - - val = self.series[5] - result = self.series > val - expected = pd.Series([x > val for x in self.series]) - tm.assert_series_equal(result, expected) - def test_between(self): left, right = self.series[[2, 7]] result = self.series.between(left, right) @@ -128,109 +117,6 @@ def test_intercept_astype_object(self): result = df.values.squeeze() assert (result[:, 0] == expected.values).all() - def test_comp_series_period_scalar(self): - # GH 13200 - for freq in ['M', '2M', '3M']: - base = Series([Period(x, freq=freq) for x in - ['2011-01', '2011-02', '2011-03', '2011-04']]) - p = Period('2011-02', freq=freq) - - exp = pd.Series([False, True, False, False]) - tm.assert_series_equal(base == p, exp) - tm.assert_series_equal(p == base, exp) - - exp = pd.Series([True, False, True, True]) - tm.assert_series_equal(base != p, exp) - tm.assert_series_equal(p != base, exp) - - exp = pd.Series([False, False, True, True]) - tm.assert_series_equal(base > p, exp) - tm.assert_series_equal(p < base, exp) - - exp = pd.Series([True, False, False, False]) - tm.assert_series_equal(base < p, exp) - tm.assert_series_equal(p > base, exp) - - exp = pd.Series([False, True, True, True]) - tm.assert_series_equal(base >= p, exp) - tm.assert_series_equal(p <= base, exp) - - exp = pd.Series([True, True, False, False]) - tm.assert_series_equal(base <= p, exp) - tm.assert_series_equal(p >= base, exp) - - # different base freq - msg = "Input has different freq=A-DEC from Period" - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - base <= Period('2011', freq='A') - - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - Period('2011', freq='A') >= base - - def test_comp_series_period_series(self): - # GH 13200 - for freq in ['M', '2M', '3M']: - base = Series([Period(x, freq=freq) for x in - ['2011-01', '2011-02', '2011-03', '2011-04']]) - - s = Series([Period(x, freq=freq) for x 
in - ['2011-02', '2011-01', '2011-03', '2011-05']]) - - exp = Series([False, False, True, False]) - tm.assert_series_equal(base == s, exp) - - exp = Series([True, True, False, True]) - tm.assert_series_equal(base != s, exp) - - exp = Series([False, True, False, False]) - tm.assert_series_equal(base > s, exp) - - exp = Series([True, False, False, True]) - tm.assert_series_equal(base < s, exp) - - exp = Series([False, True, True, False]) - tm.assert_series_equal(base >= s, exp) - - exp = Series([True, False, True, True]) - tm.assert_series_equal(base <= s, exp) - - s2 = Series([Period(x, freq='A') for x in - ['2011', '2011', '2011', '2011']]) - - # different base freq - msg = "Input has different freq=A-DEC from Period" - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - base <= s2 - - def test_comp_series_period_object(self): - # GH 13200 - base = Series([Period('2011', freq='A'), Period('2011-02', freq='M'), - Period('2013', freq='A'), Period('2011-04', freq='M')]) - - s = Series([Period('2012', freq='A'), Period('2011-01', freq='M'), - Period('2013', freq='A'), Period('2011-05', freq='M')]) - - exp = Series([False, False, True, False]) - tm.assert_series_equal(base == s, exp) - - exp = Series([True, True, False, True]) - tm.assert_series_equal(base != s, exp) - - exp = Series([False, True, False, False]) - tm.assert_series_equal(base > s, exp) - - exp = Series([True, False, False, True]) - tm.assert_series_equal(base < s, exp) - - exp = Series([False, True, True, False]) - tm.assert_series_equal(base >= s, exp) - - exp = Series([True, False, True, True]) - tm.assert_series_equal(base <= s, exp) - def test_align_series(self): rng = period_range('1/1/2000', '1/1/2010', freq='A') ts = Series(np.random.randn(len(rng)), index=rng) From 01cbc645ec4e3858ea0a098d5afc46f22a7e3e06 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 31 Jan 2018 03:30:21 -0800 Subject: [PATCH 012/214] implement bits of numpy_helper in cython where possible (#19450) --- pandas/_libs/src/numpy_helper.h | 40 --------------- pandas/_libs/src/util.pxd | 89 +++++++++++++++++++++++++++++---- setup.py | 3 +- 3 files changed, 81 insertions(+), 51 deletions(-) diff --git a/pandas/_libs/src/numpy_helper.h b/pandas/_libs/src/numpy_helper.h index 6c2029fff8a1a..844be9b292be3 100644 --- a/pandas/_libs/src/numpy_helper.h +++ b/pandas/_libs/src/numpy_helper.h @@ -18,33 +18,6 @@ The full license is in the LICENSE file, distributed with this software. 
PANDAS_INLINE npy_int64 get_nat(void) { return NPY_MIN_INT64; } -PANDAS_INLINE int is_integer_object(PyObject* obj) { - return (!PyBool_Check(obj)) && PyArray_IsIntegerScalar(obj); -} - -PANDAS_INLINE int is_float_object(PyObject* obj) { - return (PyFloat_Check(obj) || PyArray_IsScalar(obj, Floating)); -} -PANDAS_INLINE int is_complex_object(PyObject* obj) { - return (PyComplex_Check(obj) || PyArray_IsScalar(obj, ComplexFloating)); -} - -PANDAS_INLINE int is_bool_object(PyObject* obj) { - return (PyBool_Check(obj) || PyArray_IsScalar(obj, Bool)); -} - -PANDAS_INLINE int is_string_object(PyObject* obj) { - return (PyString_Check(obj) || PyUnicode_Check(obj)); -} - -PANDAS_INLINE int is_datetime64_object(PyObject* obj) { - return PyArray_IsScalar(obj, Datetime); -} - -PANDAS_INLINE int is_timedelta64_object(PyObject* obj) { - return PyArray_IsScalar(obj, Timedelta); -} - PANDAS_INLINE int assign_value_1d(PyArrayObject* ap, Py_ssize_t _i, PyObject* v) { npy_intp i = (npy_intp)_i; @@ -80,17 +53,4 @@ void set_array_not_contiguous(PyArrayObject* ao) { ao->flags &= ~(NPY_C_CONTIGUOUS | NPY_F_CONTIGUOUS); } -// If arr is zerodim array, return a proper array scalar (e.g. np.int64). -// Otherwise, return arr as is. -PANDAS_INLINE PyObject* unbox_if_zerodim(PyObject* arr) { - if (PyArray_IsZeroDim(arr)) { - PyObject* ret; - ret = PyArray_ToScalar(PyArray_DATA(arr), arr); - return ret; - } else { - Py_INCREF(arr); - return arr; - } -} - #endif // PANDAS__LIBS_SRC_NUMPY_HELPER_H_ diff --git a/pandas/_libs/src/util.pxd b/pandas/_libs/src/util.pxd index be6591a118dc5..cf23df1279f34 100644 --- a/pandas/_libs/src/util.pxd +++ b/pandas/_libs/src/util.pxd @@ -1,24 +1,76 @@ -from numpy cimport ndarray +from numpy cimport ndarray, NPY_C_CONTIGUOUS, NPY_F_CONTIGUOUS cimport numpy as cnp +cnp.import_array() + cimport cpython +from cpython cimport PyTypeObject + +cdef extern from "Python.h": + # Note: importing extern-style allows us to declare these as nogil + # functions, whereas `from cpython cimport` does not. 
+ bint PyUnicode_Check(object obj) nogil + bint PyString_Check(object obj) nogil + bint PyBool_Check(object obj) nogil + bint PyFloat_Check(object obj) nogil + bint PyComplex_Check(object obj) nogil + bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil + + +cdef extern from "numpy/arrayobject.h": + PyTypeObject PyFloatingArrType_Type + +cdef extern from "numpy/ndarrayobject.h": + PyTypeObject PyTimedeltaArrType_Type + PyTypeObject PyDatetimeArrType_Type + PyTypeObject PyComplexFloatingArrType_Type + PyTypeObject PyBoolArrType_Type + + bint PyArray_IsIntegerScalar(obj) nogil + bint PyArray_Check(obj) nogil + +# -------------------------------------------------------------------- +# Type Checking + +cdef inline bint is_string_object(object obj) nogil: + return PyString_Check(obj) or PyUnicode_Check(obj) + + +cdef inline bint is_integer_object(object obj) nogil: + return not PyBool_Check(obj) and PyArray_IsIntegerScalar(obj) + + +cdef inline bint is_float_object(object obj) nogil: + return (PyFloat_Check(obj) or + (PyObject_TypeCheck(obj, &PyFloatingArrType_Type))) + +cdef inline bint is_complex_object(object obj) nogil: + return (PyComplex_Check(obj) or + PyObject_TypeCheck(obj, &PyComplexFloatingArrType_Type)) + + +cdef inline bint is_bool_object(object obj) nogil: + return (PyBool_Check(obj) or + PyObject_TypeCheck(obj, &PyBoolArrType_Type)) + + +cdef inline bint is_timedelta64_object(object obj) nogil: + return PyObject_TypeCheck(obj, &PyTimedeltaArrType_Type) + + +cdef inline bint is_datetime64_object(object obj) nogil: + return PyObject_TypeCheck(obj, &PyDatetimeArrType_Type) + +# -------------------------------------------------------------------- cdef extern from "numpy_helper.h": void set_array_not_contiguous(ndarray ao) - int is_integer_object(object) - int is_float_object(object) - int is_complex_object(object) - int is_bool_object(object) - int is_string_object(object) - int is_datetime64_object(object) - int is_timedelta64_object(object) int assign_value_1d(ndarray, Py_ssize_t, object) except -1 cnp.int64_t get_nat() object get_value_1d(ndarray, Py_ssize_t) char *get_c_string(object) except NULL object char_to_string(char*) - object unbox_if_zerodim(object arr) ctypedef fused numeric: cnp.int8_t @@ -112,3 +164,22 @@ cdef inline bint _checknan(object val): cdef inline bint is_period_object(object val): return getattr(val, '_typ', '_typ') == 'period' + + +cdef inline object unbox_if_zerodim(object arr): + """ + If arr is zerodim array, return a proper array scalar (e.g. np.int64). + Otherwise, return arr as is. 
+ + Parameters + ---------- + arr : object + + Returns + ------- + result : object + """ + if cnp.PyArray_IsZeroDim(arr): + return cnp.PyArray_ToScalar(cnp.PyArray_DATA(arr), arr) + else: + return arr diff --git a/setup.py b/setup.py index 721e6f62bd3e4..27943a776c414 100755 --- a/setup.py +++ b/setup.py @@ -687,8 +687,7 @@ def pxd(name): ext.sources[0] = root + suffix ujson_ext = Extension('pandas._libs.json', - depends=['pandas/_libs/src/ujson/lib/ultrajson.h', - 'pandas/_libs/src/numpy_helper.h'], + depends=['pandas/_libs/src/ujson/lib/ultrajson.h'], sources=(['pandas/_libs/src/ujson/python/ujson.c', 'pandas/_libs/src/ujson/python/objToJSON.c', 'pandas/_libs/src/ujson/python/JSONtoObj.c', From 1bd7b3ad1644ad9d26ac02f507ec7cc0832377bb Mon Sep 17 00:00:00 2001 From: Ming Li <14131823+minggli@users.noreply.github.com> Date: Wed, 31 Jan 2018 11:34:12 +0000 Subject: [PATCH 013/214] [#7292] BUG: asfreq / pct_change strange behavior (#19410) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/generic.py | 1 + pandas/tests/frame/test_timeseries.py | 36 +++++++++++++++++++++++++- pandas/tests/series/test_timeseries.py | 31 +++++++++++++++++++++- 4 files changed, 67 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 1890636bc8e1a..4a5f0bda8c692 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -420,6 +420,7 @@ Datetimelike - Bug in ``.astype()`` to non-ns timedelta units would hold the incorrect dtype (:issue:`19176`, :issue:`19223`, :issue:`12425`) - Bug in subtracting :class:`Series` from ``NaT`` incorrectly returning ``NaT`` (:issue:`19158`) - Bug in :func:`Series.truncate` which raises ``TypeError`` with a monotonic ``PeriodIndex`` (:issue:`17717`) +- Bug in :func:`~DataFrame.pct_change` using ``periods`` and ``freq`` returned different length outputs (:issue:`7292`) Timezones ^^^^^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6e777281b11e1..bee954aa9bba8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7315,6 +7315,7 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, rs = (data.div(data.shift(periods=periods, freq=freq, axis=axis, **kwargs)) - 1) + rs = rs.reindex_like(data) if freq is None: mask = isna(com._values_from_object(self)) np.putmask(rs.values, mask, np.nan) diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index 3af798acdede5..e6b47fd69cb05 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -108,7 +108,9 @@ def test_pct_change(self): rs = self.tsframe.pct_change(freq='5D') filled = self.tsframe.fillna(method='pad') - assert_frame_equal(rs, filled / filled.shift(freq='5D') - 1) + assert_frame_equal(rs, + (filled / filled.shift(freq='5D') - 1) + .reindex_like(filled)) def test_pct_change_shift_over_nas(self): s = Series([1., 1.5, np.nan, 2.5, 3.]) @@ -120,6 +122,38 @@ def test_pct_change_shift_over_nas(self): edf = DataFrame({'a': expected, 'b': expected}) assert_frame_equal(chg, edf) + def test_pct_change_periods_freq(self): + # GH 7292 + rs_freq = self.tsframe.pct_change(freq='5B') + rs_periods = self.tsframe.pct_change(5) + assert_frame_equal(rs_freq, rs_periods) + + rs_freq = self.tsframe.pct_change(freq='3B', fill_method=None) + rs_periods = self.tsframe.pct_change(3, fill_method=None) + assert_frame_equal(rs_freq, rs_periods) + + rs_freq = self.tsframe.pct_change(freq='3B', fill_method='bfill') + rs_periods = 
self.tsframe.pct_change(3, fill_method='bfill') + assert_frame_equal(rs_freq, rs_periods) + + rs_freq = self.tsframe.pct_change(freq='7B', + fill_method='pad', + limit=1) + rs_periods = self.tsframe.pct_change(7, fill_method='pad', limit=1) + assert_frame_equal(rs_freq, rs_periods) + + rs_freq = self.tsframe.pct_change(freq='7B', + fill_method='bfill', + limit=3) + rs_periods = self.tsframe.pct_change(7, fill_method='bfill', limit=3) + assert_frame_equal(rs_freq, rs_periods) + + empty_ts = DataFrame(index=self.tsframe.index, + columns=self.tsframe.columns) + rs_freq = empty_ts.pct_change(freq='14B') + rs_periods = empty_ts.pct_change(14) + assert_frame_equal(rs_freq, rs_periods) + def test_frame_ctor_datetime64_column(self): rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s') dates = np.asarray(rng) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 7be801629e387..7a1aff1cc223c 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -344,7 +344,9 @@ def test_pct_change(self): rs = self.ts.pct_change(freq='5D') filled = self.ts.fillna(method='pad') - assert_series_equal(rs, filled / filled.shift(freq='5D') - 1) + assert_series_equal(rs, + (filled / filled.shift(freq='5D') - 1) + .reindex_like(filled)) def test_pct_change_shift_over_nas(self): s = Series([1., 1.5, np.nan, 2.5, 3.]) @@ -353,6 +355,33 @@ def test_pct_change_shift_over_nas(self): expected = Series([np.nan, 0.5, np.nan, 2.5 / 1.5 - 1, .2]) assert_series_equal(chg, expected) + def test_pct_change_periods_freq(self): + # GH 7292 + rs_freq = self.ts.pct_change(freq='5B') + rs_periods = self.ts.pct_change(5) + assert_series_equal(rs_freq, rs_periods) + + rs_freq = self.ts.pct_change(freq='3B', fill_method=None) + rs_periods = self.ts.pct_change(3, fill_method=None) + assert_series_equal(rs_freq, rs_periods) + + rs_freq = self.ts.pct_change(freq='3B', fill_method='bfill') + rs_periods = self.ts.pct_change(3, fill_method='bfill') + assert_series_equal(rs_freq, rs_periods) + + rs_freq = self.ts.pct_change(freq='7B', fill_method='pad', limit=1) + rs_periods = self.ts.pct_change(7, fill_method='pad', limit=1) + assert_series_equal(rs_freq, rs_periods) + + rs_freq = self.ts.pct_change(freq='7B', fill_method='bfill', limit=3) + rs_periods = self.ts.pct_change(7, fill_method='bfill', limit=3) + assert_series_equal(rs_freq, rs_periods) + + empty_ts = Series(index=self.ts.index) + rs_freq = empty_ts.pct_change(freq='14B') + rs_periods = empty_ts.pct_change(14) + assert_series_equal(rs_freq, rs_periods) + def test_autocorr(self): # Just run the function corr1 = self.ts.autocorr() From fb3b23782534c925ab7158c59dcb32c8f8390d71 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Wed, 31 Jan 2018 12:15:14 +0000 Subject: [PATCH 014/214] DEPR: Deprecate from_items (#18529) --- doc/source/whatsnew/v0.23.0.txt | 2 +- pandas/core/frame.py | 15 ++++- pandas/io/stata.py | 9 +-- pandas/tests/frame/test_constructors.py | 69 ++++++++++++++------ pandas/tests/frame/test_nonunique_indexes.py | 7 +- pandas/tests/io/parser/common.py | 6 +- pandas/tests/io/test_excel.py | 58 ++++++++-------- pandas/tests/io/test_stata.py | 3 +- 8 files changed, 106 insertions(+), 63 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 4a5f0bda8c692..592c0788070a1 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -329,7 +329,7 @@ Deprecations - :func:`read_excel` has deprecated the 
``skip_footer`` parameter. Use ``skipfooter`` instead (:issue:`18836`) - The ``is_copy`` attribute is deprecated and will be removed in a future version (:issue:`18801`). - ``IntervalIndex.from_intervals`` is deprecated in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) - +- :func:``DataFrame.from_items`` is deprecated. Use :func:``DataFrame.from_dict()`` instead, or :func:``DataFrame.from_dict(OrderedDict())`` if you wish to preserve the key order (:issue:`17320`) .. _whatsnew_0230.prior_deprecations: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 788b236b0ec59..96d28581cfdd9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -313,7 +313,7 @@ def _constructor(self): _constructor_sliced = Series _deprecations = NDFrame._deprecations | frozenset( - ['sortlevel', 'get_value', 'set_value', 'from_csv']) + ['sortlevel', 'get_value', 'set_value', 'from_csv', 'from_items']) @property def _constructor_expanddim(self): @@ -1246,6 +1246,12 @@ def to_records(self, index=True, convert_datetime64=True): @classmethod def from_items(cls, items, columns=None, orient='columns'): """ + .. deprecated:: 0.23.0 + from_items is deprecated and will be removed in a + future version. Use :meth:`DataFrame.from_dict(dict())` + instead. :meth:`DataFrame.from_dict(OrderedDict(...))` may be used + to preserve the key order. + Convert (key, value) pairs to DataFrame. The keys will be the axis index (usually the columns, but depends on the specified orientation). The values should be arrays or Series. @@ -1266,6 +1272,13 @@ def from_items(cls, items, columns=None, orient='columns'): ------- frame : DataFrame """ + + warnings.warn("from_items is deprecated. Please use " + "DataFrame.from_dict(dict()) instead. " + "DataFrame.from_dict(OrderedDict()) may be used to " + "preserve the key order.", + FutureWarning, stacklevel=2) + keys, values = lzip(*items) if orient == 'columns': diff --git a/pandas/io/stata.py b/pandas/io/stata.py index b409cf20e9a09..0922a4a9c3e9b 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -13,6 +13,7 @@ import datetime import struct import sys +from collections import OrderedDict import numpy as np from dateutil.relativedelta import relativedelta @@ -1571,7 +1572,7 @@ def read(self, nrows=None, convert_dates=None, else: data_formatted.append((col, data[col])) if requires_type_conversion: - data = DataFrame.from_items(data_formatted) + data = DataFrame.from_dict(OrderedDict(data_formatted)) del data_formatted self._do_convert_missing(data, convert_missing) @@ -1609,7 +1610,7 @@ def read(self, nrows=None, convert_dates=None, convert = True retyped_data.append((col, data[col].astype(dtype))) if convert: - data = DataFrame.from_items(retyped_data) + data = DataFrame.from_dict(OrderedDict(retyped_data)) if index_col is not None: data = data.set_index(data.pop(index_col)) @@ -1722,7 +1723,7 @@ def _do_convert_categoricals(self, data, value_label_dict, lbllist, cat_converted_data.append((col, cat_data)) else: cat_converted_data.append((col, data[col])) - data = DataFrame.from_items(cat_converted_data) + data = DataFrame.from_dict(OrderedDict(cat_converted_data)) return data def data_label(self): @@ -1997,7 +1998,7 @@ def _prepare_categoricals(self, data): data_formatted.append((col, values)) else: data_formatted.append((col, data[col])) - return DataFrame.from_items(data_formatted) + return DataFrame.from_dict(OrderedDict(data_formatted)) def _replace_nans(self, data): # return data diff --git a/pandas/tests/frame/test_constructors.py 
b/pandas/tests/frame/test_constructors.py index b24ae22162a34..8abd88d8a379c 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -871,7 +871,7 @@ def __len__(self, n): # GH 4297 # support Array import array - result = DataFrame.from_items([('A', array.array('i', range(10)))]) + result = DataFrame({'A': array.array('i', range(10))}) expected = DataFrame({'A': list(range(10))}) tm.assert_frame_equal(result, expected, check_dtype=False) @@ -1175,28 +1175,35 @@ def test_constructor_manager_resize(self): def test_constructor_from_items(self): items = [(c, self.frame[c]) for c in self.frame.columns] - recons = DataFrame.from_items(items) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + recons = DataFrame.from_items(items) tm.assert_frame_equal(recons, self.frame) # pass some columns - recons = DataFrame.from_items(items, columns=['C', 'B', 'A']) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + recons = DataFrame.from_items(items, columns=['C', 'B', 'A']) tm.assert_frame_equal(recons, self.frame.loc[:, ['C', 'B', 'A']]) # orient='index' row_items = [(idx, self.mixed_frame.xs(idx)) for idx in self.mixed_frame.index] - - recons = DataFrame.from_items(row_items, - columns=self.mixed_frame.columns, - orient='index') + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + recons = DataFrame.from_items(row_items, + columns=self.mixed_frame.columns, + orient='index') tm.assert_frame_equal(recons, self.mixed_frame) assert recons['A'].dtype == np.float64 with tm.assert_raises_regex(TypeError, "Must pass columns with " "orient='index'"): - DataFrame.from_items(row_items, orient='index') + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + DataFrame.from_items(row_items, orient='index') # orient='index', but thar be tuples arr = construct_1d_object_array_from_listlike( @@ -1204,15 +1211,19 @@ def test_constructor_from_items(self): self.mixed_frame['foo'] = arr row_items = [(idx, list(self.mixed_frame.xs(idx))) for idx in self.mixed_frame.index] - recons = DataFrame.from_items(row_items, - columns=self.mixed_frame.columns, - orient='index') + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + recons = DataFrame.from_items(row_items, + columns=self.mixed_frame.columns, + orient='index') tm.assert_frame_equal(recons, self.mixed_frame) assert isinstance(recons['foo'][0], tuple) - rs = DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])], - orient='index', - columns=['one', 'two', 'three']) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + rs = DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])], + orient='index', + columns=['one', 'two', 'three']) xp = DataFrame([[1, 2, 3], [4, 5, 6]], index=['A', 'B'], columns=['one', 'two', 'three']) tm.assert_frame_equal(rs, xp) @@ -1222,12 +1233,28 @@ def test_constructor_from_items_scalars(self): with tm.assert_raises_regex(ValueError, r'The value in each \(key, value\) ' 'pair must be an array, Series, or dict'): - DataFrame.from_items([('A', 1), ('B', 4)]) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + DataFrame.from_items([('A', 1), ('B', 4)]) with tm.assert_raises_regex(ValueError, r'The value in each \(key, value\) ' 'pair must be an array, Series, or dict'): - DataFrame.from_items([('A', 1), ('B', 2)], columns=['col1'], + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + 
DataFrame.from_items([('A', 1), ('B', 2)], columns=['col1'], + orient='index') + + def test_from_items_deprecation(self): + # GH 17320 + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])]) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])], + columns=['col1', 'col2', 'col3'], orient='index') def test_constructor_mix_series_nonseries(self): @@ -1256,13 +1283,13 @@ def test_constructor_column_duplicates(self): tm.assert_frame_equal(df, edf) - idf = DataFrame.from_items( - [('a', [8]), ('a', [5])], columns=['a', 'a']) + idf = DataFrame.from_records([(8, 5)], + columns=['a', 'a']) + tm.assert_frame_equal(idf, edf) - pytest.raises(ValueError, DataFrame.from_items, - [('a', [8]), ('a', [5]), ('b', [6])], - columns=['b', 'a', 'a']) + pytest.raises(ValueError, DataFrame.from_dict, + OrderedDict([('b', 8), ('a', 5), ('a', 6)])) def test_constructor_empty_with_string_dtype(self): # GH 9428 diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index f0a21cde4fbd9..36465db78361f 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -214,9 +214,10 @@ def check(result, expected=None): for index in [df.index, pd.Index(list('edcba'))]: this_df = df.copy() expected_ser = pd.Series(index.values, index=this_df.index) - expected_df = DataFrame.from_items([('A', expected_ser), - ('B', this_df['B']), - ('A', expected_ser)]) + expected_df = DataFrame({'A': expected_ser, + 'B': this_df['B'], + 'A': expected_ser}, + columns=['A', 'B', 'A']) this_df['A'] = index check(this_df, expected_df) diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index 8525cb42c2455..f677b356a77a5 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -8,6 +8,7 @@ import re import sys from datetime import datetime +from collections import OrderedDict import pytest import numpy as np @@ -924,8 +925,9 @@ def test_float_parser(self): def test_scientific_no_exponent(self): # see gh-12215 - df = DataFrame.from_items([('w', ['2e']), ('x', ['3E']), - ('y', ['42e']), ('z', ['632E'])]) + df = DataFrame.from_dict(OrderedDict([('w', ['2e']), ('x', ['3E']), + ('y', ['42e']), + ('z', ['632E'])])) data = df.to_csv(index=False) for prec in self.float_precision_choices: df_roundtrip = self.read_csv( diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index efbabcfd8fc4c..ebb8424b78ed4 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -6,6 +6,7 @@ from distutils.version import LooseVersion from functools import partial from warnings import catch_warnings +from collections import OrderedDict import numpy as np import pytest @@ -315,7 +316,7 @@ def test_excel_table(self): def test_reader_special_dtypes(self): - expected = DataFrame.from_items([ + expected = DataFrame.from_dict(OrderedDict([ ("IntCol", [1, 2, -3, 4, 0]), ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]), ("BoolCol", [True, False, True, True, False]), @@ -325,8 +326,7 @@ def test_reader_special_dtypes(self): ("DateCol", [datetime(2013, 10, 30), datetime(2013, 10, 31), datetime(1905, 1, 1), datetime(2013, 12, 14), datetime(2015, 3, 14)]) - ]) - + ])) basename = 'test_types' # should read in correctly and infer types @@ -363,12 +363,12 @@ def test_reader_converters(self): basename = 'test_converters' - 
expected = DataFrame.from_items([ + expected = DataFrame.from_dict(OrderedDict([ ("IntCol", [1, 2, -3, -1000, 0]), ("FloatCol", [12.5, np.nan, 18.3, 19.2, 0.000000005]), ("BoolCol", ['Found', 'Found', 'Found', 'Not found', 'Found']), ("StrCol", ['1', np.nan, '3', '4', '5']), - ]) + ])) converters = {'IntCol': lambda x: int(x) if x != '' else -1000, 'FloatCol': lambda x: 10 * x if x else np.nan, @@ -718,32 +718,30 @@ def test_reader_seconds(self): if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"): # Xlrd >= 0.9.3 can handle Excel milliseconds. - expected = DataFrame.from_items([("Time", - [time(1, 2, 3), - time(2, 45, 56, 100000), - time(4, 29, 49, 200000), - time(6, 13, 42, 300000), - time(7, 57, 35, 400000), - time(9, 41, 28, 500000), - time(11, 25, 21, 600000), - time(13, 9, 14, 700000), - time(14, 53, 7, 800000), - time(16, 37, 0, 900000), - time(18, 20, 54)])]) + expected = DataFrame.from_dict({"Time": [time(1, 2, 3), + time(2, 45, 56, 100000), + time(4, 29, 49, 200000), + time(6, 13, 42, 300000), + time(7, 57, 35, 400000), + time(9, 41, 28, 500000), + time(11, 25, 21, 600000), + time(13, 9, 14, 700000), + time(14, 53, 7, 800000), + time(16, 37, 0, 900000), + time(18, 20, 54)]}) else: # Xlrd < 0.9.3 rounds Excel milliseconds. - expected = DataFrame.from_items([("Time", - [time(1, 2, 3), - time(2, 45, 56), - time(4, 29, 49), - time(6, 13, 42), - time(7, 57, 35), - time(9, 41, 29), - time(11, 25, 22), - time(13, 9, 15), - time(14, 53, 8), - time(16, 37, 1), - time(18, 20, 54)])]) + expected = DataFrame.from_dict({"Time": [time(1, 2, 3), + time(2, 45, 56), + time(4, 29, 49), + time(6, 13, 42), + time(7, 57, 35), + time(9, 41, 29), + time(11, 25, 22), + time(13, 9, 15), + time(14, 53, 8), + time(16, 37, 1), + time(18, 20, 54)]}) actual = self.get_exceldf('times_1900', 'Sheet1') tm.assert_frame_equal(actual, expected) @@ -1988,7 +1986,7 @@ def test_datetimes(self): datetime(2013, 1, 13, 18, 20, 52)] with ensure_clean(self.ext) as path: - write_frame = DataFrame.from_items([('A', datetimes)]) + write_frame = DataFrame({'A': datetimes}) write_frame.to_excel(path, 'Sheet1') read_frame = read_excel(path, 'Sheet1', header=0) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index d0d7f881b37d0..89d76061329a3 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -8,6 +8,7 @@ import warnings from datetime import datetime from distutils.version import LooseVersion +from collections import OrderedDict import numpy as np import pandas as pd @@ -945,7 +946,7 @@ def test_categorical_order(self, file): cols.append((col, pd.Categorical.from_codes(codes, labels))) else: cols.append((col, pd.Series(labels, dtype=np.float32))) - expected = DataFrame.from_items(cols) + expected = DataFrame.from_dict(OrderedDict(cols)) # Read with and with out categoricals, ensure order is identical file = getattr(self, file) From 12ac43f98c6d3ce9c0c48ed75b80346881c1e4d4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 31 Jan 2018 11:00:26 -0600 Subject: [PATCH 015/214] BUG: Fixed accessor for Categorical[Datetime] (#19469) * BUG: Fixed accessor for Categorical[Datetime] * Fixup --- pandas/core/indexes/accessors.py | 5 ++++- pandas/tests/series/test_datetime_values.py | 8 ++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index d40230386216c..c5b300848876e 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -72,9 +72,12 @@ def 
_delegate_property_get(self, name): # blow up if we operate on categories if self.orig is not None: result = take_1d(result, self.orig.cat.codes) + index = self.orig.index + else: + index = self.index # return the result as a Series, which is by definition a copy - result = Series(result, index=self.index, name=self.name) + result = Series(result, index=index, name=self.name) # setting this object will show a SettingWithCopyWarning/Error result._is_copy = ("modifications to a property of a datetimelike " diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index 49b4600b10738..93c8ebc5f05df 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -259,6 +259,14 @@ def f(): pytest.raises(com.SettingWithCopyError, f) + def test_dt_namespace_accessor_categorical(self): + # GH 19468 + dti = DatetimeIndex(['20171111', '20181212']).repeat(2) + s = Series(pd.Categorical(dti), name='foo') + result = s.dt.year + expected = Series([2017, 2017, 2018, 2018], name='foo') + tm.assert_series_equal(result, expected) + def test_dt_accessor_no_new_attributes(self): # https://github.com/pandas-dev/pandas/issues/10673 s = Series(date_range('20130101', periods=5, freq='D')) From ca4ae4f1798fb63c707c6cca79e1f586cae63391 Mon Sep 17 00:00:00 2001 From: Tommy <10076072+tommyod@users.noreply.github.com> Date: Thu, 1 Feb 2018 00:54:15 +0100 Subject: [PATCH 016/214] DOC: Spellcheck of categorical.rst and visualization.rst (#19428) --- doc/source/categorical.rst | 187 ++++++++++++++++++----------------- doc/source/visualization.rst | 130 ++++++++++++++---------- 2 files changed, 176 insertions(+), 141 deletions(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 7364167611730..efcc04d688334 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -19,10 +19,11 @@ Categorical Data This is an introduction to pandas categorical data type, including a short comparison with R's ``factor``. -`Categoricals` are a pandas data type, which correspond to categorical variables in -statistics: a variable, which can take on only a limited, and usually fixed, -number of possible values (`categories`; `levels` in R). Examples are gender, social class, -blood types, country affiliations, observation time or ratings via Likert scales. +`Categoricals` are a pandas data type corresponding to categorical variables in +statistics. A categorical variable takes on a limited, and usually fixed, +number of possible values (`categories`; `levels` in R). Examples are gender, +social class, blood type, country affiliation, observation time or rating via +Likert scales. In contrast to statistical categorical variables, categorical data might have an order (e.g. 'strongly agree' vs 'agree' or 'first observation' vs. 'second observation'), but numerical @@ -48,16 +49,16 @@ See also the :ref:`API docs on categoricals`. Object Creation --------------- -Categorical `Series` or columns in a `DataFrame` can be created in several ways: +Categorical ``Series`` or columns in a ``DataFrame`` can be created in several ways: -By specifying ``dtype="category"`` when constructing a `Series`: +By specifying ``dtype="category"`` when constructing a ``Series``: .. ipython:: python s = pd.Series(["a","b","c","a"], dtype="category") s -By converting an existing `Series` or column to a ``category`` dtype: +By converting an existing ``Series`` or column to a ``category`` dtype: .. 
ipython:: python @@ -65,18 +66,17 @@ By converting an existing `Series` or column to a ``category`` dtype: df["B"] = df["A"].astype('category') df -By using some special functions: +By using special functions, such as :func:`~pandas.cut`, which groups data into +discrete bins. See the :ref:`example on tiling ` in the docs. .. ipython:: python df = pd.DataFrame({'value': np.random.randint(0, 100, 20)}) - labels = [ "{0} - {1}".format(i, i + 9) for i in range(0, 100, 10) ] + labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)] df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels) df.head(10) -See :ref:`documentation ` for :func:`~pandas.cut`. - By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to a `DataFrame`. .. ipython:: python @@ -89,10 +89,11 @@ By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to df["B"] = raw_cat df -Anywhere above we passed a keyword ``dtype='category'``, we used the default behavior of +In the examples above where we passed ``dtype='category'``, we used the default +behavior: -1. categories are inferred from the data -2. categories are unordered. +1. Categories are inferred from the data. +2. Categories are unordered. To control those behaviors, instead of passing ``'category'``, use an instance of :class:`~pandas.api.types.CategoricalDtype`. @@ -123,8 +124,8 @@ Categorical data has a specific ``category`` :ref:`dtype `: In contrast to R's `factor` function, there is currently no way to assign/change labels at creation time. Use `categories` to change the categories after creation time. -To get back to the original Series or `numpy` array, use ``Series.astype(original_dtype)`` or -``np.asarray(categorical)``: +To get back to the original ``Series`` or NumPy array, use +``Series.astype(original_dtype)`` or ``np.asarray(categorical)``: .. ipython:: python @@ -135,8 +136,9 @@ To get back to the original Series or `numpy` array, use ``Series.astype(origina s2.astype(str) np.asarray(s2) -If you have already `codes` and `categories`, you can use the :func:`~pandas.Categorical.from_codes` -constructor to save the factorize step during normal constructor mode: +If you already have `codes` and `categories`, you can use the +:func:`~pandas.Categorical.from_codes` constructor to save the factorize step +during normal constructor mode: .. ipython:: python @@ -171,7 +173,7 @@ by default. A :class:`~pandas.api.types.CategoricalDtype` can be used in any place pandas expects a `dtype`. For example :func:`pandas.read_csv`, -:func:`pandas.DataFrame.astype`, or in the Series constructor. +:func:`pandas.DataFrame.astype`, or in the ``Series`` constructor. .. note:: @@ -185,8 +187,8 @@ Equality Semantics ~~~~~~~~~~~~~~~~~~ Two instances of :class:`~pandas.api.types.CategoricalDtype` compare equal -whenever they have the same categories and orderedness. When comparing two -unordered categoricals, the order of the ``categories`` is not considered +whenever they have the same categories and order. When comparing two +unordered categoricals, the order of the ``categories`` is not considered. .. ipython:: python @@ -198,7 +200,7 @@ unordered categoricals, the order of the ``categories`` is not considered # Unequal, since the second CategoricalDtype is ordered c1 == CategoricalDtype(['a', 'b', 'c'], ordered=True) -All instances of ``CategoricalDtype`` compare equal to the string ``'category'`` +All instances of ``CategoricalDtype`` compare equal to the string ``'category'``. .. 
ipython:: python @@ -215,8 +217,8 @@ All instances of ``CategoricalDtype`` compare equal to the string ``'category'`` Description ----------- -Using ``.describe()`` on categorical data will produce similar output to a `Series` or -`DataFrame` of type ``string``. +Using :meth:`~DataFrame.describe` on categorical data will produce similar +output to a ``Series`` or ``DataFrame`` of type ``string``. .. ipython:: python @@ -230,10 +232,10 @@ Using ``.describe()`` on categorical data will produce similar output to a `Seri Working with categories ----------------------- -Categorical data has a `categories` and a `ordered` property, which list their possible values and -whether the ordering matters or not. These properties are exposed as ``s.cat.categories`` and -``s.cat.ordered``. If you don't manually specify categories and ordering, they are inferred from the -passed in values. +Categorical data has a `categories` and a `ordered` property, which list their +possible values and whether the ordering matters or not. These properties are +exposed as ``s.cat.categories`` and ``s.cat.ordered``. If you don't manually +specify categories and ordering, they are inferred from the passed arguments. .. ipython:: python @@ -251,13 +253,13 @@ It's also possible to pass in the categories in a specific order: .. note:: - New categorical data are NOT automatically ordered. You must explicitly pass ``ordered=True`` to - indicate an ordered ``Categorical``. + New categorical data are **not** automatically ordered. You must explicitly + pass ``ordered=True`` to indicate an ordered ``Categorical``. .. note:: - The result of ``Series.unique()`` is not always the same as ``Series.cat.categories``, + The result of :meth:`~Series.unique` is not always the same as ``Series.cat.categories``, because ``Series.unique()`` has a couple of guarantees, namely that it returns categories in the order of appearance, and it only includes values that are actually present. @@ -275,8 +277,10 @@ It's also possible to pass in the categories in a specific order: Renaming categories ~~~~~~~~~~~~~~~~~~~ -Renaming categories is done by assigning new values to the ``Series.cat.categories`` property or -by using the :func:`Categorical.rename_categories` method: +Renaming categories is done by assigning new values to the +``Series.cat.categories`` property or by using the +:meth:`~pandas.Categorical.rename_categories` method: + .. ipython:: python @@ -296,8 +300,8 @@ by using the :func:`Categorical.rename_categories` method: .. note:: - Be aware that assigning new categories is an inplace operations, while most other operation - under ``Series.cat`` per default return a new Series of dtype `category`. + Be aware that assigning new categories is an inplace operation, while most other operations + under ``Series.cat`` per default return a new ``Series`` of dtype `category`. Categories must be unique or a `ValueError` is raised: @@ -320,7 +324,8 @@ Categories must also not be ``NaN`` or a `ValueError` is raised: Appending new categories ~~~~~~~~~~~~~~~~~~~~~~~~ -Appending categories can be done by using the :func:`Categorical.add_categories` method: +Appending categories can be done by using the +:meth:`~pandas.Categorical.add_categories` method: .. ipython:: python @@ -331,8 +336,9 @@ Appending categories can be done by using the :func:`Categorical.add_categories` Removing categories ~~~~~~~~~~~~~~~~~~~ -Removing categories can be done by using the :func:`Categorical.remove_categories` method. 
Values -which are removed are replaced by ``np.nan``.: +Removing categories can be done by using the +:meth:`~pandas.Categorical.remove_categories` method. Values which are removed +are replaced by ``np.nan``.: .. ipython:: python @@ -353,8 +359,10 @@ Removing unused categories can also be done: Setting categories ~~~~~~~~~~~~~~~~~~ -If you want to do remove and add new categories in one step (which has some speed advantage), -or simply set the categories to a predefined scale, use :func:`Categorical.set_categories`. +If you want to do remove and add new categories in one step (which has some +speed advantage), or simply set the categories to a predefined scale, +use :meth:`~pandas.Categorical.set_categories`. + .. ipython:: python @@ -366,7 +374,7 @@ or simply set the categories to a predefined scale, use :func:`Categorical.set_c .. note:: Be aware that :func:`Categorical.set_categories` cannot know whether some category is omitted intentionally or because it is misspelled or (under Python3) due to a type difference (e.g., - numpys S1 dtype and Python strings). This can result in surprising behaviour! + NumPy S1 dtype and Python strings). This can result in surprising behaviour! Sorting and Order ----------------- @@ -374,7 +382,7 @@ Sorting and Order .. _categorical.sort: If categorical data is ordered (``s.cat.ordered == True``), then the order of the categories has a -meaning and certain operations are possible. If the categorical is unordered, ``.min()/.max()`` will raise a `TypeError`. +meaning and certain operations are possible. If the categorical is unordered, ``.min()/.max()`` will raise a ``TypeError``. .. ipython:: python @@ -411,8 +419,8 @@ This is even true for strings and numeric data: Reordering ~~~~~~~~~~ -Reordering the categories is possible via the :func:`Categorical.reorder_categories` and -the :func:`Categorical.set_categories` methods. For :func:`Categorical.reorder_categories`, all +Reordering the categories is possible via the :meth:`Categorical.reorder_categories` and +the :meth:`Categorical.set_categories` methods. For :meth:`Categorical.reorder_categories`, all old categories must be included in the new categories and no new categories are allowed. This will necessarily make the sort order the same as the categories order. @@ -428,16 +436,16 @@ necessarily make the sort order the same as the categories order. .. note:: Note the difference between assigning new categories and reordering the categories: the first - renames categories and therefore the individual values in the `Series`, but if the first + renames categories and therefore the individual values in the ``Series``, but if the first position was sorted last, the renamed value will still be sorted last. Reordering means that the way values are sorted is different afterwards, but not that individual values in the - `Series` are changed. + ``Series`` are changed. .. note:: - If the `Categorical` is not ordered, ``Series.min()`` and ``Series.max()`` will raise + If the ``Categorical`` is not ordered, :meth:`Series.min` and :meth:`Series.max` will raise ``TypeError``. Numeric operations like ``+``, ``-``, ``*``, ``/`` and operations based on them - (e.g. ``Series.median()``, which would need to compute the mean between two values if the length + (e.g. :meth:`Series.median`, which would need to compute the mean between two values if the length of an array is even) do not work and raise a ``TypeError``. 
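A minimal sketch of the ordering behaviour described in the notes above (the series values and the category order are invented for the example)::

    import pandas as pd
    from pandas.api.types import CategoricalDtype

    s = pd.Series(["a", "b", "c", "a"])

    # an explicitly ordered dtype: "c" < "b" < "a"
    ordered_dtype = CategoricalDtype(categories=["c", "b", "a"], ordered=True)
    s_ordered = s.astype(ordered_dtype)

    s_ordered.sort_values()           # sorts by category order, not lexically
    s_ordered.min(), s_ordered.max()  # allowed because the categorical is ordered

    # on the unordered equivalent, s.astype("category").min() would raise
    # TypeError, as the note above explains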
Multi Column Sorting @@ -464,19 +472,19 @@ Comparisons Comparing categorical data with other objects is possible in three cases: - * comparing equality (``==`` and ``!=``) to a list-like object (list, Series, array, + * Comparing equality (``==`` and ``!=``) to a list-like object (list, Series, array, ...) of the same length as the categorical data. - * all comparisons (``==``, ``!=``, ``>``, ``>=``, ``<``, and ``<=``) of categorical data to + * All comparisons (``==``, ``!=``, ``>``, ``>=``, ``<``, and ``<=``) of categorical data to another categorical Series, when ``ordered==True`` and the `categories` are the same. - * all comparisons of a categorical data to a scalar. + * All comparisons of a categorical data to a scalar. All other comparisons, especially "non-equality" comparisons of two categoricals with different -categories or a categorical with any list-like object, will raise a TypeError. +categories or a categorical with any list-like object, will raise a ``TypeError``. .. note:: - Any "non-equality" comparisons of categorical data with a `Series`, `np.array`, `list` or - categorical data with different categories or ordering will raise an `TypeError` because custom + Any "non-equality" comparisons of categorical data with a ``Series``, ``np.array``, ``list`` or + categorical data with different categories or ordering will raise a ``TypeError`` because custom categories ordering could be interpreted in two ways: one with taking into account the ordering and one without. @@ -546,11 +554,11 @@ When you compare two unordered categoricals with the same categories, the order Operations ---------- -Apart from ``Series.min()``, ``Series.max()`` and ``Series.mode()``, the following operations are -possible with categorical data: +Apart from :meth:`Series.min`, :meth:`Series.max` and :meth:`Series.mode`, the +following operations are possible with categorical data: -`Series` methods like `Series.value_counts()` will use all categories, even if some categories are not -present in the data: +``Series`` methods like :meth:`Series.value_counts` will use all categories, +even if some categories are not present in the data: .. ipython:: python @@ -588,8 +596,8 @@ that only values already in `categories` can be assigned. Getting ~~~~~~~ -If the slicing operation returns either a `DataFrame` or a column of type `Series`, -the ``category`` dtype is preserved. +If the slicing operation returns either a ``DataFrame`` or a column of type +``Series``, the ``category`` dtype is preserved. .. ipython:: python @@ -602,8 +610,8 @@ the ``category`` dtype is preserved. df.loc["h":"j","cats"] df[df["cats"] == "b"] -An example where the category type is not preserved is if you take one single row: the -resulting `Series` is of dtype ``object``: +An example where the category type is not preserved is if you take one single +row: the resulting ``Series`` is of dtype ``object``: .. ipython:: python @@ -620,10 +628,11 @@ of length "1". df.at["h","cats"] # returns a string .. note:: - This is a difference to R's `factor` function, where ``factor(c(1,2,3))[1]`` + The is in contrast to R's `factor` function, where ``factor(c(1,2,3))[1]`` returns a single value `factor`. -To get a single value `Series` of type ``category`` pass in a list with a single value: +To get a single value ``Series`` of type ``category``, you pass in a list with +a single value: .. 
ipython:: python @@ -632,8 +641,8 @@ To get a single value `Series` of type ``category`` pass in a list with a single String and datetime accessors ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The accessors ``.dt`` and ``.str`` will work if the ``s.cat.categories`` are of an appropriate -type: +The accessors ``.dt`` and ``.str`` will work if the ``s.cat.categories`` are of +an appropriate type: .. ipython:: python @@ -676,8 +685,8 @@ That means, that the returned values from methods and properties on the accessor Setting ~~~~~~~ -Setting values in a categorical column (or `Series`) works as long as the value is included in the -`categories`: +Setting values in a categorical column (or ``Series``) works as long as the +value is included in the `categories`: .. ipython:: python @@ -704,7 +713,7 @@ Setting values by assigning categorical data will also check that the `categorie except ValueError as e: print("ValueError: " + str(e)) -Assigning a `Categorical` to parts of a column of other types will use the values: +Assigning a ``Categorical`` to parts of a column of other types will use the values: .. ipython:: python @@ -719,7 +728,7 @@ Assigning a `Categorical` to parts of a column of other types will use the value Merging ~~~~~~~ -You can concat two `DataFrames` containing categorical data together, +You can concat two ``DataFrames`` containing categorical data together, but the categories of these categoricals need to be the same: .. ipython:: python @@ -731,7 +740,7 @@ but the categories of these categoricals need to be the same: res res.dtypes -In this case the categories are not the same and so an error is raised: +In this case the categories are not the same, and therefore an error is raised: .. ipython:: python @@ -754,10 +763,10 @@ Unioning .. versionadded:: 0.19.0 -If you want to combine categoricals that do not necessarily have -the same categories, the ``union_categoricals`` function will -combine a list-like of categoricals. The new categories -will be the union of the categories being combined. +If you want to combine categoricals that do not necessarily have the same +categories, the :func:`~pandas.api.types.union_categoricals` function will +combine a list-like of categoricals. The new categories will be the union of +the categories being combined. .. ipython:: python @@ -805,8 +814,9 @@ using the ``ignore_ordered=True`` argument. b = pd.Categorical(["c", "b", "a"], ordered=True) union_categoricals([a, b], ignore_order=True) -``union_categoricals`` also works with a ``CategoricalIndex``, or ``Series`` containing -categorical data, but note that the resulting array will always be a plain ``Categorical`` +:func:`~pandas.api.types.union_categoricals` also works with a +``CategoricalIndex``, or ``Series`` containing categorical data, but note that +the resulting array will always be a plain ``Categorical``: .. ipython:: python @@ -956,7 +966,7 @@ Differences to R's `factor` The following differences to R's factor functions can be observed: -* R's `levels` are named `categories` +* R's `levels` are named `categories`. * R's `levels` are always of type string, while `categories` in pandas can be of any dtype. * It's not possible to specify labels at creation time. Use ``s.cat.rename_categories(new_labels)`` afterwards. @@ -1009,10 +1019,10 @@ an ``object`` dtype is a constant times the length of the data. 
`Categorical` is not a `numpy` array ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Currently, categorical data and the underlying `Categorical` is implemented as a python -object and not as a low-level `numpy` array dtype. This leads to some problems. +Currently, categorical data and the underlying ``Categorical`` is implemented as a Python +object and not as a low-level NumPy array dtype. This leads to some problems. -`numpy` itself doesn't know about the new `dtype`: +NumPy itself doesn't know about the new `dtype`: .. ipython:: python @@ -1041,7 +1051,7 @@ To check if a Series contains Categorical data, use ``hasattr(s, 'cat')``: hasattr(pd.Series(['a'], dtype='category'), 'cat') hasattr(pd.Series(['a']), 'cat') -Using `numpy` functions on a `Series` of type ``category`` should not work as `Categoricals` +Using NumPy functions on a ``Series`` of type ``category`` should not work as `Categoricals` are not numeric data (even in the case that ``.categories`` is numeric). .. ipython:: python @@ -1080,7 +1090,7 @@ and allows efficient indexing and storage of an index with a large number of dup See the :ref:`advanced indexing docs ` for a more detailed explanation. -Setting the index will create a ``CategoricalIndex`` +Setting the index will create a ``CategoricalIndex``: .. ipython:: python @@ -1095,8 +1105,9 @@ Setting the index will create a ``CategoricalIndex`` Side Effects ~~~~~~~~~~~~ -Constructing a `Series` from a `Categorical` will not copy the input `Categorical`. This -means that changes to the `Series` will in most cases change the original `Categorical`: +Constructing a ``Series`` from a ``Categorical`` will not copy the input +``Categorical``. This means that changes to the ``Series`` will in most cases +change the original ``Categorical``: .. ipython:: python @@ -1109,7 +1120,7 @@ means that changes to the `Series` will in most cases change the original `Categ df["cat"].cat.categories = [1,2,3,4,5] cat -Use ``copy=True`` to prevent such a behaviour or simply don't reuse `Categoricals`: +Use ``copy=True`` to prevent such a behaviour or simply don't reuse ``Categoricals``: .. ipython:: python @@ -1120,6 +1131,6 @@ Use ``copy=True`` to prevent such a behaviour or simply don't reuse `Categorical cat .. note:: - This also happens in some cases when you supply a `numpy` array instead of a `Categorical`: - using an int array (e.g. ``np.array([1,2,3,4])``) will exhibit the same behaviour, while using + This also happens in some cases when you supply a NumPy array instead of a ``Categorical``: + using an int array (e.g. ``np.array([1,2,3,4])``) will exhibit the same behavior, while using a string array (e.g. ``np.array(["a","b","c","a"])``) will not. diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index cbd17493beb7e..ee93f06fbc958 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -37,7 +37,8 @@ libraries that go beyond the basics documented here. Basic Plotting: ``plot`` ------------------------ -See the :ref:`cookbook` for some advanced strategies +We will demonstrate the basics, see the :ref:`cookbook` for +some advanced strategies. The ``plot`` method on Series and DataFrame is just a simple wrapper around :meth:`plt.plot() `: @@ -94,7 +95,8 @@ You can plot one column versus another using the `x` and `y` keywords in .. note:: - For more formatting and styling options, see :ref:`below `. + For more formatting and styling options, see + :ref:`formatting ` below. .. 
ipython:: python :suppress: @@ -107,14 +109,13 @@ Other Plots ----------- Plotting methods allow for a handful of plot styles other than the -default Line plot. These methods can be provided as the ``kind`` -keyword argument to :meth:`~DataFrame.plot`. -These include: +default line plot. These methods can be provided as the ``kind`` +keyword argument to :meth:`~DataFrame.plot`, and include: * :ref:`'bar' ` or :ref:`'barh' ` for bar plots * :ref:`'hist' ` for histogram * :ref:`'box' ` for boxplot -* :ref:`'kde' ` or ``'density'`` for density plots +* :ref:`'kde' ` or :ref:`'density' ` for density plots * :ref:`'area' ` for area plots * :ref:`'scatter' ` for scatter plots * :ref:`'hexbin' ` for hexagonal bin plots @@ -220,7 +221,7 @@ To get horizontal bar plots, use the ``barh`` method: Histograms ~~~~~~~~~~ -Histogram can be drawn by using the :meth:`DataFrame.plot.hist` and :meth:`Series.plot.hist` methods. +Histograms can be drawn by using the :meth:`DataFrame.plot.hist` and :meth:`Series.plot.hist` methods. .. ipython:: python @@ -238,7 +239,8 @@ Histogram can be drawn by using the :meth:`DataFrame.plot.hist` and :meth:`Serie plt.close('all') -Histogram can be stacked by ``stacked=True``. Bin size can be changed by ``bins`` keyword. +A histogram can be stacked using ``stacked=True``. Bin size can be changed +using the ``bins`` keyword. .. ipython:: python @@ -252,7 +254,9 @@ Histogram can be stacked by ``stacked=True``. Bin size can be changed by ``bins` plt.close('all') -You can pass other keywords supported by matplotlib ``hist``. For example, horizontal and cumulative histogram can be drawn by ``orientation='horizontal'`` and ``cumulative=True``. +You can pass other keywords supported by matplotlib ``hist``. For example, +horizontal and cumulative histograms can be drawn by +``orientation='horizontal'`` and ``cumulative=True``. .. ipython:: python @@ -463,7 +467,7 @@ keyword, will affect the output type as well: ``'both'`` Yes Series of namedtuples ================ ======= ========================== -``Groupby.boxplot`` always returns a Series of ``return_type``. +``Groupby.boxplot`` always returns a ``Series`` of ``return_type``. .. ipython:: python :okwarning: @@ -481,7 +485,9 @@ keyword, will affect the output type as well: plt.close('all') -Compare to: +The subplots above are split by the numeric columns first, then the value of +the ``g`` column. Below the subplots are first split by the value of ``g``, +then by the numeric columns. .. ipython:: python :okwarning: @@ -536,8 +542,8 @@ Scatter Plot ~~~~~~~~~~~~ Scatter plot can be drawn by using the :meth:`DataFrame.plot.scatter` method. -Scatter plot requires numeric columns for x and y axis. -These can be specified by ``x`` and ``y`` keywords each. +Scatter plot requires numeric columns for the x and y axes. +These can be specified by the ``x`` and ``y`` keywords. .. ipython:: python :suppress: @@ -581,8 +587,9 @@ each point: plt.close('all') -You can pass other keywords supported by matplotlib ``scatter``. -Below example shows a bubble chart using a dataframe column values as bubble size. +You can pass other keywords supported by matplotlib +:meth:`scatter `. The example below shows a +bubble chart using a column of the ``DataFrame`` as the bubble size. .. ipython:: python @@ -631,7 +638,7 @@ You can specify alternative aggregations by passing values to the ``C`` and and ``reduce_C_function`` is a function of one argument that reduces all the values in a bin to a single number (e.g. ``mean``, ``max``, ``sum``, ``std``). 
In this example the positions are given by columns ``a`` and ``b``, while the value is -given by column ``z``. The bins are aggregated with numpy's ``max`` function. +given by column ``z``. The bins are aggregated with NumPy's ``max`` function. .. ipython:: python :suppress: @@ -685,14 +692,16 @@ A ``ValueError`` will be raised if there are any negative values in your data. plt.close('all') -For pie plots it's best to use square figures, one's with an equal aspect ratio. You can create the -figure with equal width and height, or force the aspect ratio to be equal after plotting by -calling ``ax.set_aspect('equal')`` on the returned ``axes`` object. +For pie plots it's best to use square figures, i.e. a figure aspect ratio 1. +You can create the figure with equal width and height, or force the aspect ratio +to be equal after plotting by calling ``ax.set_aspect('equal')`` on the returned +``axes`` object. -Note that pie plot with :class:`DataFrame` requires that you either specify a target column by the ``y`` -argument or ``subplots=True``. When ``y`` is specified, pie plot of selected column -will be drawn. If ``subplots=True`` is specified, pie plots for each column are drawn as subplots. -A legend will be drawn in each pie plots by default; specify ``legend=False`` to hide it. +Note that pie plot with :class:`DataFrame` requires that you either specify a +target column by the ``y`` argument or ``subplots=True``. When ``y`` is +specified, pie plot of selected column will be drawn. If ``subplots=True`` is +specified, pie plots for each column are drawn as subplots. A legend will be +drawn in each pie plots by default; specify ``legend=False`` to hide it. .. ipython:: python :suppress: @@ -762,7 +771,7 @@ See the `matplotlib pie documentation `_ +for more information. By coloring these curves differently for each class it is possible to visualize data clustering. Curves belonging to samples of the same class will usually be closer together and form larger structures. @@ -883,8 +893,10 @@ of the same class will usually be closer together and form larger structures. Parallel Coordinates ~~~~~~~~~~~~~~~~~~~~ -Parallel coordinates is a plotting technique for plotting multivariate data. -It allows one to see clusters in data and to estimate other statistics visually. +Parallel coordinates is a plotting technique for plotting multivariate data, +see the `Wikipedia entry`_ +for an introduction. +Parallel coordinates allows one to see clusters in data and to estimate other statistics visually. Using parallel coordinates points are represented as connected line segments. Each vertical line represents one attribute. One set of connected line segments represents one data point. Points that tend to cluster will appear closer together. @@ -912,7 +924,9 @@ Lag Plot Lag plots are used to check if a data set or time series is random. Random data should not exhibit any structure in the lag plot. Non-random structure -implies that the underlying data are not random. +implies that the underlying data are not random. The ``lag`` argument may +be passed, and when ``lag=1`` the plot is essentially ``data[:-1]`` vs. +``data[1:]``. .. ipython:: python :suppress: @@ -947,7 +961,9 @@ If time series is random, such autocorrelations should be near zero for any and all time-lag separations. If time series is non-random then one or more of the autocorrelations will be significantly non-zero. The horizontal lines displayed in the plot correspond to 95% and 99% confidence bands. The dashed line is 99% -confidence band. 
+confidence band. See the +`Wikipedia entry`_ for more about +autocorrelation plots. .. ipython:: python :suppress: @@ -1016,6 +1032,8 @@ unit interval). The point in the plane, where our sample settles to (where the forces acting on our sample are at an equilibrium) is where a dot representing our sample will be drawn. Depending on which class that sample belongs it will be colored differently. +See the R package `Radviz`_ +for more information. **Note**: The "Iris" dataset is available `here `__. @@ -1046,7 +1064,7 @@ Setting the plot style From version 1.5 and up, matplotlib offers a range of preconfigured plotting styles. Setting the style can be used to easily give plots the general look that you want. Setting the style is as easy as calling ``matplotlib.style.use(my_plot_style)`` before -creating your plot. For example you could do ``matplotlib.style.use('ggplot')`` for ggplot-style +creating your plot. For example you could write ``matplotlib.style.use('ggplot')`` for ggplot-style plots. You can see the various available style names at ``matplotlib.style.available`` and it's very @@ -1147,7 +1165,7 @@ To plot data on a secondary y-axis, use the ``secondary_y`` keyword: plt.close('all') -To plot some columns in a DataFrame, give the column names to the ``secondary_y`` +To plot some columns in a ``DataFrame``, give the column names to the ``secondary_y`` keyword: .. ipython:: python @@ -1248,7 +1266,7 @@ See the :meth:`autofmt_xdate ` method and the Subplots ~~~~~~~~ -Each Series in a DataFrame can be plotted on a different axis +Each ``Series`` in a ``DataFrame`` can be plotted on a different axis with the ``subplots`` keyword: .. ipython:: python @@ -1264,9 +1282,9 @@ with the ``subplots`` keyword: Using Layout and Targeting Multiple Axes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The layout of subplots can be specified by ``layout`` keyword. It can accept +The layout of subplots can be specified by the ``layout`` keyword. It can accept ``(rows, columns)``. The ``layout`` keyword can be used in -``hist`` and ``boxplot`` also. If input is invalid, ``ValueError`` will be raised. +``hist`` and ``boxplot`` also. If the input is invalid, a ``ValueError`` will be raised. The number of axes which can be contained by rows x columns specified by ``layout`` must be larger than the number of required subplots. If layout can contain more axes than required, @@ -1284,7 +1302,7 @@ or columns needed, given the other. plt.close('all') -The above example is identical to using +The above example is identical to using: .. ipython:: python @@ -1298,11 +1316,11 @@ The above example is identical to using The required number of columns (3) is inferred from the number of series to plot and the given number of rows (2). -Also, you can pass multiple axes created beforehand as list-like via ``ax`` keyword. -This allows to use more complicated layout. +You can pass multiple axes created beforehand as list-like via ``ax`` keyword. +This allows more complicated layouts. The passed axes must be the same number as the subplots being drawn. -When multiple axes are passed via ``ax`` keyword, ``layout``, ``sharex`` and ``sharey`` keywords +When multiple axes are passed via the ``ax`` keyword, ``layout``, ``sharex`` and ``sharey`` keywords don't affect to the output. You should explicitly pass ``sharex=False`` and ``sharey=False``, otherwise you will see a warning. 
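A minimal sketch of the ``layout`` and ``ax`` keywords described above (random data, assuming matplotlib is installed)::

    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(100, 4), columns=list("ABCD")).cumsum()

    # one subplot per column, arranged on a 2x2 grid
    df.plot(subplots=True, layout=(2, 2), figsize=(6, 6))

    # or target axes created beforehand; pass sharex/sharey explicitly
    # to avoid the warning mentioned above
    fig, axes = plt.subplots(nrows=2, ncols=2)
    df.plot(subplots=True, ax=axes, sharex=False, sharey=False)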
@@ -1359,13 +1377,13 @@ Another option is passing an ``ax`` argument to :meth:`Series.plot` to plot on a Plotting With Error Bars ~~~~~~~~~~~~~~~~~~~~~~~~ -Plotting with error bars is now supported in the :meth:`DataFrame.plot` and :meth:`Series.plot` +Plotting with error bars is supported in :meth:`DataFrame.plot` and :meth:`Series.plot`. -Horizontal and vertical errorbars can be supplied to the ``xerr`` and ``yerr`` keyword arguments to :meth:`~DataFrame.plot()`. The error values can be specified using a variety of formats. +Horizontal and vertical error bars can be supplied to the ``xerr`` and ``yerr`` keyword arguments to :meth:`~DataFrame.plot()`. The error values can be specified using a variety of formats: -- As a :class:`DataFrame` or ``dict`` of errors with column names matching the ``columns`` attribute of the plotting :class:`DataFrame` or matching the ``name`` attribute of the :class:`Series` -- As a ``str`` indicating which of the columns of plotting :class:`DataFrame` contain the error values -- As raw values (``list``, ``tuple``, or ``np.ndarray``). Must be the same length as the plotting :class:`DataFrame`/:class:`Series` +- As a :class:`DataFrame` or ``dict`` of errors with column names matching the ``columns`` attribute of the plotting :class:`DataFrame` or matching the ``name`` attribute of the :class:`Series`. +- As a ``str`` indicating which of the columns of plotting :class:`DataFrame` contain the error values. +- As raw values (``list``, ``tuple``, or ``np.ndarray``). Must be the same length as the plotting :class:`DataFrame`/:class:`Series`. Asymmetrical error bars are also supported, however raw error values must be provided in this case. For a ``M`` length :class:`Series`, a ``Mx2`` array should be provided indicating lower and upper (or left and right) errors. For a ``MxN`` :class:`DataFrame`, asymmetrical errors should be in a ``Mx2xN`` array. @@ -1420,7 +1438,10 @@ Plotting with matplotlib table is now supported in :meth:`DataFrame.plot` and : plt.close('all') -Also, you can pass different :class:`DataFrame` or :class:`Series` for ``table`` keyword. The data will be drawn as displayed in print method (not transposed automatically). If required, it should be transposed manually as below example. +Also, you can pass a different :class:`DataFrame` or :class:`Series` to the +``table`` keyword. The data will be drawn as displayed in the print method +(not transposed automatically). If required, it should be transposed manually +as seen in the example below. .. ipython:: python @@ -1434,7 +1455,10 @@ Also, you can pass different :class:`DataFrame` or :class:`Series` for ``table`` plt.close('all') -Finally, there is a helper function ``pandas.plotting.table`` to create a table from :class:`DataFrame` and :class:`Series`, and add it to an ``matplotlib.Axes``. This function can accept keywords which matplotlib table has. +There also exists a helper function ``pandas.plotting.table``, which creates a +table from :class:`DataFrame` or :class:`Series`, and adds it to a +``matplotlib.Axes`` instance. This function accepts the same keywords as the +matplotlib `table `__. .. ipython:: python @@ -1461,18 +1485,18 @@ Colormaps A potential issue when plotting a large number of columns is that it can be difficult to distinguish some series due to repetition in the default colors.
To -remedy this, DataFrame plotting supports the use of the ``colormap=`` argument, +remedy this, ``DataFrame`` plotting supports the use of the ``colormap`` argument, which accepts either a Matplotlib `colormap `__ or a string that is a name of a colormap registered with Matplotlib. A visualization of the default matplotlib colormaps is available `here -`__. +`__. As matplotlib does not directly support colormaps for line-based plots, the colors are selected based on an even spacing determined by the number of columns -in the DataFrame. There is no consideration made for background color, so some +in the ``DataFrame``. There is no consideration made for background color, so some colormaps will produce lines that are not easily visible. -To use the cubehelix colormap, we can simply pass ``'cubehelix'`` to ``colormap=`` +To use the cubehelix colormap, we can pass ``colormap='cubehelix'``. .. ipython:: python :suppress: @@ -1494,7 +1518,7 @@ To use the cubehelix colormap, we can simply pass ``'cubehelix'`` to ``colormap= plt.close('all') -or we can pass the colormap itself +Alternatively, we can pass the colormap itself: .. ipython:: python @@ -1565,9 +1589,9 @@ Plotting directly with matplotlib In some situations it may still be preferable or necessary to prepare plots directly with matplotlib, for instance when a certain type of plot or -customization is not (yet) supported by pandas. Series and DataFrame objects -behave like arrays and can therefore be passed directly to matplotlib functions -without explicit casts. +customization is not (yet) supported by pandas. ``Series`` and ``DataFrame`` +objects behave like arrays and can therefore be passed directly to +matplotlib functions without explicit casts. pandas also automatically registers formatters and locators that recognize date indices, thereby extending date and time support to practically all plot types From 3597de0ada3869c8841d05ef4353195a6e124184 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Thu, 1 Feb 2018 09:12:00 +0000 Subject: [PATCH 017/214] DEPR/CLN: Remove pd.rolling_*, pd.expanding* and pd.ewm* (#18723) * remove pd.running_*, pd.expanding_* and pd.ewm* and related code * added test_expanding_func and test_expanding_apply * recreate _check_ndarray inline in _check_moment_func --- doc/source/computation.rst | 11 +- doc/source/whatsnew/v0.23.0.txt | 2 + pandas/__init__.py | 1 - pandas/stats/__init__.py | 0 pandas/stats/api.py | 7 - pandas/stats/moments.py | 855 -------------------------------- pandas/tests/api/test_api.py | 17 +- pandas/tests/test_window.py | 725 +++++++++++---------------- 8 files changed, 282 insertions(+), 1336 deletions(-) delete mode 100644 pandas/stats/__init__.py delete mode 100644 pandas/stats/api.py delete mode 100644 pandas/stats/moments.py diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 06afa440aa26c..a64542fa71705 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -209,19 +209,12 @@ Window Functions .. currentmodule:: pandas.core.window -.. warning:: - - Prior to version 0.18.0, ``pd.rolling_*``, ``pd.expanding_*``, and ``pd.ewm*`` were module level - functions and are now deprecated. These are replaced by using the :class:`~pandas.core.window.Rolling`, :class:`~pandas.core.window.Expanding` and :class:`~pandas.core.window.EWM`. objects and a corresponding method call. - - The deprecation warning will show the new syntax, see an example :ref:`here `. 
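A minimal sketch of the replacement syntax that the removed warning above points to, assuming only an illustrative series ``s``; the module-level calls shown in the comments are the ones this patch deletes, and the method-based forms are the ones it keeps.

.. ipython:: python

   import numpy as np
   import pandas as pd

   s = pd.Series(np.random.randn(100))

   # formerly pd.rolling_mean(s, window=5)
   s.rolling(window=5).mean()
   # formerly pd.expanding_sum(s)
   s.expanding().sum()
   # formerly pd.ewma(s, span=10)
   s.ewm(span=10).mean()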
- -For working with data, a number of windows functions are provided for +For working with data, a number of window functions are provided for computing common *window* or *rolling* statistics. Among these are count, sum, mean, median, correlation, variance, covariance, standard deviation, skewness, and kurtosis. -Starting in version 0.18.1, the ``rolling()`` and ``expanding()`` +The ``rolling()`` and ``expanding()`` functions can be used directly from DataFrameGroupBy objects, see the :ref:`groupby docs `. diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 592c0788070a1..2bd2bb199bf1f 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -361,6 +361,8 @@ Removal of prior version deprecations/changes - The ``labels`` attribute of the ``Categorical`` class has been removed in favor of :attribute:`Categorical.codes` (:issue:`7768`) - The ``flavor`` parameter have been removed from func:`to_sql` method (:issue:`13611`) - The modules `pandas.tools.hashing` and `pandas.util.hashing` have been removed (:issue:`16223`) +- The top-level functions ``pd.rolling_*``, ``pd.expanding_*`` and ``pd.ewm*`` have been removed (Deprecated since v0.18). + Instead, use the DataFrame/Series methods :attr:`~DataFrame.rolling`, :attr:`~DataFrame.expanding` and :attr:`~DataFrame.ewm` (:issue:`18723`) .. _whatsnew_0230.performance: diff --git a/pandas/__init__.py b/pandas/__init__.py index 78501620d780b..97ae73174c09c 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -41,7 +41,6 @@ from pandas.core.api import * from pandas.core.sparse.api import * -from pandas.stats.api import * from pandas.tseries.api import * from pandas.core.computation.api import * from pandas.core.reshape.api import * diff --git a/pandas/stats/__init__.py b/pandas/stats/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/pandas/stats/api.py b/pandas/stats/api.py deleted file mode 100644 index 2a11456d4f9e5..0000000000000 --- a/pandas/stats/api.py +++ /dev/null @@ -1,7 +0,0 @@ -""" -Common namespace of statistical functions -""" - -# flake8: noqa - -from pandas.stats.moments import * diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py deleted file mode 100644 index 1cd98feb05ea0..0000000000000 --- a/pandas/stats/moments.py +++ /dev/null @@ -1,855 +0,0 @@ -""" -Provides rolling statistical moments and related descriptive -statistics implemented in Cython -""" -from __future__ import division - -import warnings -import numpy as np -from pandas.core.dtypes.common import is_scalar -from pandas.core.api import DataFrame, Series -from pandas.util._decorators import Substitution, Appender - -__all__ = ['rolling_count', 'rolling_max', 'rolling_min', - 'rolling_sum', 'rolling_mean', 'rolling_std', 'rolling_cov', - 'rolling_corr', 'rolling_var', 'rolling_skew', 'rolling_kurt', - 'rolling_quantile', 'rolling_median', 'rolling_apply', - 'rolling_window', - 'ewma', 'ewmvar', 'ewmstd', 'ewmvol', 'ewmcorr', 'ewmcov', - 'expanding_count', 'expanding_max', 'expanding_min', - 'expanding_sum', 'expanding_mean', 'expanding_std', - 'expanding_cov', 'expanding_corr', 'expanding_var', - 'expanding_skew', 'expanding_kurt', 'expanding_quantile', - 'expanding_median', 'expanding_apply'] - -# ----------------------------------------------------------------------------- -# Docs - -# The order of arguments for the _doc_template is: -# (header, args, kwargs, returns, notes) - -_doc_template = """ -%s - -Parameters ----------- -%s%s -Returns -------- -%s 
-%s -""" - -_roll_kw = """window : int - Size of the moving window. This is the number of observations used for - calculating the statistic. -min_periods : int, default None - Minimum number of observations in window required to have a value - (otherwise result is NA). -freq : string or DateOffset object, optional (default None) - Frequency to conform the data to before computing the statistic. Specified - as a frequency string or DateOffset object. -center : boolean, default False - Set the labels at the center of the window. -how : string, default '%s' - Method for down- or re-sampling -""" - -_roll_notes = r""" -Notes ------ -By default, the result is set to the right edge of the window. This can be -changed to the center of the window by setting ``center=True``. - -The `freq` keyword is used to conform time series data to a specified -frequency by resampling the data. This is done with the default parameters -of :meth:`~pandas.Series.resample` (i.e. using the `mean`). -""" - - -_ewm_kw = r"""com : float, optional - Specify decay in terms of center of mass, - :math:`\alpha = 1 / (1 + com),\text{ for } com \geq 0` -span : float, optional - Specify decay in terms of span, - :math:`\alpha = 2 / (span + 1),\text{ for } span \geq 1` -halflife : float, optional - Specify decay in terms of half-life, - :math:`\alpha = 1 - exp(log(0.5) / halflife),\text{ for } halflife > 0` -alpha : float, optional - Specify smoothing factor :math:`\alpha` directly, - :math:`0 < \alpha \leq 1` - - .. versionadded:: 0.18.0 - -min_periods : int, default 0 - Minimum number of observations in window required to have a value - (otherwise result is NA). -freq : None or string alias / date offset object, default=None - Frequency to conform to before computing statistic -adjust : boolean, default True - Divide by decaying adjustment factor in beginning periods to account for - imbalance in relative weightings (viewing EWMA as a moving average) -how : string, default 'mean' - Method for down- or re-sampling -ignore_na : boolean, default False - Ignore missing values when calculating weights; - specify True to reproduce pre-0.15.0 behavior -""" - -_ewm_notes = r""" -Notes ------ -Exactly one of center of mass, span, half-life, and alpha must be provided. -Allowed values and relationship between the parameters are specified in the -parameter descriptions above; see the link at the end of this section for -a detailed explanation. - -When adjust is True (default), weighted averages are calculated using weights - (1-alpha)**(n-1), (1-alpha)**(n-2), ..., 1-alpha, 1. - -When adjust is False, weighted averages are calculated recursively as: - weighted_average[0] = arg[0]; - weighted_average[i] = (1-alpha)*weighted_average[i-1] + alpha*arg[i]. - -When ignore_na is False (default), weights are based on absolute positions. -For example, the weights of x and y used in calculating the final weighted -average of [x, None, y] are (1-alpha)**2 and 1 (if adjust is True), and -(1-alpha)**2 and alpha (if adjust is False). - -When ignore_na is True (reproducing pre-0.15.0 behavior), weights are based on -relative positions. For example, the weights of x and y used in calculating -the final weighted average of [x, None, y] are 1-alpha and 1 (if adjust is -True), and 1-alpha and alpha (if adjust is False). 
- -More details can be found at -http://pandas.pydata.org/pandas-docs/stable/computation.html#exponentially-weighted-windows -""" - -_expanding_kw = """min_periods : int, default None - Minimum number of observations in window required to have a value - (otherwise result is NA). -freq : string or DateOffset object, optional (default None) - Frequency to conform the data to before computing the statistic. Specified - as a frequency string or DateOffset object. -""" - - -_type_of_input_retval = "y : type of input argument" - -_flex_retval = """y : type depends on inputs - DataFrame / DataFrame -> DataFrame (matches on columns) or Panel (pairwise) - DataFrame / Series -> Computes result for each column - Series / Series -> Series""" - -_pairwise_retval = "y : Panel whose items are df1.index values" - -_unary_arg = "arg : Series, DataFrame\n" - -_binary_arg_flex = """arg1 : Series, DataFrame, or ndarray -arg2 : Series, DataFrame, or ndarray, optional - if not supplied then will default to arg1 and produce pairwise output -""" - -_binary_arg = """arg1 : Series, DataFrame, or ndarray -arg2 : Series, DataFrame, or ndarray -""" - -_pairwise_arg = """df1 : DataFrame -df2 : DataFrame -""" - -_pairwise_kw = """pairwise : bool, default False - If False then only matching columns between arg1 and arg2 will be used and - the output will be a DataFrame. - If True then all pairwise combinations will be calculated and the output - will be a Panel in the case of DataFrame inputs. In the case of missing - elements, only complete pairwise observations will be used. -""" - -_ddof_kw = """ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is ``N - ddof``, where ``N`` represents the number of elements. -""" - -_bias_kw = r"""bias : boolean, default False - Use a standard estimation bias correction -""" - - -def ensure_compat(dispatch, name, arg, func_kw=None, *args, **kwargs): - """ - wrapper function to dispatch to the appropriate window functions - wraps/unwraps ndarrays for compat - - can be removed when ndarray support is removed - """ - is_ndarray = isinstance(arg, np.ndarray) - if is_ndarray: - if arg.ndim == 1: - arg = Series(arg) - elif arg.ndim == 2: - arg = DataFrame(arg) - else: - raise AssertionError("cannot support ndim > 2 for ndarray compat") - - warnings.warn("pd.{dispatch}_{name} is deprecated for ndarrays and " - "will be removed " - "in a future version" - .format(dispatch=dispatch, name=name), - FutureWarning, stacklevel=3) - - # get the functional keywords here - if func_kw is None: - func_kw = [] - kwds = {} - for k in func_kw: - value = kwargs.pop(k, None) - if value is not None: - kwds[k] = value - - # TODO: the below is only in place temporary until this module is removed. 
- kwargs.pop('freq', None) # freq removed in 0.23 - # how is a keyword that if not-None should be in kwds - how = kwargs.pop('how', None) - if how is not None: - kwds['how'] = how - - r = getattr(arg, dispatch)(**kwargs) - - if not is_ndarray: - - # give a helpful deprecation message - # with copy-pastable arguments - pargs = ','.join("{a}={b}".format(a=a, b=b) - for a, b in kwargs.items() if b is not None) - aargs = ','.join(args) - if len(aargs): - aargs += ',' - - def f(a, b): - if is_scalar(b): - return "{a}={b}".format(a=a, b=b) - return "{a}=<{b}>".format(a=a, b=type(b).__name__) - aargs = ','.join(f(a, b) for a, b in kwds.items() if b is not None) - warnings.warn("pd.{dispatch}_{name} is deprecated for {klass} " - "and will be removed in a future version, replace with " - "\n\t{klass}.{dispatch}({pargs}).{name}({aargs})" - .format(klass=type(arg).__name__, pargs=pargs, - aargs=aargs, dispatch=dispatch, name=name), - FutureWarning, stacklevel=3) - - result = getattr(r, name)(*args, **kwds) - - if is_ndarray: - result = result.values - return result - - -def rolling_count(arg, window, **kwargs): - """ - Rolling count of number of non-NaN observations inside provided window. - - Parameters - ---------- - arg : DataFrame or numpy ndarray-like - window : int - Size of the moving window. This is the number of observations used for - calculating the statistic. - freq : string or DateOffset object, optional (default None) - Frequency to conform the data to before computing the - statistic. Specified as a frequency string or DateOffset object. - center : boolean, default False - Whether the label should correspond with center of window - how : string, default 'mean' - Method for down- or re-sampling - - Returns - ------- - rolling_count : type of caller - - Notes - ----- - The `freq` keyword is used to conform time series data to a specified - frequency by resampling the data. This is done with the default parameters - of :meth:`~pandas.Series.resample` (i.e. using the `mean`). - - To learn more about the frequency strings, please see `this link - `__. 
- """ - return ensure_compat('rolling', 'count', arg, window=window, **kwargs) - - -@Substitution("Unbiased moving covariance.", _binary_arg_flex, - _roll_kw % 'None' + _pairwise_kw + _ddof_kw, _flex_retval, - _roll_notes) -@Appender(_doc_template) -def rolling_cov(arg1, arg2=None, window=None, pairwise=None, **kwargs): - if window is None and isinstance(arg2, (int, float)): - window = arg2 - arg2 = arg1 - pairwise = True if pairwise is None else pairwise # only default unset - elif arg2 is None: - arg2 = arg1 - pairwise = True if pairwise is None else pairwise # only default unset - return ensure_compat('rolling', - 'cov', - arg1, - other=arg2, - window=window, - pairwise=pairwise, - func_kw=['other', 'pairwise', 'ddof'], - **kwargs) - - -@Substitution("Moving sample correlation.", _binary_arg_flex, - _roll_kw % 'None' + _pairwise_kw, _flex_retval, _roll_notes) -@Appender(_doc_template) -def rolling_corr(arg1, arg2=None, window=None, pairwise=None, **kwargs): - if window is None and isinstance(arg2, (int, float)): - window = arg2 - arg2 = arg1 - pairwise = True if pairwise is None else pairwise # only default unset - elif arg2 is None: - arg2 = arg1 - pairwise = True if pairwise is None else pairwise # only default unset - return ensure_compat('rolling', - 'corr', - arg1, - other=arg2, - window=window, - pairwise=pairwise, - func_kw=['other', 'pairwise'], - **kwargs) - - -# ----------------------------------------------------------------------------- -# Exponential moving moments - - -@Substitution("Exponentially-weighted moving average", _unary_arg, _ewm_kw, - _type_of_input_retval, _ewm_notes) -@Appender(_doc_template) -def ewma(arg, com=None, span=None, halflife=None, alpha=None, min_periods=0, - freq=None, adjust=True, how=None, ignore_na=False): - return ensure_compat('ewm', - 'mean', - arg, - com=com, - span=span, - halflife=halflife, - alpha=alpha, - min_periods=min_periods, - freq=freq, - adjust=adjust, - how=how, - ignore_na=ignore_na) - - -@Substitution("Exponentially-weighted moving variance", _unary_arg, - _ewm_kw + _bias_kw, _type_of_input_retval, _ewm_notes) -@Appender(_doc_template) -def ewmvar(arg, com=None, span=None, halflife=None, alpha=None, min_periods=0, - bias=False, freq=None, how=None, ignore_na=False, adjust=True): - return ensure_compat('ewm', - 'var', - arg, - com=com, - span=span, - halflife=halflife, - alpha=alpha, - min_periods=min_periods, - freq=freq, - adjust=adjust, - how=how, - ignore_na=ignore_na, - bias=bias, - func_kw=['bias']) - - -@Substitution("Exponentially-weighted moving std", _unary_arg, - _ewm_kw + _bias_kw, _type_of_input_retval, _ewm_notes) -@Appender(_doc_template) -def ewmstd(arg, com=None, span=None, halflife=None, alpha=None, min_periods=0, - bias=False, freq=None, how=None, ignore_na=False, adjust=True): - return ensure_compat('ewm', - 'std', - arg, - com=com, - span=span, - halflife=halflife, - alpha=alpha, - min_periods=min_periods, - freq=freq, - adjust=adjust, - how=how, - ignore_na=ignore_na, - bias=bias, - func_kw=['bias']) - - -ewmvol = ewmstd - - -@Substitution("Exponentially-weighted moving covariance", _binary_arg_flex, - _ewm_kw + _pairwise_kw, _type_of_input_retval, _ewm_notes) -@Appender(_doc_template) -def ewmcov(arg1, arg2=None, com=None, span=None, halflife=None, alpha=None, - min_periods=0, bias=False, freq=None, pairwise=None, how=None, - ignore_na=False, adjust=True): - if arg2 is None: - arg2 = arg1 - pairwise = True if pairwise is None else pairwise - elif isinstance(arg2, (int, float)) and com is None: - com = 
arg2 - arg2 = arg1 - pairwise = True if pairwise is None else pairwise - - return ensure_compat('ewm', - 'cov', - arg1, - other=arg2, - com=com, - span=span, - halflife=halflife, - alpha=alpha, - min_periods=min_periods, - bias=bias, - freq=freq, - how=how, - ignore_na=ignore_na, - adjust=adjust, - pairwise=pairwise, - func_kw=['other', 'pairwise', 'bias']) - - -@Substitution("Exponentially-weighted moving correlation", _binary_arg_flex, - _ewm_kw + _pairwise_kw, _type_of_input_retval, _ewm_notes) -@Appender(_doc_template) -def ewmcorr(arg1, arg2=None, com=None, span=None, halflife=None, alpha=None, - min_periods=0, freq=None, pairwise=None, how=None, ignore_na=False, - adjust=True): - if arg2 is None: - arg2 = arg1 - pairwise = True if pairwise is None else pairwise - elif isinstance(arg2, (int, float)) and com is None: - com = arg2 - arg2 = arg1 - pairwise = True if pairwise is None else pairwise - return ensure_compat('ewm', - 'corr', - arg1, - other=arg2, - com=com, - span=span, - halflife=halflife, - alpha=alpha, - min_periods=min_periods, - freq=freq, - how=how, - ignore_na=ignore_na, - adjust=adjust, - pairwise=pairwise, - func_kw=['other', 'pairwise']) - -# --------------------------------------------------------------------- -# Python interface to Cython functions - - -def _rolling_func(name, desc, how=None, func_kw=None, additional_kw=''): - if how is None: - how_arg_str = 'None' - else: - how_arg_str = "'{how}".format(how=how) - - @Substitution(desc, _unary_arg, _roll_kw % how_arg_str + additional_kw, - _type_of_input_retval, _roll_notes) - @Appender(_doc_template) - def f(arg, window, min_periods=None, freq=None, center=False, - **kwargs): - - return ensure_compat('rolling', - name, - arg, - window=window, - min_periods=min_periods, - freq=freq, - center=center, - func_kw=func_kw, - **kwargs) - return f - - -rolling_max = _rolling_func('max', 'Moving maximum.', how='max') -rolling_min = _rolling_func('min', 'Moving minimum.', how='min') -rolling_sum = _rolling_func('sum', 'Moving sum.') -rolling_mean = _rolling_func('mean', 'Moving mean.') -rolling_median = _rolling_func('median', 'Moving median.', how='median') -rolling_std = _rolling_func('std', 'Moving standard deviation.', - func_kw=['ddof'], - additional_kw=_ddof_kw) -rolling_var = _rolling_func('var', 'Moving variance.', - func_kw=['ddof'], - additional_kw=_ddof_kw) -rolling_skew = _rolling_func('skew', 'Unbiased moving skewness.') -rolling_kurt = _rolling_func('kurt', 'Unbiased moving kurtosis.') - - -def rolling_quantile(arg, window, quantile, min_periods=None, freq=None, - center=False): - """Moving quantile. - - Parameters - ---------- - arg : Series, DataFrame - window : int - Size of the moving window. This is the number of observations used for - calculating the statistic. - quantile : float - 0 <= quantile <= 1 - min_periods : int, default None - Minimum number of observations in window required to have a value - (otherwise result is NA). - freq : string or DateOffset object, optional (default None) - Frequency to conform the data to before computing the - statistic. Specified as a frequency string or DateOffset object. - center : boolean, default False - Whether the label should correspond with center of window - - Returns - ------- - y : type of input argument - - Notes - ----- - By default, the result is set to the right edge of the window. This can be - changed to the center of the window by setting ``center=True``. 
- - The `freq` keyword is used to conform time series data to a specified - frequency by resampling the data. This is done with the default parameters - of :meth:`~pandas.Series.resample` (i.e. using the `mean`). - - To learn more about the frequency strings, please see `this link - `__. - """ - return ensure_compat('rolling', - 'quantile', - arg, - window=window, - freq=freq, - center=center, - min_periods=min_periods, - func_kw=['quantile'], - quantile=quantile) - - -def rolling_apply(arg, window, func, min_periods=None, freq=None, - center=False, args=(), kwargs={}): - """Generic moving function application. - - Parameters - ---------- - arg : Series, DataFrame - window : int - Size of the moving window. This is the number of observations used for - calculating the statistic. - func : function - Must produce a single value from an ndarray input - min_periods : int, default None - Minimum number of observations in window required to have a value - (otherwise result is NA). - freq : string or DateOffset object, optional (default None) - Frequency to conform the data to before computing the - statistic. Specified as a frequency string or DateOffset object. - center : boolean, default False - Whether the label should correspond with center of window - args : tuple - Passed on to func - kwargs : dict - Passed on to func - - Returns - ------- - y : type of input argument - - Notes - ----- - By default, the result is set to the right edge of the window. This can be - changed to the center of the window by setting ``center=True``. - - The `freq` keyword is used to conform time series data to a specified - frequency by resampling the data. This is done with the default parameters - of :meth:`~pandas.Series.resample` (i.e. using the `mean`). - - To learn more about the frequency strings, please see `this link - `__. - """ - return ensure_compat('rolling', - 'apply', - arg, - window=window, - freq=freq, - center=center, - min_periods=min_periods, - func_kw=['func', 'args', 'kwargs'], - func=func, - args=args, - kwargs=kwargs) - - -def rolling_window(arg, window=None, win_type=None, min_periods=None, - freq=None, center=False, mean=True, - axis=0, how=None, **kwargs): - """ - Applies a moving window of type ``window_type`` and size ``window`` - on the data. - - Parameters - ---------- - arg : Series, DataFrame - window : int or ndarray - Weighting window specification. If the window is an integer, then it is - treated as the window length and win_type is required - win_type : str, default None - Window type (see Notes) - min_periods : int, default None - Minimum number of observations in window required to have a value - (otherwise result is NA). - freq : string or DateOffset object, optional (default None) - Frequency to conform the data to before computing the - statistic. Specified as a frequency string or DateOffset object. - center : boolean, default False - Whether the label should correspond with center of window - mean : boolean, default True - If True computes weighted mean, else weighted sum - axis : {0, 1}, default 0 - how : string, default 'mean' - Method for down- or re-sampling - - Returns - ------- - y : type of input argument - - Notes - ----- - The recognized window types are: - - * ``boxcar`` - * ``triang`` - * ``blackman`` - * ``hamming`` - * ``bartlett`` - * ``parzen`` - * ``bohman`` - * ``blackmanharris`` - * ``nuttall`` - * ``barthann`` - * ``kaiser`` (needs beta) - * ``gaussian`` (needs std) - * ``general_gaussian`` (needs power, width) - * ``slepian`` (needs width). 
- - By default, the result is set to the right edge of the window. This can be - changed to the center of the window by setting ``center=True``. - - The `freq` keyword is used to conform time series data to a specified - frequency by resampling the data. This is done with the default parameters - of :meth:`~pandas.Series.resample` (i.e. using the `mean`). - - To learn more about the frequency strings, please see `this link - `__. - """ - func = 'mean' if mean else 'sum' - return ensure_compat('rolling', - func, - arg, - window=window, - win_type=win_type, - freq=freq, - center=center, - min_periods=min_periods, - axis=axis, - func_kw=kwargs.keys(), - **kwargs) - - -def _expanding_func(name, desc, func_kw=None, additional_kw=''): - @Substitution(desc, _unary_arg, _expanding_kw + additional_kw, - _type_of_input_retval, "") - @Appender(_doc_template) - def f(arg, min_periods=1, freq=None, **kwargs): - return ensure_compat('expanding', - name, - arg, - min_periods=min_periods, - func_kw=func_kw, - **kwargs) - return f - - -expanding_max = _expanding_func('max', 'Expanding maximum.') -expanding_min = _expanding_func('min', 'Expanding minimum.') -expanding_sum = _expanding_func('sum', 'Expanding sum.') -expanding_mean = _expanding_func('mean', 'Expanding mean.') -expanding_median = _expanding_func('median', 'Expanding median.') - -expanding_std = _expanding_func('std', 'Expanding standard deviation.', - func_kw=['ddof'], - additional_kw=_ddof_kw) -expanding_var = _expanding_func('var', 'Expanding variance.', - func_kw=['ddof'], - additional_kw=_ddof_kw) -expanding_skew = _expanding_func('skew', 'Unbiased expanding skewness.') -expanding_kurt = _expanding_func('kurt', 'Unbiased expanding kurtosis.') - - -def expanding_count(arg, freq=None): - """ - Expanding count of number of non-NaN observations. - - Parameters - ---------- - arg : DataFrame or numpy ndarray-like - freq : string or DateOffset object, optional (default None) - Frequency to conform the data to before computing the - statistic. Specified as a frequency string or DateOffset object. - - Returns - ------- - expanding_count : type of caller - - Notes - ----- - The `freq` keyword is used to conform time series data to a specified - frequency by resampling the data. This is done with the default parameters - of :meth:`~pandas.Series.resample` (i.e. using the `mean`). - - To learn more about the frequency strings, please see `this link - `__. - """ - return ensure_compat('expanding', 'count', arg, freq=freq) - - -def expanding_quantile(arg, quantile, min_periods=1, freq=None): - """Expanding quantile. - - Parameters - ---------- - arg : Series, DataFrame - quantile : float - 0 <= quantile <= 1 - min_periods : int, default None - Minimum number of observations in window required to have a value - (otherwise result is NA). - freq : string or DateOffset object, optional (default None) - Frequency to conform the data to before computing the - statistic. Specified as a frequency string or DateOffset object. - - Returns - ------- - y : type of input argument - - Notes - ----- - The `freq` keyword is used to conform time series data to a specified - frequency by resampling the data. This is done with the default parameters - of :meth:`~pandas.Series.resample` (i.e. using the `mean`). - - To learn more about the frequency strings, please see `this link - `__. 
- """ - return ensure_compat('expanding', - 'quantile', - arg, - freq=freq, - min_periods=min_periods, - func_kw=['quantile'], - quantile=quantile) - - -@Substitution("Unbiased expanding covariance.", _binary_arg_flex, - _expanding_kw + _pairwise_kw + _ddof_kw, _flex_retval, "") -@Appender(_doc_template) -def expanding_cov(arg1, arg2=None, min_periods=1, freq=None, - pairwise=None, ddof=1): - if arg2 is None: - arg2 = arg1 - pairwise = True if pairwise is None else pairwise - elif isinstance(arg2, (int, float)) and min_periods is None: - min_periods = arg2 - arg2 = arg1 - pairwise = True if pairwise is None else pairwise - return ensure_compat('expanding', - 'cov', - arg1, - other=arg2, - min_periods=min_periods, - pairwise=pairwise, - freq=freq, - ddof=ddof, - func_kw=['other', 'pairwise', 'ddof']) - - -@Substitution("Expanding sample correlation.", _binary_arg_flex, - _expanding_kw + _pairwise_kw, _flex_retval, "") -@Appender(_doc_template) -def expanding_corr(arg1, arg2=None, min_periods=1, freq=None, pairwise=None): - if arg2 is None: - arg2 = arg1 - pairwise = True if pairwise is None else pairwise - elif isinstance(arg2, (int, float)) and min_periods is None: - min_periods = arg2 - arg2 = arg1 - pairwise = True if pairwise is None else pairwise - return ensure_compat('expanding', - 'corr', - arg1, - other=arg2, - min_periods=min_periods, - pairwise=pairwise, - freq=freq, - func_kw=['other', 'pairwise', 'ddof']) - - -def expanding_apply(arg, func, min_periods=1, freq=None, - args=(), kwargs={}): - """Generic expanding function application. - - Parameters - ---------- - arg : Series, DataFrame - func : function - Must produce a single value from an ndarray input - min_periods : int, default None - Minimum number of observations in window required to have a value - (otherwise result is NA). - freq : string or DateOffset object, optional (default None) - Frequency to conform the data to before computing the - statistic. Specified as a frequency string or DateOffset object. - args : tuple - Passed on to func - kwargs : dict - Passed on to func - - Returns - ------- - y : type of input argument - - Notes - ----- - The `freq` keyword is used to conform time series data to a specified - frequency by resampling the data. This is done with the default parameters - of :meth:`~pandas.Series.resample` (i.e. using the `mean`). - - To learn more about the frequency strings, please see `this link - `__. 
- """ - return ensure_compat('expanding', - 'apply', - arg, - freq=freq, - min_periods=min_periods, - func_kw=['func', 'args', 'kwargs'], - func=func, - args=args, - kwargs=kwargs) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index c20767b09178c..ea6c250420b13 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -35,8 +35,7 @@ class TestPDApi(Base): 'util', 'options', 'io'] # these are already deprecated; awaiting removal - deprecated_modules = ['stats', 'datetools', 'parser', - 'json', 'lib', 'tslib'] + deprecated_modules = ['datetools', 'parser', 'json', 'lib', 'tslib'] # misc misc = ['IndexSlice', 'NaT'] @@ -91,19 +90,7 @@ class TestPDApi(Base): deprecated_funcs_in_future = [] # these are already deprecated; awaiting removal - deprecated_funcs = ['ewma', 'ewmcorr', 'ewmcov', 'ewmstd', 'ewmvar', - 'ewmvol', 'expanding_apply', 'expanding_corr', - 'expanding_count', 'expanding_cov', 'expanding_kurt', - 'expanding_max', 'expanding_mean', 'expanding_median', - 'expanding_min', 'expanding_quantile', - 'expanding_skew', 'expanding_std', 'expanding_sum', - 'expanding_var', 'rolling_apply', - 'rolling_corr', 'rolling_count', 'rolling_cov', - 'rolling_kurt', 'rolling_max', 'rolling_mean', - 'rolling_median', 'rolling_min', 'rolling_quantile', - 'rolling_skew', 'rolling_std', 'rolling_sum', - 'rolling_var', 'rolling_window', - 'pnow', 'match', 'groupby', 'get_store', + deprecated_funcs = ['pnow', 'match', 'groupby', 'get_store', 'plot_params', 'scatter_matrix'] def test_api(self): diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 6f9e872526d0a..22526d14a7168 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -1,6 +1,5 @@ from itertools import product import pytest -import sys import warnings from warnings import catch_warnings @@ -9,16 +8,15 @@ import numpy as np import pandas as pd -from pandas import (Series, DataFrame, bdate_range, isna, - notna, concat, Timestamp, Index) -import pandas.stats.moments as mom +from pandas import (Series, DataFrame, bdate_range, + isna, notna, concat, Timestamp, Index) import pandas.core.window as rwindow import pandas.tseries.offsets as offsets from pandas.core.base import SpecificationError from pandas.errors import UnsupportedFunctionCall import pandas.util.testing as tm import pandas.util._test_decorators as td -from pandas.compat import range, zip, PY3 +from pandas.compat import range, zip N, K = 100, 10 @@ -610,19 +608,6 @@ def test_numpy_compat(self): getattr(e, func), dtype=np.float64) -class TestDeprecations(Base): - """ test that we are catching deprecation warnings """ - - def setup_method(self, method): - self._create_data() - - def test_deprecations(self): - - with catch_warnings(record=True): - mom.rolling_mean(np.ones(10), 3, center=True, axis=0) - mom.rolling_mean(Series(np.ones(10)), 3, center=True, axis=0) - - # gh-12373 : rolling functions error on float32 data # make sure rolling functions works for different dtypes # @@ -863,72 +848,55 @@ def test_centered_axis_validation(self): .rolling(window=3, center=True, axis=2).mean()) def test_rolling_sum(self): - self._check_moment_func(mom.rolling_sum, np.nansum, name='sum', + self._check_moment_func(np.nansum, name='sum', zero_min_periods_equal=False) def test_rolling_count(self): counter = lambda x: np.isfinite(x).astype(float).sum() - self._check_moment_func(mom.rolling_count, counter, name='count', - has_min_periods=False, preserve_nan=False, + self._check_moment_func(counter, 
name='count', has_min_periods=False, fill_value=0) def test_rolling_mean(self): - self._check_moment_func(mom.rolling_mean, np.mean, name='mean') + self._check_moment_func(np.mean, name='mean') @td.skip_if_no_scipy def test_cmov_mean(self): # GH 8238 vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48]) - xp = np.array([np.nan, np.nan, 9.962, 11.27, 11.564, 12.516, 12.818, - 12.952, np.nan, np.nan]) - - with catch_warnings(record=True): - rs = mom.rolling_mean(vals, 5, center=True) - tm.assert_almost_equal(xp, rs) - - xp = Series(rs) - rs = Series(vals).rolling(5, center=True).mean() - tm.assert_series_equal(xp, rs) + result = Series(vals).rolling(5, center=True).mean() + expected = Series([np.nan, np.nan, 9.962, 11.27, 11.564, 12.516, + 12.818, 12.952, np.nan, np.nan]) + tm.assert_series_equal(expected, result) @td.skip_if_no_scipy def test_cmov_window(self): # GH 8238 vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48]) - xp = np.array([np.nan, np.nan, 9.962, 11.27, 11.564, 12.516, 12.818, - 12.952, np.nan, np.nan]) - - with catch_warnings(record=True): - rs = mom.rolling_window(vals, 5, 'boxcar', center=True) - tm.assert_almost_equal(xp, rs) - - xp = Series(rs) - rs = Series(vals).rolling(5, win_type='boxcar', center=True).mean() - tm.assert_series_equal(xp, rs) + result = Series(vals).rolling(5, win_type='boxcar', center=True).mean() + expected = Series([np.nan, np.nan, 9.962, 11.27, 11.564, 12.516, + 12.818, 12.952, np.nan, np.nan]) + tm.assert_series_equal(expected, result) @td.skip_if_no_scipy def test_cmov_window_corner(self): # GH 8238 # all nan - vals = np.empty(10, dtype=float) - vals.fill(np.nan) - with catch_warnings(record=True): - rs = mom.rolling_window(vals, 5, 'boxcar', center=True) - assert np.isnan(rs).all() + vals = pd.Series([np.nan] * 10) + result = vals.rolling(5, center=True, win_type='boxcar').mean() + assert np.isnan(result).all() # empty - vals = np.array([]) - with catch_warnings(record=True): - rs = mom.rolling_window(vals, 5, 'boxcar', center=True) - assert len(rs) == 0 + vals = pd.Series([]) + result = vals.rolling(5, center=True, win_type='boxcar').mean() + assert len(result) == 0 # shorter than window - vals = np.random.randn(5) - with catch_warnings(record=True): - rs = mom.rolling_window(vals, 10, 'boxcar') - assert np.isnan(rs).all() - assert len(rs) == 5 + vals = pd.Series(np.random.randn(5)) + result = vals.rolling(10, win_type='boxcar').mean() + assert np.isnan(result).all() + assert len(result) == 5 @td.skip_if_no_scipy def test_cmov_window_frame(self): @@ -1097,38 +1065,31 @@ def test_cmov_window_special_linear_range(self): tm.assert_series_equal(xp, rs) def test_rolling_median(self): - with catch_warnings(record=True): - self._check_moment_func(mom.rolling_median, np.median, - name='median') + self._check_moment_func(np.median, name='median') def test_rolling_min(self): + self._check_moment_func(np.min, name='min') - with catch_warnings(record=True): - self._check_moment_func(mom.rolling_min, np.min, name='min') - - with catch_warnings(record=True): - a = np.array([1, 2, 3, 4, 5]) - b = mom.rolling_min(a, window=100, min_periods=1) - tm.assert_almost_equal(b, np.ones(len(a))) + a = pd.Series([1, 2, 3, 4, 5]) + result = a.rolling(window=100, min_periods=1).min() + expected = pd.Series(np.ones(len(a))) + tm.assert_series_equal(result, expected) - pytest.raises(ValueError, mom.rolling_min, np.array([1, 2, 3]), - window=3, min_periods=5) + with pytest.raises(ValueError): + pd.Series([1, 2, 
3]).rolling(window=3, min_periods=5).min() def test_rolling_max(self): + self._check_moment_func(np.max, name='max') - with catch_warnings(record=True): - self._check_moment_func(mom.rolling_max, np.max, name='max') + a = pd.Series([1, 2, 3, 4, 5], dtype=np.float64) + b = a.rolling(window=100, min_periods=1).max() + tm.assert_almost_equal(a, b) - with catch_warnings(record=True): - a = np.array([1, 2, 3, 4, 5], dtype=np.float64) - b = mom.rolling_max(a, window=100, min_periods=1) - tm.assert_almost_equal(a, b) - - pytest.raises(ValueError, mom.rolling_max, np.array([1, 2, 3]), - window=3, min_periods=5) + with pytest.raises(ValueError): + pd.Series([1, 2, 3]).rolling(window=3, min_periods=5).max() - def test_rolling_quantile(self): - qs = [0.0, .1, .5, .9, 1.0] + @pytest.mark.parametrize('q', [0.0, .1, .5, .9, 1.0]) + def test_rolling_quantile(self, q): def scoreatpercentile(a, per): values = np.sort(a, axis=0) @@ -1147,18 +1108,11 @@ def scoreatpercentile(a, per): return retval - for q in qs: - - def f(x, window, quantile, min_periods=None, freq=None, - center=False): - return mom.rolling_quantile(x, window, quantile, - min_periods=min_periods, freq=freq, - center=center) + def quantile_func(x): + return scoreatpercentile(x, q) - def alt(x): - return scoreatpercentile(x, q) - - self._check_moment_func(f, alt, name='quantile', quantile=q) + self._check_moment_func(quantile_func, name='quantile', + quantile=q) def test_rolling_quantile_np_percentile(self): # #9413: Tests that rolling window's quantile default behavior @@ -1207,15 +1161,10 @@ def test_rolling_apply(self): tm.assert_series_equal(ser, ser.rolling(10).apply(lambda x: x.mean())) - f = lambda x: x[np.isfinite(x)].mean() - - def roll_mean(x, window, min_periods=None, freq=None, center=False, - **kwargs): - return mom.rolling_apply(x, window, func=f, - min_periods=min_periods, freq=freq, - center=center) + def f(x): + return x[np.isfinite(x)].mean() - self._check_moment_func(roll_mean, np.mean, name='apply', func=f) + self._check_moment_func(np.mean, name='apply', func=f) # GH 8080 s = Series([None, None, None]) @@ -1228,39 +1177,34 @@ def roll_mean(x, window, min_periods=None, freq=None, center=False, def test_rolling_apply_out_of_bounds(self): # #1850 - arr = np.arange(4) + vals = pd.Series([1, 2, 3, 4]) - # it works! 
- with catch_warnings(record=True): - result = mom.rolling_apply(arr, 10, np.sum) - assert isna(result).all() + result = vals.rolling(10).apply(np.sum) + assert result.isna().all() - with catch_warnings(record=True): - result = mom.rolling_apply(arr, 10, np.sum, min_periods=1) - tm.assert_almost_equal(result, result) + result = vals.rolling(10, min_periods=1).apply(np.sum) + expected = pd.Series([1, 3, 6, 10], dtype=float) + tm.assert_almost_equal(result, expected) def test_rolling_std(self): - self._check_moment_func(mom.rolling_std, lambda x: np.std(x, ddof=1), + self._check_moment_func(lambda x: np.std(x, ddof=1), name='std') - self._check_moment_func(mom.rolling_std, lambda x: np.std(x, ddof=0), + self._check_moment_func(lambda x: np.std(x, ddof=0), name='std', ddof=0) def test_rolling_std_1obs(self): - with catch_warnings(record=True): - result = mom.rolling_std(np.array([1., 2., 3., 4., 5.]), - 1, min_periods=1) - expected = np.array([np.nan] * 5) - tm.assert_almost_equal(result, expected) + vals = pd.Series([1., 2., 3., 4., 5.]) - with catch_warnings(record=True): - result = mom.rolling_std(np.array([1., 2., 3., 4., 5.]), - 1, min_periods=1, ddof=0) - expected = np.zeros(5) - tm.assert_almost_equal(result, expected) + result = vals.rolling(1, min_periods=1).std() + expected = pd.Series([np.nan] * 5) + tm.assert_series_equal(result, expected) - with catch_warnings(record=True): - result = mom.rolling_std(np.array([np.nan, np.nan, 3., 4., 5.]), - 3, min_periods=2) + result = vals.rolling(1, min_periods=1).std(ddof=0) + expected = pd.Series([0.] * 5) + tm.assert_series_equal(result, expected) + + result = (pd.Series([np.nan, np.nan, 3, 4, 5]) + .rolling(3, min_periods=2).std()) assert np.isnan(result[2]) def test_rolling_std_neg_sqrt(self): @@ -1268,208 +1212,53 @@ def test_rolling_std_neg_sqrt(self): # Test move_nanstd for neg sqrt. 
- a = np.array([0.0011448196318903589, 0.00028718669878572767, - 0.00028718669878572767, 0.00028718669878572767, - 0.00028718669878572767]) - with catch_warnings(record=True): - b = mom.rolling_std(a, window=3) + a = pd.Series([0.0011448196318903589, 0.00028718669878572767, + 0.00028718669878572767, 0.00028718669878572767, + 0.00028718669878572767]) + b = a.rolling(window=3).std() assert np.isfinite(b[2:]).all() - with catch_warnings(record=True): - b = mom.ewmstd(a, span=3) + b = a.ewm(span=3).std() assert np.isfinite(b[2:]).all() def test_rolling_var(self): - self._check_moment_func(mom.rolling_var, lambda x: np.var(x, ddof=1), - test_stable=True, name='var') - self._check_moment_func(mom.rolling_var, lambda x: np.var(x, ddof=0), + self._check_moment_func(lambda x: np.var(x, ddof=1), + name='var') + self._check_moment_func(lambda x: np.var(x, ddof=0), name='var', ddof=0) @td.skip_if_no_scipy def test_rolling_skew(self): from scipy.stats import skew - self._check_moment_func(mom.rolling_skew, - lambda x: skew(x, bias=False), name='skew') + self._check_moment_func(lambda x: skew(x, bias=False), name='skew') @td.skip_if_no_scipy def test_rolling_kurt(self): from scipy.stats import kurtosis - self._check_moment_func(mom.rolling_kurt, - lambda x: kurtosis(x, bias=False), name='kurt') - - def test_fperr_robustness(self): - # TODO: remove this once python 2.5 out of picture - if PY3: - pytest.skip("doesn't work on python 3") - - # #2114 - data = '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1a@\xaa\xaa\xaa\xaa\xaa\xaa\x02@8\x8e\xe38\x8e\xe3\xe8?z\t\xed%\xb4\x97\xd0?\xa2\x0c<\xdd\x9a\x1f\xb6?\x82\xbb\xfa&y\x7f\x9d?\xac\'\xa7\xc4P\xaa\x83?\x90\xdf\xde\xb0k8j?`\xea\xe9u\xf2zQ?*\xe37\x9d\x98N7?\xe2.\xf5&v\x13\x1f?\xec\xc9\xf8\x19\xa4\xb7\x04?\x90b\xf6w\x85\x9f\xeb>\xb5A\xa4\xfaXj\xd2>F\x02\xdb\xf8\xcb\x8d\xb8>.\xac<\xfb\x87^\xa0>\xe8:\xa6\xf9_\xd3\x85>\xfb?\xe2cUU\xfd?\xfc\x7fA\xed8\x8e\xe3?\xa5\xaa\xac\x91\xf6\x12\xca?n\x1cs\xb6\xf9a\xb1?\xe8%D\xf3L-\x97?5\xddZD\x11\xe7~?#>\xe7\x82\x0b\x9ad?\xd9R4Y\x0fxK?;7x;\nP2?N\xf4JO\xb8j\x18?4\xf81\x8a%G\x00?\x9a\xf5\x97\r2\xb4\xe5>\xcd\x9c\xca\xbcB\xf0\xcc>3\x13\x87(\xd7J\xb3>\x99\x19\xb4\xe0\x1e\xb9\x99>ff\xcd\x95\x14&\x81>\x88\x88\xbc\xc7p\xddf>`\x0b\xa6_\x96|N>@\xb2n\xea\x0eS4>U\x98\x938i\x19\x1b>\x8eeb\xd0\xf0\x10\x02>\xbd\xdc-k\x96\x16\xe8=(\x93\x1e\xf2\x0e\x0f\xd0=\xe0n\xd3Bii\xb5=*\xe9\x19Y\x8c\x8c\x9c=\xc6\xf0\xbb\x90]\x08\x83=]\x96\xfa\xc0|`i=>d\xfc\xd5\xfd\xeaP=R0\xfb\xc7\xa7\x8e6=\xc2\x95\xf9_\x8a\x13\x1e=\xd6c\xa6\xea\x06\r\x04=r\xda\xdd8\t\xbc\xea<\xf6\xe6\x93\xd0\xb0\xd2\xd1<\x9d\xdeok\x96\xc3\xb7<&~\xea9s\xaf\x9f\xb8\x02@\xc6\xd2&\xfd\xa8\xf5\xe8?\xd9\xe1\x19\xfe\xc5\xa3\xd0?v\x82"\xa8\xb2/\xb6?\x9dX\x835\xee\x94\x9d?h\x90W\xce\x9e\xb8\x83?\x8a\xc0th~Kj?\\\x80\xf8\x9a\xa9\x87Q?%\xab\xa0\xce\x8c_7?1\xe4\x80\x13\x11*\x1f? 
\x98\x00\r\xb6\xc6\x04?\x80u\xabf\x9d\xb3\xeb>UNrD\xbew\xd2>\x1c\x13C[\xa8\x9f\xb8>\x12b\xd7m-\x1fQ@\xe3\x85>\xe6\x91)l\x00/m>Da\xc6\xf2\xaatS>\x05\xd7]\xee\xe3\xf09>' # noqa - - arr = np.frombuffer(data, dtype='= 0).all() - - with catch_warnings(record=True): - result = mom.rolling_mean(arr, 2) - assert (result[1:] >= 0).all() - - with catch_warnings(record=True): - result = mom.rolling_var(arr, 2) - assert (result[1:] >= 0).all() + self._check_moment_func(lambda x: kurtosis(x, bias=False), + name='kurt') - # #2527, ugh - arr = np.array([0.00012456, 0.0003, 0]) - with catch_warnings(record=True): - result = mom.rolling_mean(arr, 1) - assert result[-1] >= 0 - - with catch_warnings(record=True): - result = mom.rolling_mean(-arr, 1) - assert result[-1] <= 0 - - def _check_moment_func(self, f, static_comp, name=None, window=50, - has_min_periods=True, has_center=True, - has_time_rule=True, preserve_nan=True, - fill_value=None, test_stable=False, - zero_min_periods_equal=True, + def _check_moment_func(self, static_comp, name, has_min_periods=True, + has_center=True, has_time_rule=True, + fill_value=None, zero_min_periods_equal=True, **kwargs): - with warnings.catch_warnings(record=True): - self._check_ndarray(f, static_comp, window=window, - has_min_periods=has_min_periods, - preserve_nan=preserve_nan, - has_center=has_center, fill_value=fill_value, - test_stable=test_stable, - zero_min_periods_equal=zero_min_periods_equal, - **kwargs) - - with warnings.catch_warnings(record=True): - self._check_structures(f, static_comp, - has_min_periods=has_min_periods, - has_time_rule=has_time_rule, - fill_value=fill_value, - has_center=has_center, **kwargs) - - # new API - if name is not None: - self._check_structures(f, static_comp, name=name, - has_min_periods=has_min_periods, - has_time_rule=has_time_rule, - fill_value=fill_value, - has_center=has_center, **kwargs) - - def _check_ndarray(self, f, static_comp, window=50, has_min_periods=True, - preserve_nan=True, has_center=True, fill_value=None, - test_stable=False, test_window=True, - zero_min_periods_equal=True, **kwargs): - def get_result(arr, window, min_periods=None, center=False): - return f(arr, window, min_periods=min_periods, center=center, ** - kwargs) - - result = get_result(self.arr, window) - tm.assert_almost_equal(result[-1], static_comp(self.arr[-50:])) - - if preserve_nan: - assert (np.isnan(result[self._nan_locs]).all()) - - # excluding NaNs correctly - arr = randn(50) - arr[:10] = np.NaN - arr[-10:] = np.NaN - - if has_min_periods: - result = get_result(arr, 50, min_periods=30) - tm.assert_almost_equal(result[-1], static_comp(arr[10:-10])) - - # min_periods is working correctly - result = get_result(arr, 20, min_periods=15) - assert np.isnan(result[23]) - assert not np.isnan(result[24]) - - assert not np.isnan(result[-6]) - assert np.isnan(result[-5]) - - arr2 = randn(20) - result = get_result(arr2, 10, min_periods=5) - assert isna(result[3]) - assert notna(result[4]) - - if zero_min_periods_equal: - # min_periods=0 may be equivalent to min_periods=1 - result0 = get_result(arr, 20, min_periods=0) - result1 = get_result(arr, 20, min_periods=1) - tm.assert_almost_equal(result0, result1) - else: - result = get_result(arr, 50) - tm.assert_almost_equal(result[-1], static_comp(arr[10:-10])) - - # GH 7925 - if has_center: - if has_min_periods: - result = get_result(arr, 20, min_periods=15, center=True) - expected = get_result( - np.concatenate((arr, np.array([np.NaN] * 9))), 20, - min_periods=15)[9:] - else: - result = get_result(arr, 20, 
center=True) - expected = get_result( - np.concatenate((arr, np.array([np.NaN] * 9))), 20)[9:] - - tm.assert_numpy_array_equal(result, expected) - - if test_stable: - result = get_result(self.arr + 1e9, window) - tm.assert_almost_equal(result[-1], - static_comp(self.arr[-50:] + 1e9)) - - # Test window larger than array, #7297 - if test_window: - if has_min_periods: - for minp in (0, len(self.arr) - 1, len(self.arr)): - result = get_result(self.arr, len(self.arr) + 1, - min_periods=minp) - expected = get_result(self.arr, len(self.arr), - min_periods=minp) - nan_mask = np.isnan(result) - tm.assert_numpy_array_equal(nan_mask, np.isnan(expected)) - - nan_mask = ~nan_mask - tm.assert_almost_equal(result[nan_mask], - expected[nan_mask]) - else: - result = get_result(self.arr, len(self.arr) + 1) - expected = get_result(self.arr, len(self.arr)) - nan_mask = np.isnan(result) - tm.assert_numpy_array_equal(nan_mask, np.isnan(expected)) - - nan_mask = ~nan_mask - tm.assert_almost_equal(result[nan_mask], expected[nan_mask]) - - def _check_structures(self, f, static_comp, name=None, - has_min_periods=True, has_time_rule=True, - has_center=True, fill_value=None, **kwargs): def get_result(obj, window, min_periods=None, center=False): - - # check via the API calls if name is provided - if name is not None: - r = obj.rolling(window=window, min_periods=min_periods, - center=center) - return getattr(r, name)(**kwargs) - - # check via the moments API - with catch_warnings(record=True): - return f(obj, window=window, min_periods=min_periods, - center=center, **kwargs) + r = obj.rolling(window=window, min_periods=min_periods, + center=center) + return getattr(r, name)(**kwargs) series_result = get_result(self.series, window=50) - frame_result = get_result(self.frame, window=50) - assert isinstance(series_result, Series) - assert type(frame_result) == DataFrame + tm.assert_almost_equal(series_result.iloc[-1], + static_comp(self.series[-50:])) + + frame_result = get_result(self.frame, window=50) + assert isinstance(frame_result, DataFrame) + tm.assert_series_equal(frame_result.iloc[-1, :], + self.frame.iloc[-50:, :].apply(static_comp, + axis=0), + check_names=False) # check time_rule works if has_time_rule: @@ -1500,8 +1289,72 @@ def get_result(obj, window, min_periods=None, center=False): trunc_frame.apply(static_comp), check_names=False) - # GH 7925 + # excluding NaNs correctly + obj = Series(randn(50)) + obj[:10] = np.NaN + obj[-10:] = np.NaN + if has_min_periods: + result = get_result(obj, 50, min_periods=30) + tm.assert_almost_equal(result.iloc[-1], static_comp(obj[10:-10])) + + # min_periods is working correctly + result = get_result(obj, 20, min_periods=15) + assert isna(result.iloc[23]) + assert not isna(result.iloc[24]) + + assert not isna(result.iloc[-6]) + assert isna(result.iloc[-5]) + + obj2 = Series(randn(20)) + result = get_result(obj2, 10, min_periods=5) + assert isna(result.iloc[3]) + assert notna(result.iloc[4]) + + if zero_min_periods_equal: + # min_periods=0 may be equivalent to min_periods=1 + result0 = get_result(obj, 20, min_periods=0) + result1 = get_result(obj, 20, min_periods=1) + tm.assert_almost_equal(result0, result1) + else: + result = get_result(obj, 50) + tm.assert_almost_equal(result.iloc[-1], static_comp(obj[10:-10])) + + # window larger than series length (#7297) + if has_min_periods: + for minp in (0, len(self.series) - 1, len(self.series)): + result = get_result(self.series, len(self.series) + 1, + min_periods=minp) + expected = get_result(self.series, len(self.series), + 
min_periods=minp) + nan_mask = isna(result) + tm.assert_series_equal(nan_mask, isna(expected)) + + nan_mask = ~nan_mask + tm.assert_almost_equal(result[nan_mask], + expected[nan_mask]) + else: + result = get_result(self.series, len(self.series) + 1) + expected = get_result(self.series, len(self.series)) + nan_mask = isna(result) + tm.assert_series_equal(nan_mask, isna(expected)) + + nan_mask = ~nan_mask + tm.assert_almost_equal(result[nan_mask], expected[nan_mask]) + + # check center=True if has_center: + if has_min_periods: + result = get_result(obj, 20, min_periods=15, center=True) + expected = get_result( + pd.concat([obj, Series([np.NaN] * 9)]), 20, + min_periods=15)[9:].reset_index(drop=True) + else: + result = get_result(obj, 20, center=True) + expected = get_result( + pd.concat([obj, Series([np.NaN] * 9)]), + 20)[9:].reset_index(drop=True) + + tm.assert_series_equal(result, expected) # shifter index s = ['x%d' % x for x in range(12)] @@ -1541,12 +1394,11 @@ def get_result(obj, window, min_periods=None, center=False): tm.assert_frame_equal(frame_xp, frame_rs) def test_ewma(self): - self._check_ew(mom.ewma, name='mean') + self._check_ew(name='mean') - arr = np.zeros(1000) - arr[5] = 1 - with catch_warnings(record=True): - result = mom.ewma(arr, span=100, adjust=False).sum() + vals = pd.Series(np.zeros(1000)) + vals[5] = 1 + result = vals.ewm(span=100, adjust=False).mean().sum() assert np.abs(result - 1) < 1e-2 s = Series([1.0, 2.0, 4.0, 8.0]) @@ -1626,55 +1478,34 @@ def simple_wma(s, w): tm.assert_series_equal(result, expected) def test_ewmvar(self): - self._check_ew(mom.ewmvar, name='var') + self._check_ew(name='var') def test_ewmvol(self): - self._check_ew(mom.ewmvol, name='vol') + self._check_ew(name='vol') def test_ewma_span_com_args(self): - with catch_warnings(record=True): - A = mom.ewma(self.arr, com=9.5) - B = mom.ewma(self.arr, span=20) - tm.assert_almost_equal(A, B) + A = self.series.ewm(com=9.5).mean() + B = self.series.ewm(span=20).mean() + tm.assert_almost_equal(A, B) - pytest.raises(ValueError, mom.ewma, self.arr, com=9.5, span=20) - pytest.raises(ValueError, mom.ewma, self.arr) + with pytest.raises(ValueError): + self.series.ewm(com=9.5, span=20) + with pytest.raises(ValueError): + self.series.ewm().mean() def test_ewma_halflife_arg(self): - with catch_warnings(record=True): - A = mom.ewma(self.arr, com=13.932726172912965) - B = mom.ewma(self.arr, halflife=10.0) - tm.assert_almost_equal(A, B) - - pytest.raises(ValueError, mom.ewma, self.arr, span=20, - halflife=50) - pytest.raises(ValueError, mom.ewma, self.arr, com=9.5, - halflife=50) - pytest.raises(ValueError, mom.ewma, self.arr, com=9.5, span=20, - halflife=50) - pytest.raises(ValueError, mom.ewma, self.arr) - - def test_ewma_alpha_old_api(self): - # GH 10789 - with catch_warnings(record=True): - a = mom.ewma(self.arr, alpha=0.61722699889169674) - b = mom.ewma(self.arr, com=0.62014947789973052) - c = mom.ewma(self.arr, span=2.240298955799461) - d = mom.ewma(self.arr, halflife=0.721792864318) - tm.assert_numpy_array_equal(a, b) - tm.assert_numpy_array_equal(a, c) - tm.assert_numpy_array_equal(a, d) - - def test_ewma_alpha_arg_old_api(self): - # GH 10789 - with catch_warnings(record=True): - pytest.raises(ValueError, mom.ewma, self.arr) - pytest.raises(ValueError, mom.ewma, self.arr, - com=10.0, alpha=0.5) - pytest.raises(ValueError, mom.ewma, self.arr, - span=10.0, alpha=0.5) - pytest.raises(ValueError, mom.ewma, self.arr, - halflife=10.0, alpha=0.5) + A = self.series.ewm(com=13.932726172912965).mean() + B = 
self.series.ewm(halflife=10.0).mean() + tm.assert_almost_equal(A, B) + + with pytest.raises(ValueError): + self.series.ewm(span=20, halflife=50) + with pytest.raises(ValueError): + self.series.ewm(com=9.5, halflife=50) + with pytest.raises(ValueError): + self.series.ewm(com=9.5, span=20, halflife=50) + with pytest.raises(ValueError): + self.series.ewm() def test_ewm_alpha(self): # GH 10789 @@ -1689,11 +1520,15 @@ def test_ewm_alpha(self): def test_ewm_alpha_arg(self): # GH 10789 - s = Series(self.arr) - pytest.raises(ValueError, s.ewm) - pytest.raises(ValueError, s.ewm, com=10.0, alpha=0.5) - pytest.raises(ValueError, s.ewm, span=10.0, alpha=0.5) - pytest.raises(ValueError, s.ewm, halflife=10.0, alpha=0.5) + s = self.series + with pytest.raises(ValueError): + s.ewm() + with pytest.raises(ValueError): + s.ewm(com=10.0, alpha=0.5) + with pytest.raises(ValueError): + s.ewm(span=10.0, alpha=0.5) + with pytest.raises(ValueError): + s.ewm(halflife=10.0, alpha=0.5) def test_ewm_domain_checks(self): # GH 12492 @@ -1719,24 +1554,25 @@ def test_ewm_domain_checks(self): s.ewm(alpha=1.0) pytest.raises(ValueError, s.ewm, alpha=1.1) - def test_ew_empty_arrays(self): - arr = np.array([], dtype=np.float64) + def test_ew_empty_series(self): + vals = pd.Series([], dtype=np.float64) - funcs = [mom.ewma, mom.ewmvol, mom.ewmvar] + ewm = vals.ewm(3) + funcs = ['mean', 'vol', 'var'] for f in funcs: - with catch_warnings(record=True): - result = f(arr, 3) - tm.assert_almost_equal(result, arr) + result = getattr(ewm, f)() + tm.assert_almost_equal(result, vals) - def _check_ew(self, func, name=None): - with catch_warnings(record=True): - self._check_ew_ndarray(func, name=name) - self._check_ew_structures(func, name=name) + def _check_ew(self, name=None, preserve_nan=False): + series_result = getattr(self.series.ewm(com=10), name)() + assert isinstance(series_result, Series) + + frame_result = getattr(self.frame.ewm(com=10), name)() + assert type(frame_result) == DataFrame - def _check_ew_ndarray(self, func, preserve_nan=False, name=None): - result = func(self.arr, com=10) + result = getattr(self.series.ewm(com=10), name)() if preserve_nan: - assert (np.isnan(result[self._nan_locs]).all()) + assert result[self._nan_locs].isna().all() # excluding NaNs correctly arr = randn(50) @@ -1746,45 +1582,40 @@ def _check_ew_ndarray(self, func, preserve_nan=False, name=None): # check min_periods # GH 7898 - result = func(s, 50, min_periods=2) - assert np.isnan(result.values[:11]).all() - assert not np.isnan(result.values[11:]).any() + result = getattr(s.ewm(com=50, min_periods=2), name)() + assert result[:11].isna().all() + assert not result[11:].isna().any() for min_periods in (0, 1): - result = func(s, 50, min_periods=min_periods) - if func == mom.ewma: - assert np.isnan(result.values[:10]).all() - assert not np.isnan(result.values[10:]).any() + result = getattr(s.ewm(com=50, min_periods=min_periods), name)() + if name == 'mean': + assert result[:10].isna().all() + assert not result[10:].isna().any() else: - # ewmstd, ewmvol, ewmvar (with bias=False) require at least two - # values - assert np.isnan(result.values[:11]).all() - assert not np.isnan(result.values[11:]).any() + # ewm.std, ewm.vol, ewm.var (with bias=False) require at least + # two values + assert result[:11].isna().all() + assert not result[11:].isna().any() # check series of length 0 - result = func(Series([]), 50, min_periods=min_periods) - tm.assert_series_equal(result, Series([])) + result = getattr(Series().ewm(com=50, min_periods=min_periods), + name)() + 
tm.assert_series_equal(result, Series()) # check series of length 1 - result = func(Series([1.]), 50, min_periods=min_periods) - if func == mom.ewma: + result = getattr(Series([1.]).ewm(50, min_periods=min_periods), + name)() + if name == 'mean': tm.assert_series_equal(result, Series([1.])) else: - # ewmstd, ewmvol, ewmvar with bias=False require at least two - # values + # ewm.std, ewm.vol, ewm.var with bias=False require at least + # two values tm.assert_series_equal(result, Series([np.NaN])) # pass in ints - result2 = func(np.arange(50), span=10) + result2 = getattr(Series(np.arange(50)).ewm(span=10), name)() assert result2.dtype == np.float_ - def _check_ew_structures(self, func, name): - series_result = getattr(self.series.ewm(com=10), name)() - assert isinstance(series_result, Series) - - frame_result = getattr(self.frame.ewm(com=10), name)() - assert type(frame_result) == DataFrame - class TestPairwise(object): @@ -2021,9 +1852,6 @@ class TestMomentsConsistency(Base): # lambda v: Series(v).skew(), 3, 'skew'), # (lambda v: Series(v).kurt(), 4, 'kurt'), - # (lambda x, min_periods: mom.expanding_quantile(x, 0.3, - # min_periods=min_periods, 'quantile'), - # restore once GH 8084 is fixed # lambda v: Series(v).quantile(0.3), None, 'quantile'), @@ -2585,22 +2413,6 @@ def func(A, B, com, **kwargs): pytest.raises(Exception, func, A, randn(50), 20, min_periods=5) - def test_expanding_apply(self): - ser = Series([]) - tm.assert_series_equal(ser, ser.expanding().apply(lambda x: x.mean())) - - def expanding_mean(x, min_periods=1): - return mom.expanding_apply(x, lambda x: x.mean(), - min_periods=min_periods) - - self._check_expanding(expanding_mean, np.mean) - - # GH 8080 - s = Series([None, None, None]) - result = s.expanding(min_periods=0).apply(lambda x: len(x)) - expected = Series([1., 2., 3.]) - tm.assert_series_equal(result, expected) - def test_expanding_apply_args_kwargs(self): def mean_w_arg(x, const): return np.mean(x) + const @@ -2648,9 +2460,6 @@ def test_expanding_cov(self): tm.assert_almost_equal(rolling_result, result) - def test_expanding_max(self): - self._check_expanding(mom.expanding_max, np.max, preserve_nan=False) - def test_expanding_cov_pairwise(self): result = self.frame.expanding().corr() @@ -2980,55 +2789,73 @@ def test_rolling_kurt_eq_value_fperr(self): a = Series([1.1] * 15).rolling(window=10).kurt() assert np.isnan(a).all() - def _check_expanding_ndarray(self, func, static_comp, has_min_periods=True, - has_time_rule=True, preserve_nan=True): - result = func(self.arr) + @pytest.mark.parametrize('func,static_comp', [('sum', np.sum), + ('mean', np.mean), + ('max', np.max), + ('min', np.min)], + ids=['sum', 'mean', 'max', 'min']) + def test_expanding_func(self, func, static_comp): + def expanding_func(x, min_periods=1, center=False, axis=0): + exp = x.expanding(min_periods=min_periods, + center=center, axis=axis) + return getattr(exp, func)() + self._check_expanding(expanding_func, static_comp, preserve_nan=False) + + def test_expanding_apply(self): + + def expanding_mean(x, min_periods=1): + exp = x.expanding(min_periods=min_periods) + return exp.apply(lambda x: x.mean()) + + self._check_expanding(expanding_mean, np.mean) + + ser = Series([]) + tm.assert_series_equal(ser, ser.expanding().apply(lambda x: x.mean())) - tm.assert_almost_equal(result[10], static_comp(self.arr[:11])) + # GH 8080 + s = Series([None, None, None]) + result = s.expanding(min_periods=0).apply(lambda x: len(x)) + expected = Series([1., 2., 3.]) + tm.assert_series_equal(result, expected) + + def 
_check_expanding(self, func, static_comp, has_min_periods=True, + has_time_rule=True, preserve_nan=True): + + series_result = func(self.series) + assert isinstance(series_result, Series) + frame_result = func(self.frame) + assert isinstance(frame_result, DataFrame) + + result = func(self.series) + tm.assert_almost_equal(result[10], static_comp(self.series[:11])) if preserve_nan: - assert (np.isnan(result[self._nan_locs]).all()) + assert result.iloc[self._nan_locs].isna().all() - arr = randn(50) + ser = Series(randn(50)) if has_min_periods: - result = func(arr, min_periods=30) - assert (np.isnan(result[:29]).all()) - tm.assert_almost_equal(result[-1], static_comp(arr[:50])) + result = func(ser, min_periods=30) + assert result[:29].isna().all() + tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) # min_periods is working correctly - result = func(arr, min_periods=15) - assert np.isnan(result[13]) - assert not np.isnan(result[14]) + result = func(ser, min_periods=15) + assert isna(result.iloc[13]) + assert notna(result.iloc[14]) - arr2 = randn(20) - result = func(arr2, min_periods=5) + ser2 = Series(randn(20)) + result = func(ser2, min_periods=5) assert isna(result[3]) assert notna(result[4]) # min_periods=0 - result0 = func(arr, min_periods=0) - result1 = func(arr, min_periods=1) + result0 = func(ser, min_periods=0) + result1 = func(ser, min_periods=1) tm.assert_almost_equal(result0, result1) else: - result = func(arr) - tm.assert_almost_equal(result[-1], static_comp(arr[:50])) - - def _check_expanding_structures(self, func): - series_result = func(self.series) - assert isinstance(series_result, Series) - frame_result = func(self.frame) - assert type(frame_result) == DataFrame - - def _check_expanding(self, func, static_comp, has_min_periods=True, - has_time_rule=True, preserve_nan=True): - with warnings.catch_warnings(record=True): - self._check_expanding_ndarray(func, static_comp, - has_min_periods=has_min_periods, - has_time_rule=has_time_rule, - preserve_nan=preserve_nan) - with warnings.catch_warnings(record=True): - self._check_expanding_structures(func) + result = func(ser) + tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) def test_rolling_max_gh6297(self): """Replicate result expected in GH #6297""" From 63d96f5bf95af973ed8aa5abaaacc180a47baf2f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 1 Feb 2018 03:23:03 -0800 Subject: [PATCH 018/214] Organize, Split, Parametrize timezones/timestamps tests (#19473) --- pandas/tests/scalar/test_timestamp.py | 570 +----------------- pandas/tests/scalar/timestamp/__init__.py | 0 .../tests/scalar/timestamp/test_arithmetic.py | 76 +++ .../scalar/timestamp/test_comparisons.py | 194 ++++++ .../tests/scalar/timestamp/test_rendering.py | 96 +++ .../tests/scalar/timestamp/test_timezones.py | 87 +++ .../tests/scalar/timestamp/test_unary_ops.py | 217 +++++++ pandas/tests/tseries/test_timezones.py | 96 +-- 8 files changed, 710 insertions(+), 626 deletions(-) create mode 100644 pandas/tests/scalar/timestamp/__init__.py create mode 100644 pandas/tests/scalar/timestamp/test_arithmetic.py create mode 100644 pandas/tests/scalar/timestamp/test_comparisons.py create mode 100644 pandas/tests/scalar/timestamp/test_rendering.py create mode 100644 pandas/tests/scalar/timestamp/test_timezones.py create mode 100644 pandas/tests/scalar/timestamp/test_unary_ops.py diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py index 2b72eef2c6712..301f6da140866 100644 --- a/pandas/tests/scalar/test_timestamp.py 
+++ b/pandas/tests/scalar/test_timestamp.py @@ -1,18 +1,14 @@ """ test the scalar Timestamp """ -import sys import pytz import pytest import dateutil -import operator import calendar import numpy as np from dateutil.tz import tzutc from pytz import timezone, utc from datetime import datetime, timedelta -from distutils.version import LooseVersion -from pytz.exceptions import AmbiguousTimeError, NonExistentTimeError import pandas.util.testing as tm import pandas.util._test_decorators as td @@ -21,78 +17,10 @@ from pandas._libs.tslibs import conversion from pandas._libs.tslibs.timezones import get_timezone, dateutil_gettz as gettz -from pandas._libs.tslibs.frequencies import _INVALID_FREQ_ERROR from pandas.compat import long, PY3 from pandas.compat.numpy import np_datetime64_compat -from pandas import Timestamp, Period, Timedelta, NaT - - -class TestTimestampArithmetic(object): - def test_overflow_offset(self): - # xref https://github.com/statsmodels/statsmodels/issues/3374 - # ends up multiplying really large numbers which overflow - - stamp = Timestamp('2017-01-13 00:00:00', freq='D') - offset = 20169940 * offsets.Day(1) - - with pytest.raises(OverflowError): - stamp + offset - - with pytest.raises(OverflowError): - offset + stamp - - with pytest.raises(OverflowError): - stamp - offset - - def test_delta_preserve_nanos(self): - val = Timestamp(long(1337299200000000123)) - result = val + timedelta(1) - assert result.nanosecond == val.nanosecond - - def test_timestamp_sub_datetime(self): - dt = datetime(2013, 10, 12) - ts = Timestamp(datetime(2013, 10, 13)) - assert (ts - dt).days == 1 - assert (dt - ts).days == -1 - - def test_addition_subtraction_types(self): - # Assert on the types resulting from Timestamp +/- various date/time - # objects - dt = datetime(2014, 3, 4) - td = timedelta(seconds=1) - # build a timestamp with a frequency, since then it supports - # addition/subtraction of integers - ts = Timestamp(dt, freq='D') - - assert type(ts + 1) == Timestamp - assert type(ts - 1) == Timestamp - - # Timestamp + datetime not supported, though subtraction is supported - # and yields timedelta more tests in tseries/base/tests/test_base.py - assert type(ts - dt) == Timedelta - assert type(ts + td) == Timestamp - assert type(ts - td) == Timestamp - - # Timestamp +/- datetime64 not supported, so not tested (could possibly - # assert error raised?) 
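# ---------------------------------------------------------------------------
# [Editorial aside -- illustrative sketch, not part of the patch being applied]
# The test being relocated above encodes the result-type rules for Timestamp
# arithmetic; shown standalone for clarity (plain pandas API, no test harness):
from datetime import datetime, timedelta
import numpy as np
from pandas import Timestamp, Timedelta

ts = Timestamp(datetime(2014, 3, 4))
assert isinstance(ts - datetime(2014, 3, 3), Timedelta)    # Timestamp - datetime -> Timedelta
assert isinstance(ts + timedelta(seconds=1), Timestamp)    # Timestamp +/- timedelta -> Timestamp
assert isinstance(ts - np.timedelta64(1, 'D'), Timestamp)  # np.timedelta64 behaves like timedelta
# ---------------------------------------------------------------------------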
- td64 = np.timedelta64(1, 'D') - assert type(ts + td64) == Timestamp - assert type(ts - td64) == Timestamp - - def test_addition_subtraction_preserve_frequency(self): - ts = Timestamp('2014-03-05', freq='D') - td = timedelta(days=1) - original_freq = ts.freq - - assert (ts + 1).freq == original_freq - assert (ts - 1).freq == original_freq - assert (ts + td).freq == original_freq - assert (ts - td).freq == original_freq - - td64 = np.timedelta64(1, 'D') - assert (ts + td64).freq == original_freq - assert (ts - td64).freq == original_freq +from pandas import Timestamp, Period, Timedelta class TestTimestampProperties(object): @@ -508,168 +436,8 @@ def test_max_valid(self): # Ensure that Timestamp.max is a valid Timestamp Timestamp(Timestamp.max) - -class TestTimestamp(object): - @pytest.mark.parametrize('freq', ['D', 'M', 'S', 'N']) - @pytest.mark.parametrize('date', ['2014-03-07', '2014-01-01 09:00', - '2014-01-01 00:00:00.000000001']) - def test_repr(self, date, freq): - # dateutil zone change (only matters for repr) - if LooseVersion(dateutil.__version__) >= LooseVersion('2.6.0'): - timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', - 'dateutil/US/Pacific'] - else: - timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', - 'dateutil/America/Los_Angeles'] - - for tz in timezones: - - # avoid to match with timezone name - freq_repr = "'{0}'".format(freq) - if tz.startswith('dateutil'): - tz_repr = tz.replace('dateutil', '') - else: - tz_repr = tz - - date_only = Timestamp(date) - assert date in repr(date_only) - assert tz_repr not in repr(date_only) - assert freq_repr not in repr(date_only) - assert date_only == eval(repr(date_only)) - - date_tz = Timestamp(date, tz=tz) - assert date in repr(date_tz) - assert tz_repr in repr(date_tz) - assert freq_repr not in repr(date_tz) - assert date_tz == eval(repr(date_tz)) - - date_freq = Timestamp(date, freq=freq) - assert date in repr(date_freq) - assert tz_repr not in repr(date_freq) - assert freq_repr in repr(date_freq) - assert date_freq == eval(repr(date_freq)) - - date_tz_freq = Timestamp(date, tz=tz, freq=freq) - assert date in repr(date_tz_freq) - assert tz_repr in repr(date_tz_freq) - assert freq_repr in repr(date_tz_freq) - assert date_tz_freq == eval(repr(date_tz_freq)) - - def test_repr_utcoffset(self): - # This can cause the tz field to be populated, but it's redundant to - # include this information in the date-string. 
- date_with_utc_offset = Timestamp('2014-03-13 00:00:00-0400', tz=None) - assert '2014-03-13 00:00:00-0400' in repr(date_with_utc_offset) - assert 'tzoffset' not in repr(date_with_utc_offset) - assert 'pytz.FixedOffset(-240)' in repr(date_with_utc_offset) - expr = repr(date_with_utc_offset).replace("'pytz.FixedOffset(-240)'", - 'pytz.FixedOffset(-240)') - assert date_with_utc_offset == eval(expr) - - def test_timestamp_repr_pre1900(self): - # pre-1900 - stamp = Timestamp('1850-01-01', tz='US/Eastern') - repr(stamp) - - iso8601 = '1850-01-01 01:23:45.012345' - stamp = Timestamp(iso8601, tz='US/Eastern') - result = repr(stamp) - assert iso8601 in result - - def test_tz(self): - t = '2014-02-01 09:00' - ts = Timestamp(t) - local = ts.tz_localize('Asia/Tokyo') - assert local.hour == 9 - assert local == Timestamp(t, tz='Asia/Tokyo') - conv = local.tz_convert('US/Eastern') - assert conv == Timestamp('2014-01-31 19:00', tz='US/Eastern') - assert conv.hour == 19 - - # preserves nanosecond - ts = Timestamp(t) + offsets.Nano(5) - local = ts.tz_localize('Asia/Tokyo') - assert local.hour == 9 - assert local.nanosecond == 5 - conv = local.tz_convert('US/Eastern') - assert conv.nanosecond == 5 - assert conv.hour == 19 - - def test_tz_localize_ambiguous(self): - - ts = Timestamp('2014-11-02 01:00') - ts_dst = ts.tz_localize('US/Eastern', ambiguous=True) - ts_no_dst = ts.tz_localize('US/Eastern', ambiguous=False) - - assert (ts_no_dst.value - ts_dst.value) / 1e9 == 3600 - with pytest.raises(ValueError): - ts.tz_localize('US/Eastern', ambiguous='infer') - - # GH 8025 - with tm.assert_raises_regex(TypeError, - 'Cannot localize tz-aware Timestamp, ' - 'use tz_convert for conversions'): - Timestamp('2011-01-01', tz='US/Eastern').tz_localize('Asia/Tokyo') - - with tm.assert_raises_regex(TypeError, - 'Cannot convert tz-naive Timestamp, ' - 'use tz_localize to localize'): - Timestamp('2011-01-01').tz_convert('Asia/Tokyo') - - def test_tz_localize_nonexistent(self): - # see gh-13057 - times = ['2015-03-08 02:00', '2015-03-08 02:30', - '2015-03-29 02:00', '2015-03-29 02:30'] - timezones = ['US/Eastern', 'US/Pacific', - 'Europe/Paris', 'Europe/Belgrade'] - for t, tz in zip(times, timezones): - ts = Timestamp(t) - pytest.raises(NonExistentTimeError, ts.tz_localize, - tz) - pytest.raises(NonExistentTimeError, ts.tz_localize, - tz, errors='raise') - assert ts.tz_localize(tz, errors='coerce') is NaT - - def test_tz_localize_errors_ambiguous(self): - # see gh-13057 - ts = Timestamp('2015-11-1 01:00') - pytest.raises(AmbiguousTimeError, - ts.tz_localize, 'US/Pacific', errors='coerce') - - @pytest.mark.parametrize('tz', ['UTC', 'Asia/Tokyo', - 'US/Eastern', 'dateutil/US/Pacific']) - def test_tz_localize_roundtrip(self, tz): - for t in ['2014-02-01 09:00', '2014-07-08 09:00', - '2014-11-01 17:00', '2014-11-05 00:00']: - ts = Timestamp(t) - localized = ts.tz_localize(tz) - assert localized == Timestamp(t, tz=tz) - - with pytest.raises(TypeError): - localized.tz_localize(tz) - - reset = localized.tz_localize(None) - assert reset == ts - assert reset.tzinfo is None - - @pytest.mark.parametrize('tz', ['UTC', 'Asia/Tokyo', - 'US/Eastern', 'dateutil/US/Pacific']) - def test_tz_convert_roundtrip(self, tz): - for t in ['2014-02-01 09:00', '2014-07-08 09:00', - '2014-11-01 17:00', '2014-11-05 00:00']: - ts = Timestamp(t, tz='UTC') - converted = ts.tz_convert(tz) - - reset = converted.tz_convert(None) - assert reset == Timestamp(t) - assert reset.tzinfo is None - assert reset == converted.tz_convert('UTC').tz_localize(None) - - def 
test_utc_z_designator(self): - assert get_timezone(Timestamp('2014-11-02 01:00Z').tzinfo) == 'UTC' - def test_now(self): - # #9000 + # GH#9000 ts_from_string = Timestamp('now') ts_from_method = Timestamp.now() ts_datetime = datetime.now() @@ -687,7 +455,6 @@ def test_now(self): ts_from_method_tz.tz_localize(None)) < delta) def test_today(self): - ts_from_string = Timestamp('today') ts_from_method = Timestamp.today() ts_datetime = datetime.today() @@ -704,6 +471,31 @@ def test_today(self): assert (abs(ts_from_string_tz.tz_localize(None) - ts_from_method_tz.tz_localize(None)) < delta) + +class TestTimestamp(object): + + def test_tz(self): + tstr = '2014-02-01 09:00' + ts = Timestamp(tstr) + local = ts.tz_localize('Asia/Tokyo') + assert local.hour == 9 + assert local == Timestamp(tstr, tz='Asia/Tokyo') + conv = local.tz_convert('US/Eastern') + assert conv == Timestamp('2014-01-31 19:00', tz='US/Eastern') + assert conv.hour == 19 + + # preserves nanosecond + ts = Timestamp(tstr) + offsets.Nano(5) + local = ts.tz_localize('Asia/Tokyo') + assert local.hour == 9 + assert local.nanosecond == 5 + conv = local.tz_convert('US/Eastern') + assert conv.nanosecond == 5 + assert conv.hour == 19 + + def test_utc_z_designator(self): + assert get_timezone(Timestamp('2014-11-02 01:00Z').tzinfo) == 'UTC' + def test_asm8(self): np.random.seed(7960929) ns = [Timestamp.min.value, Timestamp.max.value, 1000] @@ -715,110 +507,6 @@ def test_asm8(self): assert (Timestamp('nat').asm8.view('i8') == np.datetime64('nat', 'ns').view('i8')) - def test_pprint(self): - # GH12622 - import pprint - nested_obj = {'foo': 1, - 'bar': [{'w': {'a': Timestamp('2011-01-01')}}] * 10} - result = pprint.pformat(nested_obj, width=50) - expected = r"""{'bar': [{'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}], - 'foo': 1}""" - assert result == expected - - def test_round(self): - - # round - dt = Timestamp('20130101 09:10:11') - result = dt.round('D') - expected = Timestamp('20130101') - assert result == expected - - dt = Timestamp('20130101 19:10:11') - result = dt.round('D') - expected = Timestamp('20130102') - assert result == expected - - dt = Timestamp('20130201 12:00:00') - result = dt.round('D') - expected = Timestamp('20130202') - assert result == expected - - dt = Timestamp('20130104 12:00:00') - result = dt.round('D') - expected = Timestamp('20130105') - assert result == expected - - dt = Timestamp('20130104 12:32:00') - result = dt.round('30Min') - expected = Timestamp('20130104 12:30:00') - assert result == expected - - # floor - dt = Timestamp('20130101 09:10:11') - result = dt.floor('D') - expected = Timestamp('20130101') - assert result == expected - - # ceil - dt = Timestamp('20130101 09:10:11') - result = dt.ceil('D') - expected = Timestamp('20130102') - assert result == expected - - # round with tz - dt = Timestamp('20130101 09:10:11', tz='US/Eastern') - result = dt.round('D') - expected = Timestamp('20130101', tz='US/Eastern') - assert result == expected - - dt = Timestamp('20130101 09:10:11', tz='US/Eastern') - result = dt.round('s') - assert result == dt - - # GH 14440 & 15578 - 
result = Timestamp('2016-10-17 12:00:00.0015').round('ms') - expected = Timestamp('2016-10-17 12:00:00.002000') - assert result == expected - - result = Timestamp('2016-10-17 12:00:00.00149').round('ms') - expected = Timestamp('2016-10-17 12:00:00.001000') - assert result == expected - - ts = Timestamp('2016-10-17 12:00:00.0015') - for freq in ['us', 'ns']: - assert ts == ts.round(freq) - - result = Timestamp('2016-10-17 12:00:00.001501031').round('10ns') - expected = Timestamp('2016-10-17 12:00:00.001501030') - assert result == expected - - with tm.assert_produces_warning(): - Timestamp('2016-10-17 12:00:00.001501031').round('1010ns') - - def test_round_misc(self): - stamp = Timestamp('2000-01-05 05:09:15.13') - - def _check_round(freq, expected): - result = stamp.round(freq=freq) - assert result == expected - - for freq, expected in [('D', Timestamp('2000-01-05 00:00:00')), - ('H', Timestamp('2000-01-05 05:00:00')), - ('S', Timestamp('2000-01-05 05:09:15'))]: - _check_round(freq, expected) - - with tm.assert_raises_regex(ValueError, _INVALID_FREQ_ERROR): - stamp.round('foo') - def test_class_ops_pytz(self): def compare(x, y): assert (int(Timestamp(x).value / 1e9) == @@ -960,210 +648,6 @@ def test_hash_equivalent(self): stamp = Timestamp(datetime(2011, 1, 1)) assert d[stamp] == 5 - @td.skip_if_windows - def test_timestamp(self): - # GH#17329 - # tz-naive --> treat it as if it were UTC for purposes of timestamp() - ts = Timestamp.now() - uts = ts.replace(tzinfo=utc) - assert ts.timestamp() == uts.timestamp() - - tsc = Timestamp('2014-10-11 11:00:01.12345678', tz='US/Central') - utsc = tsc.tz_convert('UTC') - - # utsc is a different representation of the same time - assert tsc.timestamp() == utsc.timestamp() - - if PY3: - - # datetime.timestamp() converts in the local timezone - with tm.set_timezone('UTC'): - - # should agree with datetime.timestamp method - dt = ts.to_pydatetime() - assert dt.timestamp() == ts.timestamp() - - -class TestTimestampComparison(object): - def test_comparison_object_array(self): - # GH#15183 - ts = Timestamp('2011-01-03 00:00:00-0500', tz='US/Eastern') - other = Timestamp('2011-01-01 00:00:00-0500', tz='US/Eastern') - naive = Timestamp('2011-01-01 00:00:00') - - arr = np.array([other, ts], dtype=object) - res = arr == ts - expected = np.array([False, True], dtype=bool) - assert (res == expected).all() - - # 2D case - arr = np.array([[other, ts], - [ts, other]], - dtype=object) - res = arr != ts - expected = np.array([[True, False], [False, True]], dtype=bool) - assert res.shape == expected.shape - assert (res == expected).all() - - # tzaware mismatch - arr = np.array([naive], dtype=object) - with pytest.raises(TypeError): - arr < ts - - def test_comparison(self): - # 5-18-2012 00:00:00.000 - stamp = long(1337299200000000000) - - val = Timestamp(stamp) - - assert val == val - assert not val != val - assert not val < val - assert val <= val - assert not val > val - assert val >= val - - other = datetime(2012, 5, 18) - assert val == other - assert not val != other - assert not val < other - assert val <= other - assert not val > other - assert val >= other - - other = Timestamp(stamp + 100) - - assert val != other - assert val != other - assert val < other - assert val <= other - assert other > val - assert other >= val - - def test_compare_invalid(self): - # GH 8058 - val = Timestamp('20130101 12:01:02') - assert not val == 'foo' - assert not val == 10.0 - assert not val == 1 - assert not val == long(1) - assert not val == [] - assert not val == {'foo': 1} - 
assert not val == np.float64(1) - assert not val == np.int64(1) - - assert val != 'foo' - assert val != 10.0 - assert val != 1 - assert val != long(1) - assert val != [] - assert val != {'foo': 1} - assert val != np.float64(1) - assert val != np.int64(1) - - def test_cant_compare_tz_naive_w_aware(self): - # see gh-1404 - a = Timestamp('3/12/2012') - b = Timestamp('3/12/2012', tz='utc') - - pytest.raises(Exception, a.__eq__, b) - pytest.raises(Exception, a.__ne__, b) - pytest.raises(Exception, a.__lt__, b) - pytest.raises(Exception, a.__gt__, b) - pytest.raises(Exception, b.__eq__, a) - pytest.raises(Exception, b.__ne__, a) - pytest.raises(Exception, b.__lt__, a) - pytest.raises(Exception, b.__gt__, a) - - if sys.version_info < (3, 3): - pytest.raises(Exception, a.__eq__, b.to_pydatetime()) - pytest.raises(Exception, a.to_pydatetime().__eq__, b) - else: - assert not a == b.to_pydatetime() - assert not a.to_pydatetime() == b - - def test_cant_compare_tz_naive_w_aware_explicit_pytz(self): - # see gh-1404 - a = Timestamp('3/12/2012') - b = Timestamp('3/12/2012', tz=utc) - - pytest.raises(Exception, a.__eq__, b) - pytest.raises(Exception, a.__ne__, b) - pytest.raises(Exception, a.__lt__, b) - pytest.raises(Exception, a.__gt__, b) - pytest.raises(Exception, b.__eq__, a) - pytest.raises(Exception, b.__ne__, a) - pytest.raises(Exception, b.__lt__, a) - pytest.raises(Exception, b.__gt__, a) - - if sys.version_info < (3, 3): - pytest.raises(Exception, a.__eq__, b.to_pydatetime()) - pytest.raises(Exception, a.to_pydatetime().__eq__, b) - else: - assert not a == b.to_pydatetime() - assert not a.to_pydatetime() == b - - def test_cant_compare_tz_naive_w_aware_dateutil(self): - # see gh-1404 - a = Timestamp('3/12/2012') - b = Timestamp('3/12/2012', tz=tzutc()) - - pytest.raises(Exception, a.__eq__, b) - pytest.raises(Exception, a.__ne__, b) - pytest.raises(Exception, a.__lt__, b) - pytest.raises(Exception, a.__gt__, b) - pytest.raises(Exception, b.__eq__, a) - pytest.raises(Exception, b.__ne__, a) - pytest.raises(Exception, b.__lt__, a) - pytest.raises(Exception, b.__gt__, a) - - if sys.version_info < (3, 3): - pytest.raises(Exception, a.__eq__, b.to_pydatetime()) - pytest.raises(Exception, a.to_pydatetime().__eq__, b) - else: - assert not a == b.to_pydatetime() - assert not a.to_pydatetime() == b - - def test_timestamp_compare_scalars(self): - # case where ndim == 0 - lhs = np.datetime64(datetime(2013, 12, 6)) - rhs = Timestamp('now') - nat = Timestamp('nat') - - ops = {'gt': 'lt', - 'lt': 'gt', - 'ge': 'le', - 'le': 'ge', - 'eq': 'eq', - 'ne': 'ne'} - - for left, right in ops.items(): - left_f = getattr(operator, left) - right_f = getattr(operator, right) - expected = left_f(lhs, rhs) - - result = right_f(rhs, lhs) - assert result == expected - - expected = left_f(rhs, nat) - result = right_f(nat, rhs) - assert result == expected - - def test_timestamp_compare_with_early_datetime(self): - # e.g. 
datetime.min - stamp = Timestamp('2012-01-01') - - assert not stamp == datetime.min - assert not stamp == datetime(1600, 1, 1) - assert not stamp == datetime(2700, 1, 1) - assert stamp != datetime.min - assert stamp != datetime(1600, 1, 1) - assert stamp != datetime(2700, 1, 1) - assert stamp > datetime(1600, 1, 1) - assert stamp >= datetime(1600, 1, 1) - assert stamp < datetime(2700, 1, 1) - assert stamp <= datetime(2700, 1, 1) - class TestTimestampNsOperations(object): diff --git a/pandas/tests/scalar/timestamp/__init__.py b/pandas/tests/scalar/timestamp/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py new file mode 100644 index 0000000000000..8f4809c93e28b --- /dev/null +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- +from datetime import datetime, timedelta + +import pytest +import numpy as np + +from pandas.compat import long +from pandas.tseries import offsets +from pandas import Timestamp, Timedelta + + +class TestTimestampArithmetic(object): + def test_overflow_offset(self): + # xref https://github.com/statsmodels/statsmodels/issues/3374 + # ends up multiplying really large numbers which overflow + + stamp = Timestamp('2017-01-13 00:00:00', freq='D') + offset = 20169940 * offsets.Day(1) + + with pytest.raises(OverflowError): + stamp + offset + + with pytest.raises(OverflowError): + offset + stamp + + with pytest.raises(OverflowError): + stamp - offset + + def test_delta_preserve_nanos(self): + val = Timestamp(long(1337299200000000123)) + result = val + timedelta(1) + assert result.nanosecond == val.nanosecond + + def test_timestamp_sub_datetime(self): + dt = datetime(2013, 10, 12) + ts = Timestamp(datetime(2013, 10, 13)) + assert (ts - dt).days == 1 + assert (dt - ts).days == -1 + + def test_addition_subtraction_types(self): + # Assert on the types resulting from Timestamp +/- various date/time + # objects + dt = datetime(2014, 3, 4) + td = timedelta(seconds=1) + # build a timestamp with a frequency, since then it supports + # addition/subtraction of integers + ts = Timestamp(dt, freq='D') + + assert type(ts + 1) == Timestamp + assert type(ts - 1) == Timestamp + + # Timestamp + datetime not supported, though subtraction is supported + # and yields timedelta more tests in tseries/base/tests/test_base.py + assert type(ts - dt) == Timedelta + assert type(ts + td) == Timestamp + assert type(ts - td) == Timestamp + + # Timestamp +/- datetime64 not supported, so not tested (could possibly + # assert error raised?) 
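# ---------------------------------------------------------------------------
# [Editorial aside -- illustrative sketch, not part of the patch being applied]
# The relocated test a few lines below also covers frequency preservation: a
# Timestamp constructed with a freq (API of this era) keeps it through
# integer and timedelta arithmetic.
from datetime import timedelta
from pandas import Timestamp

ts = Timestamp('2014-03-05', freq='D')
assert (ts + 1).freq == ts.freq                  # integer step of one 'D' unit
assert (ts - timedelta(days=1)).freq == ts.freq  # timedelta arithmetic keeps freq too
# ---------------------------------------------------------------------------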
+ td64 = np.timedelta64(1, 'D') + assert type(ts + td64) == Timestamp + assert type(ts - td64) == Timestamp + + def test_addition_subtraction_preserve_frequency(self): + ts = Timestamp('2014-03-05', freq='D') + td = timedelta(days=1) + original_freq = ts.freq + + assert (ts + 1).freq == original_freq + assert (ts - 1).freq == original_freq + assert (ts + td).freq == original_freq + assert (ts - td).freq == original_freq + + td64 = np.timedelta64(1, 'D') + assert (ts + td64).freq == original_freq + assert (ts - td64).freq == original_freq diff --git a/pandas/tests/scalar/timestamp/test_comparisons.py b/pandas/tests/scalar/timestamp/test_comparisons.py new file mode 100644 index 0000000000000..72d87be619917 --- /dev/null +++ b/pandas/tests/scalar/timestamp/test_comparisons.py @@ -0,0 +1,194 @@ +# -*- coding: utf-8 -*- +import sys +from datetime import datetime +import operator + +import pytest +import numpy as np + +from dateutil.tz import tzutc +from pytz import utc + +from pandas.compat import long +from pandas import Timestamp + + +class TestTimestampComparison(object): + def test_comparison_object_array(self): + # GH#15183 + ts = Timestamp('2011-01-03 00:00:00-0500', tz='US/Eastern') + other = Timestamp('2011-01-01 00:00:00-0500', tz='US/Eastern') + naive = Timestamp('2011-01-01 00:00:00') + + arr = np.array([other, ts], dtype=object) + res = arr == ts + expected = np.array([False, True], dtype=bool) + assert (res == expected).all() + + # 2D case + arr = np.array([[other, ts], + [ts, other]], + dtype=object) + res = arr != ts + expected = np.array([[True, False], [False, True]], dtype=bool) + assert res.shape == expected.shape + assert (res == expected).all() + + # tzaware mismatch + arr = np.array([naive], dtype=object) + with pytest.raises(TypeError): + arr < ts + + def test_comparison(self): + # 5-18-2012 00:00:00.000 + stamp = long(1337299200000000000) + + val = Timestamp(stamp) + + assert val == val + assert not val != val + assert not val < val + assert val <= val + assert not val > val + assert val >= val + + other = datetime(2012, 5, 18) + assert val == other + assert not val != other + assert not val < other + assert val <= other + assert not val > other + assert val >= other + + other = Timestamp(stamp + 100) + + assert val != other + assert val != other + assert val < other + assert val <= other + assert other > val + assert other >= val + + def test_compare_invalid(self): + # GH 8058 + val = Timestamp('20130101 12:01:02') + assert not val == 'foo' + assert not val == 10.0 + assert not val == 1 + assert not val == long(1) + assert not val == [] + assert not val == {'foo': 1} + assert not val == np.float64(1) + assert not val == np.int64(1) + + assert val != 'foo' + assert val != 10.0 + assert val != 1 + assert val != long(1) + assert val != [] + assert val != {'foo': 1} + assert val != np.float64(1) + assert val != np.int64(1) + + def test_cant_compare_tz_naive_w_aware(self): + # see gh-1404 + a = Timestamp('3/12/2012') + b = Timestamp('3/12/2012', tz='utc') + + pytest.raises(Exception, a.__eq__, b) + pytest.raises(Exception, a.__ne__, b) + pytest.raises(Exception, a.__lt__, b) + pytest.raises(Exception, a.__gt__, b) + pytest.raises(Exception, b.__eq__, a) + pytest.raises(Exception, b.__ne__, a) + pytest.raises(Exception, b.__lt__, a) + pytest.raises(Exception, b.__gt__, a) + + if sys.version_info < (3, 3): + pytest.raises(Exception, a.__eq__, b.to_pydatetime()) + pytest.raises(Exception, a.to_pydatetime().__eq__, b) + else: + assert not a == b.to_pydatetime() + assert not 
a.to_pydatetime() == b + + def test_cant_compare_tz_naive_w_aware_explicit_pytz(self): + # see gh-1404 + a = Timestamp('3/12/2012') + b = Timestamp('3/12/2012', tz=utc) + + pytest.raises(Exception, a.__eq__, b) + pytest.raises(Exception, a.__ne__, b) + pytest.raises(Exception, a.__lt__, b) + pytest.raises(Exception, a.__gt__, b) + pytest.raises(Exception, b.__eq__, a) + pytest.raises(Exception, b.__ne__, a) + pytest.raises(Exception, b.__lt__, a) + pytest.raises(Exception, b.__gt__, a) + + if sys.version_info < (3, 3): + pytest.raises(Exception, a.__eq__, b.to_pydatetime()) + pytest.raises(Exception, a.to_pydatetime().__eq__, b) + else: + assert not a == b.to_pydatetime() + assert not a.to_pydatetime() == b + + def test_cant_compare_tz_naive_w_aware_dateutil(self): + # see gh-1404 + a = Timestamp('3/12/2012') + b = Timestamp('3/12/2012', tz=tzutc()) + + pytest.raises(Exception, a.__eq__, b) + pytest.raises(Exception, a.__ne__, b) + pytest.raises(Exception, a.__lt__, b) + pytest.raises(Exception, a.__gt__, b) + pytest.raises(Exception, b.__eq__, a) + pytest.raises(Exception, b.__ne__, a) + pytest.raises(Exception, b.__lt__, a) + pytest.raises(Exception, b.__gt__, a) + + if sys.version_info < (3, 3): + pytest.raises(Exception, a.__eq__, b.to_pydatetime()) + pytest.raises(Exception, a.to_pydatetime().__eq__, b) + else: + assert not a == b.to_pydatetime() + assert not a.to_pydatetime() == b + + def test_timestamp_compare_scalars(self): + # case where ndim == 0 + lhs = np.datetime64(datetime(2013, 12, 6)) + rhs = Timestamp('now') + nat = Timestamp('nat') + + ops = {'gt': 'lt', + 'lt': 'gt', + 'ge': 'le', + 'le': 'ge', + 'eq': 'eq', + 'ne': 'ne'} + + for left, right in ops.items(): + left_f = getattr(operator, left) + right_f = getattr(operator, right) + expected = left_f(lhs, rhs) + + result = right_f(rhs, lhs) + assert result == expected + + expected = left_f(rhs, nat) + result = right_f(nat, rhs) + assert result == expected + + def test_timestamp_compare_with_early_datetime(self): + # e.g. 
datetime.min + stamp = Timestamp('2012-01-01') + + assert not stamp == datetime.min + assert not stamp == datetime(1600, 1, 1) + assert not stamp == datetime(2700, 1, 1) + assert stamp != datetime.min + assert stamp != datetime(1600, 1, 1) + assert stamp != datetime(2700, 1, 1) + assert stamp > datetime(1600, 1, 1) + assert stamp >= datetime(1600, 1, 1) + assert stamp < datetime(2700, 1, 1) + assert stamp <= datetime(2700, 1, 1) diff --git a/pandas/tests/scalar/timestamp/test_rendering.py b/pandas/tests/scalar/timestamp/test_rendering.py new file mode 100644 index 0000000000000..c404b60567daf --- /dev/null +++ b/pandas/tests/scalar/timestamp/test_rendering.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- + +import pytest +import dateutil +import pytz # noqa # a test below uses pytz but only inside a `eval` call + +import pprint +from distutils.version import LooseVersion + +from pandas import Timestamp + + +class TestTimestampRendering(object): + + # dateutil zone change (only matters for repr) + if LooseVersion(dateutil.__version__) >= LooseVersion('2.6.0'): + timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', + 'dateutil/US/Pacific'] + else: + timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', + 'dateutil/America/Los_Angeles'] + + @pytest.mark.parametrize('tz', timezones) + @pytest.mark.parametrize('freq', ['D', 'M', 'S', 'N']) + @pytest.mark.parametrize('date', ['2014-03-07', '2014-01-01 09:00', + '2014-01-01 00:00:00.000000001']) + def test_repr(self, date, freq, tz): + # avoid to match with timezone name + freq_repr = "'{0}'".format(freq) + if tz.startswith('dateutil'): + tz_repr = tz.replace('dateutil', '') + else: + tz_repr = tz + + date_only = Timestamp(date) + assert date in repr(date_only) + assert tz_repr not in repr(date_only) + assert freq_repr not in repr(date_only) + assert date_only == eval(repr(date_only)) + + date_tz = Timestamp(date, tz=tz) + assert date in repr(date_tz) + assert tz_repr in repr(date_tz) + assert freq_repr not in repr(date_tz) + assert date_tz == eval(repr(date_tz)) + + date_freq = Timestamp(date, freq=freq) + assert date in repr(date_freq) + assert tz_repr not in repr(date_freq) + assert freq_repr in repr(date_freq) + assert date_freq == eval(repr(date_freq)) + + date_tz_freq = Timestamp(date, tz=tz, freq=freq) + assert date in repr(date_tz_freq) + assert tz_repr in repr(date_tz_freq) + assert freq_repr in repr(date_tz_freq) + assert date_tz_freq == eval(repr(date_tz_freq)) + + def test_repr_utcoffset(self): + # This can cause the tz field to be populated, but it's redundant to + # include this information in the date-string. 
+ date_with_utc_offset = Timestamp('2014-03-13 00:00:00-0400', tz=None) + assert '2014-03-13 00:00:00-0400' in repr(date_with_utc_offset) + assert 'tzoffset' not in repr(date_with_utc_offset) + assert 'pytz.FixedOffset(-240)' in repr(date_with_utc_offset) + expr = repr(date_with_utc_offset).replace("'pytz.FixedOffset(-240)'", + 'pytz.FixedOffset(-240)') + assert date_with_utc_offset == eval(expr) + + def test_timestamp_repr_pre1900(self): + # pre-1900 + stamp = Timestamp('1850-01-01', tz='US/Eastern') + repr(stamp) + + iso8601 = '1850-01-01 01:23:45.012345' + stamp = Timestamp(iso8601, tz='US/Eastern') + result = repr(stamp) + assert iso8601 in result + + def test_pprint(self): + # GH#12622 + nested_obj = {'foo': 1, + 'bar': [{'w': {'a': Timestamp('2011-01-01')}}] * 10} + result = pprint.pformat(nested_obj, width=50) + expected = r"""{'bar': [{'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}], + 'foo': 1}""" + assert result == expected diff --git a/pandas/tests/scalar/timestamp/test_timezones.py b/pandas/tests/scalar/timestamp/test_timezones.py new file mode 100644 index 0000000000000..eeec70cc234f5 --- /dev/null +++ b/pandas/tests/scalar/timestamp/test_timezones.py @@ -0,0 +1,87 @@ +# -*- coding: utf-8 -*- +""" +Tests for Timestamp timezone-related methods +""" + +import pytest +from pytz.exceptions import AmbiguousTimeError, NonExistentTimeError + +import pandas.util.testing as tm +from pandas import Timestamp, NaT + + +class TestTimestampTZOperations(object): + # -------------------------------------------------------------- + # Timestamp.tz_localize + + def test_tz_localize_ambiguous(self): + ts = Timestamp('2014-11-02 01:00') + ts_dst = ts.tz_localize('US/Eastern', ambiguous=True) + ts_no_dst = ts.tz_localize('US/Eastern', ambiguous=False) + + assert (ts_no_dst.value - ts_dst.value) / 1e9 == 3600 + with pytest.raises(ValueError): + ts.tz_localize('US/Eastern', ambiguous='infer') + + # GH#8025 + with tm.assert_raises_regex(TypeError, + 'Cannot localize tz-aware Timestamp, ' + 'use tz_convert for conversions'): + Timestamp('2011-01-01', tz='US/Eastern').tz_localize('Asia/Tokyo') + + with tm.assert_raises_regex(TypeError, + 'Cannot convert tz-naive Timestamp, ' + 'use tz_localize to localize'): + Timestamp('2011-01-01').tz_convert('Asia/Tokyo') + + @pytest.mark.parametrize('stamp, tz', [ + ('2015-03-08 02:00', 'US/Eastern'), + ('2015-03-08 02:30', 'US/Pacific'), + ('2015-03-29 02:00', 'Europe/Paris'), + ('2015-03-29 02:30', 'Europe/Belgrade')]) + def test_tz_localize_nonexistent(self, stamp, tz): + # GH#13057 + ts = Timestamp(stamp) + with pytest.raises(NonExistentTimeError): + ts.tz_localize(tz) + with pytest.raises(NonExistentTimeError): + ts.tz_localize(tz, errors='raise') + assert ts.tz_localize(tz, errors='coerce') is NaT + + def test_tz_localize_errors_ambiguous(self): + # GH#13057 + ts = Timestamp('2015-11-1 01:00') + with pytest.raises(AmbiguousTimeError): + ts.tz_localize('US/Pacific', errors='coerce') + + @pytest.mark.parametrize('tz', ['UTC', 'Asia/Tokyo', + 'US/Eastern', 'dateutil/US/Pacific']) + @pytest.mark.parametrize('stamp', 
['2014-02-01 09:00', '2014-07-08 09:00', + '2014-11-01 17:00', '2014-11-05 00:00']) + def test_tz_localize_roundtrip(self, stamp, tz): + ts = Timestamp(stamp) + localized = ts.tz_localize(tz) + assert localized == Timestamp(stamp, tz=tz) + + with pytest.raises(TypeError): + localized.tz_localize(tz) + + reset = localized.tz_localize(None) + assert reset == ts + assert reset.tzinfo is None + + # ------------------------------------------------------------------ + # Timestamp.tz_convert + + @pytest.mark.parametrize('tz', ['UTC', 'Asia/Tokyo', + 'US/Eastern', 'dateutil/US/Pacific']) + @pytest.mark.parametrize('stamp', ['2014-02-01 09:00', '2014-07-08 09:00', + '2014-11-01 17:00', '2014-11-05 00:00']) + def test_tz_convert_roundtrip(self, stamp, tz): + ts = Timestamp(stamp, tz='UTC') + converted = ts.tz_convert(tz) + + reset = converted.tz_convert(None) + assert reset == Timestamp(stamp) + assert reset.tzinfo is None + assert reset == converted.tz_convert('UTC').tz_localize(None) diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py new file mode 100644 index 0000000000000..70c7308dd3991 --- /dev/null +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -0,0 +1,217 @@ +# -*- coding: utf-8 -*- +from datetime import datetime + +import pytest +import pytz +from pytz import utc + +import pandas.util.testing as tm +import pandas.util._test_decorators as td + +from pandas.compat import PY3 +from pandas._libs.tslibs.frequencies import _INVALID_FREQ_ERROR +from pandas import Timestamp + + +class TestTimestampUnaryOps(object): + + # -------------------------------------------------------------- + # Timestamp.round + + def test_round_day_naive(self): + dt = Timestamp('20130101 09:10:11') + result = dt.round('D') + expected = Timestamp('20130101') + assert result == expected + + dt = Timestamp('20130101 19:10:11') + result = dt.round('D') + expected = Timestamp('20130102') + assert result == expected + + dt = Timestamp('20130201 12:00:00') + result = dt.round('D') + expected = Timestamp('20130202') + assert result == expected + + dt = Timestamp('20130104 12:00:00') + result = dt.round('D') + expected = Timestamp('20130105') + assert result == expected + + def test_round_tzaware(self): + dt = Timestamp('20130101 09:10:11', tz='US/Eastern') + result = dt.round('D') + expected = Timestamp('20130101', tz='US/Eastern') + assert result == expected + + dt = Timestamp('20130101 09:10:11', tz='US/Eastern') + result = dt.round('s') + assert result == dt + + def test_round_30min(self): + # round + dt = Timestamp('20130104 12:32:00') + result = dt.round('30Min') + expected = Timestamp('20130104 12:30:00') + assert result == expected + + def test_round_subsecond(self): + # GH#14440 & GH#15578 + result = Timestamp('2016-10-17 12:00:00.0015').round('ms') + expected = Timestamp('2016-10-17 12:00:00.002000') + assert result == expected + + result = Timestamp('2016-10-17 12:00:00.00149').round('ms') + expected = Timestamp('2016-10-17 12:00:00.001000') + assert result == expected + + ts = Timestamp('2016-10-17 12:00:00.0015') + for freq in ['us', 'ns']: + assert ts == ts.round(freq) + + result = Timestamp('2016-10-17 12:00:00.001501031').round('10ns') + expected = Timestamp('2016-10-17 12:00:00.001501030') + assert result == expected + + def test_round_nonstandard_freq(self): + with tm.assert_produces_warning(): + Timestamp('2016-10-17 12:00:00.001501031').round('1010ns') + + def test_round_invalid_arg(self): + stamp = Timestamp('2000-01-05 05:09:15.13') + with 
tm.assert_raises_regex(ValueError, _INVALID_FREQ_ERROR): + stamp.round('foo') + + @pytest.mark.parametrize('freq, expected', [ + ('D', Timestamp('2000-01-05 00:00:00')), + ('H', Timestamp('2000-01-05 05:00:00')), + ('S', Timestamp('2000-01-05 05:09:15'))]) + def test_round_frequencies(self, freq, expected): + stamp = Timestamp('2000-01-05 05:09:15.13') + + result = stamp.round(freq=freq) + assert result == expected + + def test_ceil(self): + dt = Timestamp('20130101 09:10:11') + result = dt.ceil('D') + expected = Timestamp('20130102') + assert result == expected + + def test_floor(self): + dt = Timestamp('20130101 09:10:11') + result = dt.floor('D') + expected = Timestamp('20130101') + assert result == expected + + # -------------------------------------------------------------- + # Timestamp.replace + timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific'] + + def test_replace_naive(self): + # GH#14621, GH#7825 + ts = Timestamp('2016-01-01 09:00:00') + result = ts.replace(hour=0) + expected = Timestamp('2016-01-01 00:00:00') + assert result == expected + + @pytest.mark.parametrize('tz', timezones) + def test_replace_aware(self, tz): + # GH#14621, GH#7825 + # replacing datetime components with and w/o presence of a timezone + ts = Timestamp('2016-01-01 09:00:00', tz=tz) + result = ts.replace(hour=0) + expected = Timestamp('2016-01-01 00:00:00', tz=tz) + assert result == expected + + @pytest.mark.parametrize('tz', timezones) + def test_replace_preserves_nanos(self, tz): + # GH#14621, GH#7825 + ts = Timestamp('2016-01-01 09:00:00.000000123', tz=tz) + result = ts.replace(hour=0) + expected = Timestamp('2016-01-01 00:00:00.000000123', tz=tz) + assert result == expected + + @pytest.mark.parametrize('tz', timezones) + def test_replace_multiple(self, tz): + # GH#14621, GH#7825 + # replacing datetime components with and w/o presence of a timezone + # test all + ts = Timestamp('2016-01-01 09:00:00.000000123', tz=tz) + result = ts.replace(year=2015, month=2, day=2, hour=0, minute=5, + second=5, microsecond=5, nanosecond=5) + expected = Timestamp('2015-02-02 00:05:05.000005005', tz=tz) + assert result == expected + + @pytest.mark.parametrize('tz', timezones) + def test_replace_invalid_kwarg(self, tz): + # GH#14621, GH#7825 + ts = Timestamp('2016-01-01 09:00:00.000000123', tz=tz) + with pytest.raises(TypeError): + ts.replace(foo=5) + + @pytest.mark.parametrize('tz', timezones) + def test_replace_integer_args(self, tz): + # GH#14621, GH#7825 + ts = Timestamp('2016-01-01 09:00:00.000000123', tz=tz) + with pytest.raises(ValueError): + ts.replace(hour=0.1) + + def test_replace_tzinfo_equiv_tz_localize_none(self): + # GH#14621, GH#7825 + # assert conversion to naive is the same as replacing tzinfo with None + ts = Timestamp('2013-11-03 01:59:59.999999-0400', tz='US/Eastern') + assert ts.tz_localize(None) == ts.replace(tzinfo=None) + + @td.skip_if_windows + def test_replace_tzinfo(self): + # GH#15683 + dt = datetime(2016, 3, 27, 1) + tzinfo = pytz.timezone('CET').localize(dt, is_dst=False).tzinfo + + result_dt = dt.replace(tzinfo=tzinfo) + result_pd = Timestamp(dt).replace(tzinfo=tzinfo) + + if PY3: + # datetime.timestamp() converts in the local timezone + with tm.set_timezone('UTC'): + assert result_dt.timestamp() == result_pd.timestamp() + + assert result_dt == result_pd + assert result_dt == result_pd.to_pydatetime() + + result_dt = dt.replace(tzinfo=tzinfo).replace(tzinfo=None) + result_pd = Timestamp(dt).replace(tzinfo=tzinfo).replace(tzinfo=None) + + if PY3: + # datetime.timestamp() 
converts in the local timezone + with tm.set_timezone('UTC'): + assert result_dt.timestamp() == result_pd.timestamp() + + assert result_dt == result_pd + assert result_dt == result_pd.to_pydatetime() + + # -------------------------------------------------------------- + + @td.skip_if_windows + def test_timestamp(self): + # GH#17329 + # tz-naive --> treat it as if it were UTC for purposes of timestamp() + ts = Timestamp.now() + uts = ts.replace(tzinfo=utc) + assert ts.timestamp() == uts.timestamp() + + tsc = Timestamp('2014-10-11 11:00:01.12345678', tz='US/Central') + utsc = tsc.tz_convert('UTC') + + # utsc is a different representation of the same time + assert tsc.timestamp() == utsc.timestamp() + + if PY3: + # datetime.timestamp() converts in the local timezone + with tm.set_timezone('UTC'): + + # should agree with datetime.timestamp method + dt = ts.to_pydatetime() + assert dt.timestamp() == ts.timestamp() diff --git a/pandas/tests/tseries/test_timezones.py b/pandas/tests/tseries/test_timezones.py index 7ae63d7d080cc..cc5f4d30f9aaf 100644 --- a/pandas/tests/tseries/test_timezones.py +++ b/pandas/tests/tseries/test_timezones.py @@ -14,7 +14,7 @@ import pandas.util.testing as tm import pandas.util._test_decorators as td import pandas.tseries.offsets as offsets -from pandas.compat import lrange, zip, PY3 +from pandas.compat import lrange, zip from pandas.core.indexes.datetimes import bdate_range, date_range from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas._libs import tslib @@ -1198,65 +1198,23 @@ def test_tz_convert_tzlocal(self): class TestTimeZoneCacheKey(object): - def test_cache_keys_are_distinct_for_pytz_vs_dateutil(self): - tzs = pytz.common_timezones - for tz_name in tzs: - if tz_name == 'UTC': - # skip utc as it's a special case in dateutil - continue - tz_p = timezones.maybe_get_tz(tz_name) - tz_d = timezones.maybe_get_tz('dateutil/' + tz_name) - if tz_d is None: - # skip timezones that dateutil doesn't know about. - continue - assert (timezones._p_tz_cache_key(tz_p) != - timezones._p_tz_cache_key(tz_d)) + @pytest.mark.parametrize('tz_name', list(pytz.common_timezones)) + def test_cache_keys_are_distinct_for_pytz_vs_dateutil(self, tz_name): + if tz_name == 'UTC': + # skip utc as it's a special case in dateutil + return + tz_p = timezones.maybe_get_tz(tz_name) + tz_d = timezones.maybe_get_tz('dateutil/' + tz_name) + if tz_d is None: + # skip timezones that dateutil doesn't know about. 
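# ---------------------------------------------------------------------------
# [Editorial aside -- illustrative sketch, not part of the patch being applied]
# Why the cache keys must differ: the same zone name resolves to two distinct
# tzinfo implementations depending on the 'dateutil/' prefix, so caching by
# name alone would conflate them. Uses pandas' private timezones helper
# module (import path assumed from this era of the codebase).
from pandas._libs.tslibs import timezones

tz_p = timezones.maybe_get_tz('US/Eastern')           # pytz-backed tzinfo
tz_d = timezones.maybe_get_tz('dateutil/US/Eastern')  # dateutil-backed tzinfo
assert timezones._p_tz_cache_key(tz_p) != timezones._p_tz_cache_key(tz_d)
# ---------------------------------------------------------------------------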
+ return + assert (timezones._p_tz_cache_key(tz_p) != + timezones._p_tz_cache_key(tz_d)) class TestTimeZones(object): timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific'] - def test_replace(self): - # GH 14621 - # GH 7825 - # replacing datetime components with and w/o presence of a timezone - dt = Timestamp('2016-01-01 09:00:00') - result = dt.replace(hour=0) - expected = Timestamp('2016-01-01 00:00:00') - assert result == expected - - for tz in self.timezones: - dt = Timestamp('2016-01-01 09:00:00', tz=tz) - result = dt.replace(hour=0) - expected = Timestamp('2016-01-01 00:00:00', tz=tz) - assert result == expected - - # we preserve nanoseconds - dt = Timestamp('2016-01-01 09:00:00.000000123', tz=tz) - result = dt.replace(hour=0) - expected = Timestamp('2016-01-01 00:00:00.000000123', tz=tz) - assert result == expected - - # test all - dt = Timestamp('2016-01-01 09:00:00.000000123', tz=tz) - result = dt.replace(year=2015, month=2, day=2, hour=0, minute=5, - second=5, microsecond=5, nanosecond=5) - expected = Timestamp('2015-02-02 00:05:05.000005005', tz=tz) - assert result == expected - - # error - def f(): - dt.replace(foo=5) - pytest.raises(TypeError, f) - - def f(): - dt.replace(hour=0.1) - pytest.raises(ValueError, f) - - # assert conversion to naive is the same as replacing tzinfo with None - dt = Timestamp('2013-11-03 01:59:59.999999-0400', tz='US/Eastern') - assert dt.tz_localize(None) == dt.replace(tzinfo=None) - def test_ambiguous_compat(self): # validate that pytz and dateutil are compat for dst # when the transition happens @@ -1298,34 +1256,6 @@ def test_ambiguous_compat(self): assert (result_pytz.to_pydatetime().tzname() == result_dateutil.to_pydatetime().tzname()) - @td.skip_if_windows - def test_replace_tzinfo(self): - # GH 15683 - dt = datetime(2016, 3, 27, 1) - tzinfo = pytz.timezone('CET').localize(dt, is_dst=False).tzinfo - - result_dt = dt.replace(tzinfo=tzinfo) - result_pd = Timestamp(dt).replace(tzinfo=tzinfo) - - if PY3: - # datetime.timestamp() converts in the local timezone - with tm.set_timezone('UTC'): - assert result_dt.timestamp() == result_pd.timestamp() - - assert result_dt == result_pd - assert result_dt == result_pd.to_pydatetime() - - result_dt = dt.replace(tzinfo=tzinfo).replace(tzinfo=None) - result_pd = Timestamp(dt).replace(tzinfo=tzinfo).replace(tzinfo=None) - - if PY3: - # datetime.timestamp() converts in the local timezone - with tm.set_timezone('UTC'): - assert result_dt.timestamp() == result_pd.timestamp() - - assert result_dt == result_pd - assert result_dt == result_pd.to_pydatetime() - def test_index_equals_with_tz(self): left = date_range('1/1/2011', periods=100, freq='H', tz='utc') right = date_range('1/1/2011', periods=100, freq='H', tz='US/Eastern') From d7fa5b372c20394d5e7e7b39bdf669f0ddb2c9f3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 1 Feb 2018 03:26:45 -0800 Subject: [PATCH 019/214] implement test_scalar_compat (#19479) --- .../indexes/datetimes/test_arithmetic.py | 60 +++++- .../tests/indexes/datetimes/test_datetime.py | 39 +--- .../indexes/datetimes/test_datetimelike.py | 32 +-- pandas/tests/indexes/datetimes/test_misc.py | 78 +------- pandas/tests/indexes/datetimes/test_ops.py | 117 ----------- .../indexes/datetimes/test_scalar_compat.py | 188 ++++++++++++++++++ pandas/tests/indexes/datetimes/test_setops.py | 32 +++ 7 files changed, 284 insertions(+), 262 deletions(-) create mode 100644 pandas/tests/indexes/datetimes/test_scalar_compat.py diff --git a/pandas/tests/indexes/datetimes/test_arithmetic.py 
b/pandas/tests/indexes/datetimes/test_arithmetic.py index 480f025db17ca..671071b5e4945 100644 --- a/pandas/tests/indexes/datetimes/test_arithmetic.py +++ b/pandas/tests/indexes/datetimes/test_arithmetic.py @@ -10,7 +10,7 @@ import pandas as pd from pandas.compat.numpy import np_datetime64_compat import pandas.util.testing as tm -from pandas.errors import PerformanceWarning +from pandas.errors import PerformanceWarning, NullFrequencyError from pandas import (Timestamp, Timedelta, Series, DatetimeIndex, TimedeltaIndex, date_range) @@ -274,6 +274,64 @@ def test_dti_isub_int(self, tz, one): rng -= one tm.assert_index_equal(rng, expected) + # ------------------------------------------------------------- + # DatetimeIndex.shift is used in integer addition + + def test_dti_shift_tzaware(self, tz): + # GH#9903 + idx = pd.DatetimeIndex([], name='xxx', tz=tz) + tm.assert_index_equal(idx.shift(0, freq='H'), idx) + tm.assert_index_equal(idx.shift(3, freq='H'), idx) + + idx = pd.DatetimeIndex(['2011-01-01 10:00', '2011-01-01 11:00' + '2011-01-01 12:00'], name='xxx', tz=tz) + tm.assert_index_equal(idx.shift(0, freq='H'), idx) + exp = pd.DatetimeIndex(['2011-01-01 13:00', '2011-01-01 14:00' + '2011-01-01 15:00'], name='xxx', tz=tz) + tm.assert_index_equal(idx.shift(3, freq='H'), exp) + exp = pd.DatetimeIndex(['2011-01-01 07:00', '2011-01-01 08:00' + '2011-01-01 09:00'], name='xxx', tz=tz) + tm.assert_index_equal(idx.shift(-3, freq='H'), exp) + + def test_dti_shift_freqs(self): + # test shift for DatetimeIndex and non DatetimeIndex + # GH#8083 + drange = pd.date_range('20130101', periods=5) + result = drange.shift(1) + expected = pd.DatetimeIndex(['2013-01-02', '2013-01-03', '2013-01-04', + '2013-01-05', + '2013-01-06'], freq='D') + tm.assert_index_equal(result, expected) + + result = drange.shift(-1) + expected = pd.DatetimeIndex(['2012-12-31', '2013-01-01', '2013-01-02', + '2013-01-03', '2013-01-04'], + freq='D') + tm.assert_index_equal(result, expected) + + result = drange.shift(3, freq='2D') + expected = pd.DatetimeIndex(['2013-01-07', '2013-01-08', '2013-01-09', + '2013-01-10', + '2013-01-11'], freq='D') + tm.assert_index_equal(result, expected) + + def test_dti_shift_int(self): + rng = date_range('1/1/2000', periods=20) + + result = rng + 5 + expected = rng.shift(5) + tm.assert_index_equal(result, expected) + + result = rng - 5 + expected = rng.shift(-5) + tm.assert_index_equal(result, expected) + + def test_dti_shift_no_freq(self): + # GH#19147 + dti = pd.DatetimeIndex(['2011-01-01 10:00', '2011-01-01'], freq=None) + with pytest.raises(NullFrequencyError): + dti.shift(2) + # ------------------------------------------------------------- # Binary operations DatetimeIndex and timedelta-like diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 49f94bfa65543..a75ace2933b71 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -2,7 +2,7 @@ import pytest import numpy as np -from datetime import date, timedelta, time, datetime +from datetime import date, timedelta, time import dateutil import pandas as pd @@ -16,31 +16,6 @@ randn = np.random.randn -class TestDatetimeIndexLikeTimestamp(object): - # Tests for DatetimeIndex behaving like a vectorized Timestamp - - def test_dti_date_out_of_range(self): - # see gh-1475 - pytest.raises(ValueError, DatetimeIndex, ['1400-01-01']) - pytest.raises(ValueError, DatetimeIndex, [datetime(1400, 1, 1)]) - - def test_timestamp_fields(self): - # 
extra fields from DatetimeIndex like quarter and week - idx = tm.makeDateIndex(100) - - fields = ['dayofweek', 'dayofyear', 'week', 'weekofyear', 'quarter', - 'days_in_month', 'is_month_start', 'is_month_end', - 'is_quarter_start', 'is_quarter_end', 'is_year_start', - 'is_year_end', 'weekday_name'] - for f in fields: - expected = getattr(idx, f)[-1] - result = getattr(Timestamp(idx[-1]), f) - assert result == expected - - assert idx.freq == Timestamp(idx[-1], idx.freq).freq - assert idx.freqstr == Timestamp(idx[-1], idx.freq).freqstr - - class TestDatetimeIndex(object): def test_get_loc(self): @@ -371,18 +346,6 @@ def test_isin(self): assert_almost_equal(index.isin([index[2], 5]), np.array([False, False, True, False])) - def test_time(self): - rng = pd.date_range('1/1/2000', freq='12min', periods=10) - result = pd.Index(rng).time - expected = [t.time() for t in rng] - assert (result == expected).all() - - def test_date(self): - rng = pd.date_range('1/1/2000', freq='12H', periods=10) - result = pd.Index(rng).date - expected = [t.date() for t in rng] - assert (result == expected).all() - def test_does_not_convert_mixed_integer(self): df = tm.makeCustomDataframe(10, 10, data_gen_f=lambda *args, **kwargs: randn(), diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py index 9d6d27ecb4b6f..c6b3a77773dc7 100644 --- a/pandas/tests/indexes/datetimes/test_datetimelike.py +++ b/pandas/tests/indexes/datetimes/test_datetimelike.py @@ -1,9 +1,7 @@ """ generic tests from the Datetimelike class """ -import numpy as np -import pandas as pd from pandas.util import testing as tm -from pandas import Series, Index, DatetimeIndex, date_range +from pandas import DatetimeIndex, date_range from ..datetimelike import DatetimeLike @@ -27,31 +25,7 @@ def test_pickle_compat_construction(self): pass def test_intersection(self): - first = self.index - second = self.index[5:] - intersect = first.intersection(second) - assert tm.equalContents(intersect, second) - - # GH 10149 - cases = [klass(second.values) for klass in [np.array, Series, list]] - for case in cases: - result = first.intersection(case) - assert tm.equalContents(result, second) - - third = Index(['a', 'b', 'c']) - result = first.intersection(third) - expected = pd.Index([], dtype=object) - tm.assert_index_equal(result, expected) + pass # handled in test_setops def test_union(self): - first = self.index[:5] - second = self.index[5:] - everything = self.index - union = first.union(second) - assert tm.equalContents(union, everything) - - # GH 10149 - cases = [klass(second.values) for klass in [np.array, Series, list]] - for case in cases: - result = first.union(case) - assert tm.equalContents(result, everything) + pass # handled in test_setops diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 951aa2c520d0f..4a46c3b04bbad 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -4,53 +4,7 @@ import pandas as pd import pandas.util.testing as tm from pandas import (Index, DatetimeIndex, datetime, offsets, - Float64Index, date_range, Timestamp) - - -class TestDateTimeIndexToJulianDate(object): - - def test_1700(self): - r1 = Float64Index([2345897.5, 2345898.5, 2345899.5, 2345900.5, - 2345901.5]) - r2 = date_range(start=Timestamp('1710-10-01'), periods=5, - freq='D').to_julian_date() - assert isinstance(r2, Float64Index) - tm.assert_index_equal(r1, r2) - - def test_2000(self): - r1 = 
Float64Index([2451601.5, 2451602.5, 2451603.5, 2451604.5, - 2451605.5]) - r2 = date_range(start=Timestamp('2000-02-27'), periods=5, - freq='D').to_julian_date() - assert isinstance(r2, Float64Index) - tm.assert_index_equal(r1, r2) - - def test_hour(self): - r1 = Float64Index( - [2451601.5, 2451601.5416666666666666, 2451601.5833333333333333, - 2451601.625, 2451601.6666666666666666]) - r2 = date_range(start=Timestamp('2000-02-27'), periods=5, - freq='H').to_julian_date() - assert isinstance(r2, Float64Index) - tm.assert_index_equal(r1, r2) - - def test_minute(self): - r1 = Float64Index( - [2451601.5, 2451601.5006944444444444, 2451601.5013888888888888, - 2451601.5020833333333333, 2451601.5027777777777777]) - r2 = date_range(start=Timestamp('2000-02-27'), periods=5, - freq='T').to_julian_date() - assert isinstance(r2, Float64Index) - tm.assert_index_equal(r1, r2) - - def test_second(self): - r1 = Float64Index( - [2451601.5, 2451601.500011574074074, 2451601.5000231481481481, - 2451601.5000347222222222, 2451601.5000462962962962]) - r2 = date_range(start=Timestamp('2000-02-27'), periods=5, - freq='S').to_julian_date() - assert isinstance(r2, Float64Index) - tm.assert_index_equal(r1, r2) + date_range, Timestamp) class TestTimeSeries(object): @@ -129,17 +83,6 @@ def test_range_edges(self): '1970-01-03', '1970-01-04']) tm.assert_index_equal(idx, exp) - def test_datetimeindex_integers_shift(self): - rng = date_range('1/1/2000', periods=20) - - result = rng + 5 - expected = rng.shift(5) - tm.assert_index_equal(result, expected) - - result = rng - 5 - expected = rng.shift(-5) - tm.assert_index_equal(result, expected) - def test_datetimeindex_repr_short(self): dr = date_range(start='1/1/2012', periods=1) repr(dr) @@ -150,25 +93,6 @@ def test_datetimeindex_repr_short(self): dr = date_range(start='1/1/2012', periods=3) repr(dr) - def test_normalize(self): - rng = date_range('1/1/2000 9:30', periods=10, freq='D') - - result = rng.normalize() - expected = date_range('1/1/2000', periods=10, freq='D') - tm.assert_index_equal(result, expected) - - rng_ns = pd.DatetimeIndex(np.array([1380585623454345752, - 1380585612343234312]).astype( - "datetime64[ns]")) - rng_ns_normalized = rng_ns.normalize() - expected = pd.DatetimeIndex(np.array([1380585600000000000, - 1380585600000000000]).astype( - "datetime64[ns]")) - tm.assert_index_equal(rng_ns_normalized, expected) - - assert result.is_normalized - assert not rng.is_normalized - class TestDatetime64(object): diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index fb8dd1a43aa7f..4f386eb28cc0f 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -7,7 +7,6 @@ from itertools import product import pandas as pd -from pandas.errors import NullFrequencyError import pandas._libs.tslib as tslib from pandas._libs.tslibs.offsets import shift_months import pandas.util.testing as tm @@ -144,76 +143,6 @@ def test_numpy_minmax(self): tm.assert_raises_regex( ValueError, errmsg, np.argmax, dr, out=0) - def test_round_daily(self): - dti = pd.date_range('20130101 09:10:11', periods=5) - result = dti.round('D') - expected = pd.date_range('20130101', periods=5) - tm.assert_index_equal(result, expected) - - dti = dti.tz_localize('UTC').tz_convert('US/Eastern') - result = dti.round('D') - expected = pd.date_range('20130101', - periods=5).tz_localize('US/Eastern') - tm.assert_index_equal(result, expected) - - result = dti.round('s') - tm.assert_index_equal(result, dti) - - # invalid - 
for freq in ['Y', 'M', 'foobar']: - pytest.raises(ValueError, lambda: dti.round(freq)) - - def test_round(self): - for tz in self.tz: - rng = pd.date_range(start='2016-01-01', periods=5, - freq='30Min', tz=tz) - elt = rng[1] - - expected_rng = DatetimeIndex([ - Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 01:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 02:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 02:00:00', tz=tz, freq='30T'), - ]) - expected_elt = expected_rng[1] - - tm.assert_index_equal(rng.round(freq='H'), expected_rng) - assert elt.round(freq='H') == expected_elt - - msg = pd._libs.tslibs.frequencies._INVALID_FREQ_ERROR - with tm.assert_raises_regex(ValueError, msg): - rng.round(freq='foo') - with tm.assert_raises_regex(ValueError, msg): - elt.round(freq='foo') - - msg = " is a non-fixed frequency" - tm.assert_raises_regex(ValueError, msg, rng.round, freq='M') - tm.assert_raises_regex(ValueError, msg, elt.round, freq='M') - - # GH 14440 & 15578 - index = pd.DatetimeIndex(['2016-10-17 12:00:00.0015'], tz=tz) - result = index.round('ms') - expected = pd.DatetimeIndex(['2016-10-17 12:00:00.002000'], tz=tz) - tm.assert_index_equal(result, expected) - - for freq in ['us', 'ns']: - tm.assert_index_equal(index, index.round(freq)) - - index = pd.DatetimeIndex(['2016-10-17 12:00:00.00149'], tz=tz) - result = index.round('ms') - expected = pd.DatetimeIndex(['2016-10-17 12:00:00.001000'], tz=tz) - tm.assert_index_equal(result, expected) - - index = pd.DatetimeIndex(['2016-10-17 12:00:00.001501031']) - result = index.round('10ns') - expected = pd.DatetimeIndex(['2016-10-17 12:00:00.001501030']) - tm.assert_index_equal(result, expected) - - with tm.assert_produces_warning(): - ts = '2016-10-17 12:00:00.001501031' - pd.DatetimeIndex([ts]).round('1010ns') - def test_repeat_range(self): rng = date_range('1/1/2000', '1/1/2001') @@ -586,52 +515,6 @@ def test_nat_new(self): exp = np.array([tslib.iNaT] * 5, dtype=np.int64) tm.assert_numpy_array_equal(result, exp) - def test_shift_no_freq(self): - # GH#19147 - dti = pd.DatetimeIndex(['2011-01-01 10:00', '2011-01-01'], freq=None) - with pytest.raises(NullFrequencyError): - dti.shift(2) - - def test_shift(self): - # GH 9903 - for tz in self.tz: - idx = pd.DatetimeIndex([], name='xxx', tz=tz) - tm.assert_index_equal(idx.shift(0, freq='H'), idx) - tm.assert_index_equal(idx.shift(3, freq='H'), idx) - - idx = pd.DatetimeIndex(['2011-01-01 10:00', '2011-01-01 11:00' - '2011-01-01 12:00'], name='xxx', tz=tz) - tm.assert_index_equal(idx.shift(0, freq='H'), idx) - exp = pd.DatetimeIndex(['2011-01-01 13:00', '2011-01-01 14:00' - '2011-01-01 15:00'], name='xxx', tz=tz) - tm.assert_index_equal(idx.shift(3, freq='H'), exp) - exp = pd.DatetimeIndex(['2011-01-01 07:00', '2011-01-01 08:00' - '2011-01-01 09:00'], name='xxx', tz=tz) - tm.assert_index_equal(idx.shift(-3, freq='H'), exp) - - # TODO: moved from test_datetimelike; de-duplicate with test_shift above - def test_shift2(self): - # test shift for datetimeIndex and non datetimeIndex - # GH8083 - drange = pd.date_range('20130101', periods=5) - result = drange.shift(1) - expected = pd.DatetimeIndex(['2013-01-02', '2013-01-03', '2013-01-04', - '2013-01-05', - '2013-01-06'], freq='D') - tm.assert_index_equal(result, expected) - - result = drange.shift(-1) - expected = pd.DatetimeIndex(['2012-12-31', '2013-01-01', '2013-01-02', - '2013-01-03', '2013-01-04'], - freq='D') - tm.assert_index_equal(result, expected) - - 
result = drange.shift(3, freq='2D') - expected = pd.DatetimeIndex(['2013-01-07', '2013-01-08', '2013-01-09', - '2013-01-10', - '2013-01-11'], freq='D') - tm.assert_index_equal(result, expected) - def test_nat(self): assert pd.DatetimeIndex._na_value is pd.NaT assert pd.DatetimeIndex([])._na_value is pd.NaT diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py new file mode 100644 index 0000000000000..111f68ba14775 --- /dev/null +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -0,0 +1,188 @@ +# -*- coding: utf-8 -*- +""" +Tests for DatetimeIndex methods behaving like their Timestamp counterparts +""" +from datetime import datetime + +import numpy as np +import pytest + +import pandas.util.testing as tm +import pandas as pd + +from pandas import date_range, Timestamp, DatetimeIndex + + +@pytest.fixture(params=[None, 'UTC', 'Asia/Tokyo', + 'US/Eastern', 'dateutil/Asia/Singapore', + 'dateutil/US/Pacific']) +def tz(request): + return request.param + + +class TestDatetimeIndexOps(object): + def test_dti_time(self): + rng = date_range('1/1/2000', freq='12min', periods=10) + result = pd.Index(rng).time + expected = [t.time() for t in rng] + assert (result == expected).all() + + def test_dti_date(self): + rng = date_range('1/1/2000', freq='12H', periods=10) + result = pd.Index(rng).date + expected = [t.date() for t in rng] + assert (result == expected).all() + + def test_dti_date_out_of_range(self): + # GH#1475 + pytest.raises(ValueError, DatetimeIndex, ['1400-01-01']) + pytest.raises(ValueError, DatetimeIndex, [datetime(1400, 1, 1)]) + + def test_dti_timestamp_fields(self): + # extra fields from DatetimeIndex like quarter and week + idx = tm.makeDateIndex(100) + + fields = ['dayofweek', 'dayofyear', 'week', 'weekofyear', 'quarter', + 'days_in_month', 'is_month_start', 'is_month_end', + 'is_quarter_start', 'is_quarter_end', 'is_year_start', + 'is_year_end', 'weekday_name'] + for f in fields: + expected = getattr(idx, f)[-1] + result = getattr(Timestamp(idx[-1]), f) + assert result == expected + + assert idx.freq == Timestamp(idx[-1], idx.freq).freq + assert idx.freqstr == Timestamp(idx[-1], idx.freq).freqstr + + # ---------------------------------------------------------------- + # DatetimeIndex.round + + def test_round_daily(self): + dti = date_range('20130101 09:10:11', periods=5) + result = dti.round('D') + expected = date_range('20130101', periods=5) + tm.assert_index_equal(result, expected) + + dti = dti.tz_localize('UTC').tz_convert('US/Eastern') + result = dti.round('D') + expected = date_range('20130101', + periods=5).tz_localize('US/Eastern') + tm.assert_index_equal(result, expected) + + result = dti.round('s') + tm.assert_index_equal(result, dti) + + # invalid + for freq in ['Y', 'M', 'foobar']: + pytest.raises(ValueError, lambda: dti.round(freq)) + + def test_round(self, tz): + rng = date_range(start='2016-01-01', periods=5, + freq='30Min', tz=tz) + elt = rng[1] + + expected_rng = DatetimeIndex([ + Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 01:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 02:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 02:00:00', tz=tz, freq='30T'), + ]) + expected_elt = expected_rng[1] + + tm.assert_index_equal(rng.round(freq='H'), expected_rng) + assert elt.round(freq='H') == expected_elt + + msg = pd._libs.tslibs.frequencies._INVALID_FREQ_ERROR + with tm.assert_raises_regex(ValueError, 
msg): + rng.round(freq='foo') + with tm.assert_raises_regex(ValueError, msg): + elt.round(freq='foo') + + msg = " is a non-fixed frequency" + tm.assert_raises_regex(ValueError, msg, rng.round, freq='M') + tm.assert_raises_regex(ValueError, msg, elt.round, freq='M') + + # GH#14440 & GH#15578 + index = DatetimeIndex(['2016-10-17 12:00:00.0015'], tz=tz) + result = index.round('ms') + expected = DatetimeIndex(['2016-10-17 12:00:00.002000'], tz=tz) + tm.assert_index_equal(result, expected) + + for freq in ['us', 'ns']: + tm.assert_index_equal(index, index.round(freq)) + + index = DatetimeIndex(['2016-10-17 12:00:00.00149'], tz=tz) + result = index.round('ms') + expected = DatetimeIndex(['2016-10-17 12:00:00.001000'], tz=tz) + tm.assert_index_equal(result, expected) + + index = DatetimeIndex(['2016-10-17 12:00:00.001501031']) + result = index.round('10ns') + expected = DatetimeIndex(['2016-10-17 12:00:00.001501030']) + tm.assert_index_equal(result, expected) + + with tm.assert_produces_warning(): + ts = '2016-10-17 12:00:00.001501031' + DatetimeIndex([ts]).round('1010ns') + + # ---------------------------------------------------------------- + # DatetimeIndex.normalize + + def test_normalize(self): + rng = date_range('1/1/2000 9:30', periods=10, freq='D') + + result = rng.normalize() + expected = date_range('1/1/2000', periods=10, freq='D') + tm.assert_index_equal(result, expected) + + arr_ns = np.array([1380585623454345752, + 1380585612343234312]).astype("datetime64[ns]") + rng_ns = DatetimeIndex(arr_ns) + rng_ns_normalized = rng_ns.normalize() + + arr_ns = np.array([1380585600000000000, + 1380585600000000000]).astype("datetime64[ns]") + expected = DatetimeIndex(arr_ns) + tm.assert_index_equal(rng_ns_normalized, expected) + + assert result.is_normalized + assert not rng.is_normalized + + +class TestDateTimeIndexToJulianDate(object): + + def test_1700(self): + dr = date_range(start=Timestamp('1710-10-01'), periods=5, freq='D') + r1 = pd.Index([x.to_julian_date() for x in dr]) + r2 = dr.to_julian_date() + assert isinstance(r2, pd.Float64Index) + tm.assert_index_equal(r1, r2) + + def test_2000(self): + dr = date_range(start=Timestamp('2000-02-27'), periods=5, freq='D') + r1 = pd.Index([x.to_julian_date() for x in dr]) + r2 = dr.to_julian_date() + assert isinstance(r2, pd.Float64Index) + tm.assert_index_equal(r1, r2) + + def test_hour(self): + dr = date_range(start=Timestamp('2000-02-27'), periods=5, freq='H') + r1 = pd.Index([x.to_julian_date() for x in dr]) + r2 = dr.to_julian_date() + assert isinstance(r2, pd.Float64Index) + tm.assert_index_equal(r1, r2) + + def test_minute(self): + dr = date_range(start=Timestamp('2000-02-27'), periods=5, freq='T') + r1 = pd.Index([x.to_julian_date() for x in dr]) + r2 = dr.to_julian_date() + assert isinstance(r2, pd.Float64Index) + tm.assert_index_equal(r1, r2) + + def test_second(self): + dr = date_range(start=Timestamp('2000-02-27'), periods=5, freq='S') + r1 = pd.Index([x.to_julian_date() for x in dr]) + r2 = dr.to_julian_date() + assert isinstance(r2, pd.Float64Index) + tm.assert_index_equal(r1, r2) diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index b74da4922429d..84632e59e2bfb 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -17,6 +17,20 @@ class TestDatetimeIndexSetOps(object): tz = [None, 'UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/Asia/Singapore', 'dateutil/US/Pacific'] + # TODO: moved from test_datetimelike; dedup with 
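
The new test_scalar_compat.py module above collects checks that vectorised DatetimeIndex methods stay in step with their Timestamp counterparts. A small, assumed illustration of the equivalences under test (not part of the commit):

    import pandas as pd

    rng = pd.date_range('2016-01-01', periods=5, freq='30Min')
    rng.round('H')             # vectorised; element-wise equal to Timestamp.round
    rng[1].round('H')          # the scalar counterpart of the line above

    rng = pd.date_range('2000-01-01 09:30', periods=3, freq='D')
    rng.normalize()            # all times set to midnight, mirroring Timestamp.normalize
    rng.to_julian_date()       # Float64Index of per-element Timestamp.to_julian_date
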
version below + def test_union2(self): + everything = tm.makeDateIndex(10) + first = everything[:5] + second = everything[5:] + union = first.union(second) + assert tm.equalContents(union, everything) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + result = first.union(case) + assert tm.equalContents(result, everything) + @pytest.mark.parametrize("tz", tz) def test_union(self, tz): rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) @@ -101,6 +115,24 @@ def test_union_with_DatetimeIndex(self): i1.union(i2) # Works i2.union(i1) # Fails with "AttributeError: can't set attribute" + # TODO: moved from test_datetimelike; de-duplicate with version below + def test_intersection2(self): + first = tm.makeDateIndex(10) + second = first[5:] + intersect = first.intersection(second) + assert tm.equalContents(intersect, second) + + # GH 10149 + cases = [klass(second.values) for klass in [np.array, Series, list]] + for case in cases: + result = first.intersection(case) + assert tm.equalContents(result, second) + + third = Index(['a', 'b', 'c']) + result = first.intersection(third) + expected = pd.Index([], dtype=object) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("tz", [None, 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific']) def test_intersection(self, tz): From 4eb0cec404cdf468baf55038c2d34a0665d2ae4f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 1 Feb 2018 03:33:00 -0800 Subject: [PATCH 020/214] Refactor out libwriters, fix references to Timestamp, Timedelta (#19413) --- pandas/_libs/lib.pyx | 196 +----------------- pandas/_libs/parsers.pyx | 34 +++ pandas/_libs/src/inference.pyx | 26 +-- pandas/_libs/writers.pyx | 174 ++++++++++++++++ pandas/core/computation/scope.py | 2 +- pandas/core/dtypes/cast.py | 2 +- pandas/core/generic.py | 8 +- pandas/core/internals.py | 4 +- pandas/core/nanops.py | 4 +- pandas/core/resample.py | 2 +- pandas/io/formats/format.py | 5 +- pandas/io/json/normalize.py | 2 +- pandas/io/parsers.py | 5 +- pandas/io/pytables.py | 13 +- pandas/io/stata.py | 3 +- pandas/plotting/_converter.py | 6 +- .../indexes/datetimes/test_construction.py | 3 +- pandas/tests/indexes/test_base.py | 2 +- pandas/tests/indexes/test_multi.py | 2 +- pandas/tests/indexes/test_numeric.py | 2 +- pandas/tests/io/json/test_ujson.py | 2 +- pandas/tests/io/parser/common.py | 2 +- pandas/tests/io/parser/converters.py | 2 +- pandas/tests/io/parser/parse_dates.py | 2 +- pandas/tests/io/parser/test_parsers.py | 2 +- pandas/tests/io/parser/usecols.py | 2 +- pandas/tests/series/test_indexing.py | 4 +- pandas/tests/test_lib.py | 10 +- setup.py | 4 + 29 files changed, 262 insertions(+), 263 deletions(-) create mode 100644 pandas/_libs/writers.pyx diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c3a654b01022c..e1d59f807a7fd 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -21,14 +21,7 @@ from cpython cimport (Py_INCREF, PyTuple_SET_ITEM, PyBytes_Check, PyUnicode_Check, PyTuple_New, - PyObject_RichCompareBool, - PyBytes_GET_SIZE, - PyUnicode_GET_SIZE) - -try: - from cpython cimport PyString_GET_SIZE -except ImportError: - from cpython cimport PyUnicode_GET_SIZE as PyString_GET_SIZE + PyObject_RichCompareBool) cimport cpython @@ -38,7 +31,7 @@ from cpython.datetime cimport (PyDateTime_Check, PyDate_Check, PyDateTime_IMPORT) PyDateTime_IMPORT -from tslib import NaT, Timestamp, Timedelta, array_to_datetime +from tslib import NaT, array_to_datetime from missing cimport checknull @@ -127,28 
+120,6 @@ def item_from_zerodim(object val): return util.unbox_if_zerodim(val) -@cython.wraparound(False) -@cython.boundscheck(False) -def fast_unique(ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - list uniques = [] - dict table = {} - object val, stub = 0 - - for i from 0 <= i < n: - val = values[i] - if val not in table: - table[val] = stub - uniques.append(val) - try: - uniques.sort() - except Exception: - pass - - return uniques - - @cython.wraparound(False) @cython.boundscheck(False) def fast_unique_multiple(list arrays): @@ -368,30 +339,6 @@ def has_infs_f8(ndarray[float64_t] arr): return False -def convert_timestamps(ndarray values): - cdef: - object val, f, result - dict cache = {} - Py_ssize_t i, n = len(values) - ndarray[object] out - - # for HDFStore, a bit temporary but... - - from datetime import datetime - f = datetime.fromtimestamp - - out = np.empty(n, dtype='O') - - for i in range(n): - val = util.get_value_1d(values, i) - if val in cache: - out[i] = cache[val] - else: - cache[val] = out[i] = f(val) - - return out - - def maybe_indices_to_slice(ndarray[int64_t] indices, int max_len): cdef: Py_ssize_t i, n = len(indices) @@ -731,145 +678,6 @@ def clean_index_list(list obj): return np.asarray(obj), 0 -ctypedef fused pandas_string: - str - unicode - bytes - - -@cython.boundscheck(False) -@cython.wraparound(False) -cpdef Py_ssize_t max_len_string_array(pandas_string[:] arr): - """ return the maximum size of elements in a 1-dim string array """ - cdef: - Py_ssize_t i, m = 0, l = 0, length = arr.shape[0] - pandas_string v - - for i in range(length): - v = arr[i] - if PyString_Check(v): - l = PyString_GET_SIZE(v) - elif PyBytes_Check(v): - l = PyBytes_GET_SIZE(v) - elif PyUnicode_Check(v): - l = PyUnicode_GET_SIZE(v) - - if l > m: - m = l - - return m - - -@cython.boundscheck(False) -@cython.wraparound(False) -def string_array_replace_from_nan_rep( - ndarray[object, ndim=1] arr, object nan_rep, - object replace=None): - """ - Replace the values in the array with 'replacement' if - they are 'nan_rep'. Return the same array. 
- """ - - cdef int length = arr.shape[0], i = 0 - if replace is None: - replace = np.nan - - for i from 0 <= i < length: - if arr[i] == nan_rep: - arr[i] = replace - - return arr - - -@cython.boundscheck(False) -@cython.wraparound(False) -def convert_json_to_lines(object arr): - """ - replace comma separated json with line feeds, paying special attention - to quotes & brackets - """ - cdef: - Py_ssize_t i = 0, num_open_brackets_seen = 0, length - bint in_quotes = 0, is_escaping = 0 - ndarray[uint8_t] narr - unsigned char v, comma, left_bracket, right_brack, newline - - newline = ord('\n') - comma = ord(',') - left_bracket = ord('{') - right_bracket = ord('}') - quote = ord('"') - backslash = ord('\\') - - narr = np.frombuffer(arr.encode('utf-8'), dtype='u1').copy() - length = narr.shape[0] - for i in range(length): - v = narr[i] - if v == quote and i > 0 and not is_escaping: - in_quotes = ~in_quotes - if v == backslash or is_escaping: - is_escaping = ~is_escaping - if v == comma: # commas that should be \n - if num_open_brackets_seen == 0 and not in_quotes: - narr[i] = newline - elif v == left_bracket: - if not in_quotes: - num_open_brackets_seen += 1 - elif v == right_bracket: - if not in_quotes: - num_open_brackets_seen -= 1 - - return narr.tostring().decode('utf-8') - - -@cython.boundscheck(False) -@cython.wraparound(False) -def write_csv_rows(list data, ndarray data_index, - int nlevels, ndarray cols, object writer): - - cdef int N, j, i, ncols - cdef list rows - cdef object val - - # In crude testing, N>100 yields little marginal improvement - N=100 - - # pre-allocate rows - ncols = len(cols) - rows = [[None] * (nlevels + ncols) for x in range(N)] - - j = -1 - if nlevels == 1: - for j in range(len(data_index)): - row = rows[j % N] - row[0] = data_index[j] - for i in range(ncols): - row[1 + i] = data[i][j] - - if j >= N - 1 and j % N == N - 1: - writer.writerows(rows) - elif nlevels > 1: - for j in range(len(data_index)): - row = rows[j % N] - row[:nlevels] = list(data_index[j]) - for i in range(ncols): - row[nlevels + i] = data[i][j] - - if j >= N - 1 and j % N == N - 1: - writer.writerows(rows) - else: - for j in range(len(data_index)): - row = rows[j % N] - for i in range(ncols): - row[i] = data[i][j] - - if j >= N - 1 and j % N == N - 1: - writer.writerows(rows) - - if j >= 0 and (j < N - 1 or (j % N) != N - 1): - writer.writerows(rows[:((j + 1) % N)]) - - # ------------------------------------------------------------------------------ # Groupby-related functions diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index efe61716d0831..89d2de6de213a 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -2225,3 +2225,37 @@ def _maybe_encode(values): if values is None: return [] return [x.encode('utf-8') if isinstance(x, unicode) else x for x in values] + + +def sanitize_objects(ndarray[object] values, set na_values, + convert_empty=True): + """ + Convert specified values, including the given set na_values and empty + strings if convert_empty is True, to np.nan. 
+ + Parameters + ---------- + values : ndarray[object] + na_values : set + convert_empty : bool (default True) + """ + cdef: + Py_ssize_t i, n + object val, onan + Py_ssize_t na_count = 0 + dict memo = {} + + n = len(values) + onan = np.nan + + for i from 0 <= i < n: + val = values[i] + if (convert_empty and val == '') or (val in na_values): + values[i] = onan + na_count += 1 + elif val in memo: + values[i] = memo[val] + else: + memo[val] = val + + return na_count diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index b29a2e519efcd..75bff34e4a391 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -6,7 +6,7 @@ from tslibs.nattype import NaT from tslibs.conversion cimport convert_to_tsobject from tslibs.timedeltas cimport convert_to_timedelta64 from tslibs.timezones cimport get_timezone, tz_compare -from datetime import datetime, timedelta + iNaT = util.get_nat() cdef bint PY2 = sys.version_info[0] == 2 @@ -1405,30 +1405,6 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, return objects -def sanitize_objects(ndarray[object] values, set na_values, - convert_empty=True): - cdef: - Py_ssize_t i, n - object val, onan - Py_ssize_t na_count = 0 - dict memo = {} - - n = len(values) - onan = np.nan - - for i from 0 <= i < n: - val = values[i] - if (convert_empty and val == '') or (val in na_values): - values[i] = onan - na_count += 1 - elif val in memo: - values[i] = memo[val] - else: - memo[val] = val - - return na_count - - def maybe_convert_bool(ndarray[object] arr, true_values=None, false_values=None): cdef: diff --git a/pandas/_libs/writers.pyx b/pandas/_libs/writers.pyx new file mode 100644 index 0000000000000..6f07d04b3fad3 --- /dev/null +++ b/pandas/_libs/writers.pyx @@ -0,0 +1,174 @@ +# -*- coding: utf-8 -*- + +cimport cython +from cython cimport Py_ssize_t + +from cpython cimport (PyString_Check, PyBytes_Check, PyUnicode_Check, + PyBytes_GET_SIZE, PyUnicode_GET_SIZE) + +try: + from cpython cimport PyString_GET_SIZE +except ImportError: + from cpython cimport PyUnicode_GET_SIZE as PyString_GET_SIZE + +import numpy as np +cimport numpy as cnp +from numpy cimport ndarray, uint8_t +cnp.import_array() + +cimport util + + +ctypedef fused pandas_string: + str + unicode + bytes + + +@cython.boundscheck(False) +@cython.wraparound(False) +def write_csv_rows(list data, ndarray data_index, + int nlevels, ndarray cols, object writer): + """ + Write the given data to the writer object, pre-allocating where possible + for performance improvements. 
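
For orientation, sanitize_objects (relocated into parsers.pyx above) mutates an object array in place and reports how many NA tokens it replaced. A hypothetical call to this internal helper, with made-up data:

    import numpy as np
    from pandas._libs import parsers

    arr = np.array(['1', 'NA', '', '1'], dtype=object)
    na_count = parsers.sanitize_objects(arr, {'NA'})   # convert_empty defaults to True
    # arr has been modified in place to ['1', nan, nan, '1'] and na_count == 2
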
+ + Parameters + ---------- + data : list + data_index : ndarray + nlevels : int + cols : ndarray + writer : object + """ + cdef int N, j, i, ncols + cdef list rows + cdef object val + + # In crude testing, N>100 yields little marginal improvement + N = 100 + + # pre-allocate rows + ncols = len(cols) + rows = [[None] * (nlevels + ncols) for x in range(N)] + + j = -1 + if nlevels == 1: + for j in range(len(data_index)): + row = rows[j % N] + row[0] = data_index[j] + for i in range(ncols): + row[1 + i] = data[i][j] + + if j >= N - 1 and j % N == N - 1: + writer.writerows(rows) + elif nlevels > 1: + for j in range(len(data_index)): + row = rows[j % N] + row[:nlevels] = list(data_index[j]) + for i in range(ncols): + row[nlevels + i] = data[i][j] + + if j >= N - 1 and j % N == N - 1: + writer.writerows(rows) + else: + for j in range(len(data_index)): + row = rows[j % N] + for i in range(ncols): + row[i] = data[i][j] + + if j >= N - 1 and j % N == N - 1: + writer.writerows(rows) + + if j >= 0 and (j < N - 1 or (j % N) != N - 1): + writer.writerows(rows[:((j + 1) % N)]) + + +@cython.boundscheck(False) +@cython.wraparound(False) +def convert_json_to_lines(object arr): + """ + replace comma separated json with line feeds, paying special attention + to quotes & brackets + """ + cdef: + Py_ssize_t i = 0, num_open_brackets_seen = 0, length + bint in_quotes = 0, is_escaping = 0 + ndarray[uint8_t] narr + unsigned char v, comma, left_bracket, right_brack, newline + + newline = ord('\n') + comma = ord(',') + left_bracket = ord('{') + right_bracket = ord('}') + quote = ord('"') + backslash = ord('\\') + + narr = np.frombuffer(arr.encode('utf-8'), dtype='u1').copy() + length = narr.shape[0] + for i in range(length): + v = narr[i] + if v == quote and i > 0 and not is_escaping: + in_quotes = ~in_quotes + if v == backslash or is_escaping: + is_escaping = ~is_escaping + if v == comma: # commas that should be \n + if num_open_brackets_seen == 0 and not in_quotes: + narr[i] = newline + elif v == left_bracket: + if not in_quotes: + num_open_brackets_seen += 1 + elif v == right_bracket: + if not in_quotes: + num_open_brackets_seen -= 1 + + return narr.tostring().decode('utf-8') + + +# stata, pytables +@cython.boundscheck(False) +@cython.wraparound(False) +cpdef Py_ssize_t max_len_string_array(pandas_string[:] arr): + """ return the maximum size of elements in a 1-dim string array """ + cdef: + Py_ssize_t i, m = 0, l = 0, length = arr.shape[0] + pandas_string v + + for i in range(length): + v = arr[i] + if PyString_Check(v): + l = PyString_GET_SIZE(v) + elif PyBytes_Check(v): + l = PyBytes_GET_SIZE(v) + elif PyUnicode_Check(v): + l = PyUnicode_GET_SIZE(v) + + if l > m: + m = l + + return m + + +# ------------------------------------------------------------------ +# PyTables Helpers + + +@cython.boundscheck(False) +@cython.wraparound(False) +def string_array_replace_from_nan_rep( + ndarray[object, ndim=1] arr, object nan_rep, + object replace=None): + """ + Replace the values in the array with 'replacement' if + they are 'nan_rep'. Return the same array. 
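
convert_json_to_lines, now housed in writers.pyx, only rewrites commas that sit at the top level of the JSON stream. A rough illustration with made-up input (not taken from the patch):

    from pandas._libs.writers import convert_json_to_lines

    convert_json_to_lines('{"a": 1, "b": "x,y"}, {"a": 2}')
    # -> '{"a": 1, "b": "x,y"}\n {"a": 2}'
    # only the record-separating comma (bracket depth zero, outside quotes) becomes
    # a newline; the comma after 1 and the one inside the quoted "x,y" are kept
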
+ """ + + cdef int length = arr.shape[0], i = 0 + if replace is None: + replace = np.nan + + for i from 0 <= i < length: + if arr[i] == nan_rep: + arr[i] = replace + + return arr diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py index 6a298f5137eb1..c3128be0f5599 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -48,7 +48,7 @@ def _raw_hex_id(obj): _DEFAULT_GLOBALS = { - 'Timestamp': pandas._libs.lib.Timestamp, + 'Timestamp': pandas._libs.tslib.Timestamp, 'datetime': datetime.datetime, 'True': True, 'False': False, diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 5155662d2f97d..b2816343fc8eb 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -282,7 +282,7 @@ def maybe_promote(dtype, fill_value=np.nan): fill_value = iNaT elif issubclass(dtype.type, np.timedelta64): try: - fill_value = lib.Timedelta(fill_value).value + fill_value = tslib.Timedelta(fill_value).value except Exception: # as for datetimes, cannot upcast to object fill_value = iNaT diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bee954aa9bba8..5a15d720c5790 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10,7 +10,7 @@ import numpy as np import pandas as pd -from pandas._libs import tslib, lib, properties +from pandas._libs import tslib, properties from pandas.core.dtypes.common import ( _ensure_int64, _ensure_object, @@ -7216,9 +7216,9 @@ def describe_categorical_1d(data): if is_datetime64_dtype(data): asint = data.dropna().values.view('i8') names += ['top', 'freq', 'first', 'last'] - result += [lib.Timestamp(top), freq, - lib.Timestamp(asint.min()), - lib.Timestamp(asint.max())] + result += [tslib.Timestamp(top), freq, + tslib.Timestamp(asint.min()), + tslib.Timestamp(asint.max())] else: names += ['top', 'freq'] result += [top, freq] diff --git a/pandas/core/internals.py b/pandas/core/internals.py index f3e5e4c99a899..22d38d3df071e 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2656,7 +2656,7 @@ def _try_coerce_args(self, values, other): other = other.asi8 other_mask = isna(other) elif isinstance(other, (np.datetime64, datetime, date)): - other = lib.Timestamp(other) + other = tslib.Timestamp(other) tz = getattr(other, 'tz', None) # test we can have an equal time zone @@ -2675,7 +2675,7 @@ def _try_coerce_result(self, result): if result.dtype.kind in ['i', 'f', 'O']: result = result.astype('M8[ns]') elif isinstance(result, (np.integer, np.float, np.datetime64)): - result = lib.Timestamp(result, tz=self.values.tz) + result = tslib.Timestamp(result, tz=self.values.tz) if isinstance(result, np.ndarray): # allow passing of > 1dim if its trivial if result.ndim > 1: diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index eda86f12d501d..d4851f579dda4 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -268,7 +268,7 @@ def _wrap_results(result, dtype): if is_datetime64_dtype(dtype): if not isinstance(result, np.ndarray): - result = lib.Timestamp(result) + result = tslib.Timestamp(result) else: result = result.view(dtype) elif is_timedelta64_dtype(dtype): @@ -278,7 +278,7 @@ def _wrap_results(result, dtype): if np.fabs(result) > _int64_max: raise ValueError("overflow in timedelta operation") - result = lib.Timedelta(result, unit='ns') + result = tslib.Timedelta(result, unit='ns') else: result = result.astype('i8').view(dtype) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 706bec9e44892..961c8c004e9e3 100644 --- 
a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -24,7 +24,7 @@ from pandas.compat.numpy import function as nv from pandas._libs import lib, tslib -from pandas._libs.lib import Timestamp +from pandas._libs.tslib import Timestamp from pandas._libs.tslibs.period import IncompatibleFrequency from pandas.util._decorators import Appender, Substitution diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index bca0b64cb53fe..269c81b380b5e 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -38,7 +38,7 @@ _stringify_path) from pandas.io.formats.printing import adjoin, justify, pprint_thing from pandas.io.formats.common import get_level_lengths -from pandas._libs import lib +from pandas._libs import lib, writers as libwriters from pandas._libs.tslib import (iNaT, Timestamp, Timedelta, format_array_from_datetime) from pandas.core.indexes.datetimes import DatetimeIndex @@ -1789,7 +1789,8 @@ def _save_chunk(self, start_i, end_i): date_format=self.date_format, quoting=self.quoting) - lib.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer) + libwriters.write_csv_rows(self.data, ix, self.nlevels, + self.cols, self.writer) # ---------------------------------------------------------------------- diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index 595031b04e367..c7901f4352d00 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -5,7 +5,7 @@ from collections import defaultdict import numpy as np -from pandas._libs.lib import convert_json_to_lines +from pandas._libs.writers import convert_json_to_lines from pandas import compat, DataFrame diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5135bb01fb378..af1441f4a0fc9 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1596,11 +1596,12 @@ def _infer_types(self, values, na_values, try_num_bool=True): except Exception: result = values if values.dtype == np.object_: - na_count = lib.sanitize_objects(result, na_values, False) + na_count = parsers.sanitize_objects(result, na_values, + False) else: result = values if values.dtype == np.object_: - na_count = lib.sanitize_objects(values, na_values, False) + na_count = parsers.sanitize_objects(values, na_values, False) if result.dtype == np.object_ and try_num_bool: result = lib.maybe_convert_bool(values, diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 5376473f83f22..0d833807602e1 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -47,7 +47,7 @@ from pandas.core.config import get_option from pandas.core.computation.pytables import Expr, maybe_expression -from pandas._libs import algos, lib +from pandas._libs import algos, lib, writers as libwriters from pandas._libs.tslibs import timezones from distutils.version import LooseVersion @@ -3843,7 +3843,7 @@ def read(self, where=None, columns=None, **kwargs): # need a better algorithm tuple_index = long_index.values - unique_tuples = lib.fast_unique(tuple_index) + unique_tuples = unique(tuple_index) unique_tuples = com._asarray_tuplesafe(unique_tuples) indexer = match(unique_tuples, tuple_index) @@ -4561,7 +4561,8 @@ def _convert_string_array(data, encoding, itemsize=None): # create the sized dtype if itemsize is None: - itemsize = lib.max_len_string_array(_ensure_object(data.ravel())) + ensured = _ensure_object(data.ravel()) + itemsize = libwriters.max_len_string_array(ensured) data = np.asarray(data, dtype="S%d" % itemsize) return data @@ -4590,7 +4591,7 @@ def _unconvert_string_array(data, 
nan_rep=None, encoding=None): encoding = _ensure_encoding(encoding) if encoding is not None and len(data): - itemsize = lib.max_len_string_array(_ensure_object(data)) + itemsize = libwriters.max_len_string_array(_ensure_object(data)) if compat.PY3: dtype = "U{0}".format(itemsize) else: @@ -4604,7 +4605,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): if nan_rep is None: nan_rep = 'nan' - data = lib.string_array_replace_from_nan_rep(data, nan_rep) + data = libwriters.string_array_replace_from_nan_rep(data, nan_rep) return data.reshape(shape) @@ -4621,7 +4622,7 @@ def _get_converter(kind, encoding): if kind == 'datetime64': return lambda x: np.asarray(x, dtype='M8[ns]') elif kind == 'datetime': - return lib.convert_timestamps + return lambda x: to_datetime(x, cache=True).to_pydatetime() elif kind == 'string': return lambda x: _unconvert_string_array(x, encoding=encoding) else: # pragma: no cover diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 0922a4a9c3e9b..adbff06364dbe 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -17,8 +17,9 @@ import numpy as np from dateutil.relativedelta import relativedelta -from pandas._libs.lib import max_len_string_array, infer_dtype +from pandas._libs.lib import infer_dtype from pandas._libs.tslib import NaT, Timestamp +from pandas._libs.writers import max_len_string_array import pandas as pd from pandas import compat, to_timedelta, to_datetime, isna, DatetimeIndex diff --git a/pandas/plotting/_converter.py b/pandas/plotting/_converter.py index 66ee7fa98491f..07163615c6ba4 100644 --- a/pandas/plotting/_converter.py +++ b/pandas/plotting/_converter.py @@ -23,7 +23,7 @@ from pandas.compat import lrange import pandas.compat as compat -import pandas._libs.lib as lib +from pandas._libs import tslib import pandas.core.common as com from pandas.core.index import Index @@ -52,7 +52,7 @@ def get_pairs(): pairs = [ - (lib.Timestamp, DatetimeConverter), + (tslib.Timestamp, DatetimeConverter), (Period, PeriodConverter), (pydt.datetime, DatetimeConverter), (pydt.date, DatetimeConverter), @@ -312,7 +312,7 @@ def try_parse(values): if isinstance(values, (datetime, pydt.date)): return _dt_to_float_ordinal(values) elif isinstance(values, np.datetime64): - return _dt_to_float_ordinal(lib.Timestamp(values)) + return _dt_to_float_ordinal(tslib.Timestamp(values)) elif isinstance(values, pydt.time): return dates.date2num(values) elif (is_integer(values) or is_float(values)): diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py index b59dd25ead57f..197a42bdaacbb 100644 --- a/pandas/tests/indexes/datetimes/test_construction.py +++ b/pandas/tests/indexes/datetimes/test_construction.py @@ -7,7 +7,6 @@ import pandas as pd from pandas import offsets import pandas.util.testing as tm -from pandas._libs import lib from pandas._libs.tslib import OutOfBoundsDatetime from pandas._libs.tslibs import conversion from pandas import (DatetimeIndex, Index, Timestamp, datetime, date_range, @@ -537,7 +536,7 @@ def test_datetimeindex_constructor_misc(self): arr = [datetime(2005, 1, 1), '1/2/2005', '1/3/2005', '2005-01-04'] idx2 = DatetimeIndex(arr) - arr = [lib.Timestamp(datetime(2005, 1, 1)), '1/2/2005', '1/3/2005', + arr = [Timestamp(datetime(2005, 1, 1)), '1/2/2005', '1/3/2005', '2005-01-04'] idx3 = DatetimeIndex(arr) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 508c3a73f48c7..974099f1fbbe9 100644 --- a/pandas/tests/indexes/test_base.py +++ 
b/pandas/tests/indexes/test_base.py @@ -29,7 +29,7 @@ from pandas.core.indexes.datetimes import _to_m8 import pandas as pd -from pandas._libs.lib import Timestamp +from pandas._libs.tslib import Timestamp class TestIndex(Base): diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index aedc957ec67da..e59456b8a2d5e 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -19,7 +19,7 @@ from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.indexes.base import InvalidIndexError from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike -from pandas._libs.lib import Timestamp +from pandas._libs.tslib import Timestamp import pandas.util.testing as tm diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 3de1c4c982654..0c1bec7a6f1a9 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -13,7 +13,7 @@ import pandas.util.testing as tm import pandas as pd -from pandas._libs.lib import Timestamp +from pandas._libs.tslib import Timestamp from pandas.tests.indexes.common import Base diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index cd1685f282bd2..e949772981eb7 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -425,7 +425,7 @@ def test_npy_nat(self): assert ujson.encode(input) == 'null', "Expected null" def test_datetime_units(self): - from pandas._libs.lib import Timestamp + from pandas._libs.tslib import Timestamp val = datetime.datetime(2013, 8, 17, 21, 17, 12, 215504) stamp = Timestamp(val) diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index f677b356a77a5..cf7ec9e2f2652 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -12,7 +12,7 @@ import pytest import numpy as np -from pandas._libs.lib import Timestamp +from pandas._libs.tslib import Timestamp import pandas as pd import pandas.util.testing as tm diff --git a/pandas/tests/io/parser/converters.py b/pandas/tests/io/parser/converters.py index 1176b1e84e29b..ae35d45591dc5 100644 --- a/pandas/tests/io/parser/converters.py +++ b/pandas/tests/io/parser/converters.py @@ -13,7 +13,7 @@ import pandas as pd import pandas.util.testing as tm -from pandas._libs.lib import Timestamp +from pandas._libs.tslib import Timestamp from pandas import DataFrame, Index from pandas.compat import parse_date, StringIO, lmap diff --git a/pandas/tests/io/parser/parse_dates.py b/pandas/tests/io/parser/parse_dates.py index b7d0dd1a3484f..919b357f14236 100644 --- a/pandas/tests/io/parser/parse_dates.py +++ b/pandas/tests/io/parser/parse_dates.py @@ -11,7 +11,7 @@ import pytest import numpy as np from pandas._libs.tslibs import parsing -from pandas._libs.lib import Timestamp +from pandas._libs.tslib import Timestamp import pandas as pd import pandas.io.parsers as parsers diff --git a/pandas/tests/io/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py index ec240531925e3..7717102b64fc5 100644 --- a/pandas/tests/io/parser/test_parsers.py +++ b/pandas/tests/io/parser/test_parsers.py @@ -5,7 +5,7 @@ from pandas import read_csv, read_table, DataFrame import pandas.core.common as com -from pandas._libs.lib import Timestamp +from pandas._libs.tslib import Timestamp from pandas.compat import StringIO from .common import ParserTests diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py index 
8767055239cd5..195fb4cba2aed 100644 --- a/pandas/tests/io/parser/usecols.py +++ b/pandas/tests/io/parser/usecols.py @@ -11,7 +11,7 @@ import pandas.util.testing as tm from pandas import DataFrame, Index -from pandas._libs.lib import Timestamp +from pandas._libs.tslib import Timestamp from pandas.compat import StringIO diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index fbfbad547ce1b..e5c3d6f7d3ee1 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -17,7 +17,7 @@ Categorical) from pandas.core.indexing import IndexingError from pandas.tseries.offsets import BDay -from pandas._libs import tslib, lib +from pandas._libs import tslib from pandas.compat import lrange, range from pandas import compat @@ -2707,7 +2707,7 @@ def test_fancy_getitem(self): assert s['1/2/2009'] == 48 assert s['2009-1-2'] == 48 assert s[datetime(2009, 1, 2)] == 48 - assert s[lib.Timestamp(datetime(2009, 1, 2))] == 48 + assert s[Timestamp(datetime(2009, 1, 2))] == 48 pytest.raises(KeyError, s.__getitem__, '2009-1-3') assert_series_equal(s['3/6/2009':'2009-06-05'], diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index 10061204df42a..502f0c3bced61 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -3,7 +3,7 @@ import pytest import numpy as np -from pandas._libs import lib +from pandas._libs import lib, writers as libwriters import pandas.util.testing as tm @@ -12,19 +12,19 @@ class TestMisc(object): def test_max_len_string_array(self): arr = a = np.array(['foo', 'b', np.nan], dtype='object') - assert lib.max_len_string_array(arr) == 3 + assert libwriters.max_len_string_array(arr) == 3 # unicode arr = a.astype('U').astype(object) - assert lib.max_len_string_array(arr) == 3 + assert libwriters.max_len_string_array(arr) == 3 # bytes for python3 arr = a.astype('S').astype(object) - assert lib.max_len_string_array(arr) == 3 + assert libwriters.max_len_string_array(arr) == 3 # raises pytest.raises(TypeError, - lambda: lib.max_len_string_array(arr.astype('U'))) + lambda: libwriters.max_len_string_array(arr.astype('U'))) def test_fast_unique_multiple_list_gen_sort(self): keys = [['p', 'a'], ['n', 'd'], ['a', 's']] diff --git a/setup.py b/setup.py index 27943a776c414..5397a1b84dc4d 100755 --- a/setup.py +++ b/setup.py @@ -328,6 +328,7 @@ class CheckSDist(sdist_class): 'pandas/_libs/tslibs/frequencies.pyx', 'pandas/_libs/tslibs/resolution.pyx', 'pandas/_libs/tslibs/parsing.pyx', + 'pandas/_libs/writers.pyx', 'pandas/io/sas/sas.pyx'] def initialize_options(self): @@ -616,6 +617,9 @@ def pxd(name): '_libs.window': { 'pyxfile': '_libs/window', 'pxdfiles': ['_libs/skiplist', '_libs/src/util']}, + '_libs.writers': { + 'pyxfile': '_libs/writers', + 'pxdfiles': ['_libs/src/util']}, 'io.sas._sas': { 'pyxfile': 'io/sas/sas'}} From b5dd6a38b18b3da8736a64ce3ce9b80bbe44f35f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 1 Feb 2018 07:45:15 -0500 Subject: [PATCH 021/214] PERF: remove use of Panel & perf in rolling corr/cov (#19257) * PERF: remove use of Panel & perf in rolling corr/cov closes #17917 --- asv_bench/benchmarks/rolling.py | 25 ++++++++++++++++-- doc/source/whatsnew/v0.23.0.txt | 2 +- pandas/core/reshape/pivot.py | 8 ++---- pandas/core/window.py | 47 +++++++++++++++++++++------------ pandas/tests/test_window.py | 22 ++++++++------- 5 files changed, 69 insertions(+), 35 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 59cf7d090a622..75990d83f8212 100644 --- 
a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -11,8 +11,8 @@ class Methods(object): [10, 1000], ['int', 'float'], ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt', - 'sum', 'corr', 'cov']) - param_names = ['constructor', 'window', 'dtype', 'method'] + 'sum']) + param_names = ['contructor', 'window', 'dtype', 'method'] def setup(self, constructor, window, dtype, method): N = 10**5 @@ -23,6 +23,27 @@ def time_rolling(self, constructor, window, dtype, method): getattr(self.roll, method)() +class Pairwise(object): + + sample_time = 0.2 + params = ([10, 1000, None], + ['corr', 'cov'], + [True, False]) + param_names = ['window', 'method', 'pairwise'] + + def setup(self, window, method, pairwise): + N = 10**4 + arr = np.random.random(N) + self.df = pd.DataFrame(arr) + + def time_pairwise(self, window, method, pairwise): + if window is None: + r = self.df.expanding() + else: + r = self.df.rolling(window=window) + getattr(r, method)(self.df, pairwise=pairwise) + + class Quantile(object): sample_time = 0.2 diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 2bd2bb199bf1f..5db29cb76b106 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -383,7 +383,7 @@ Performance Improvements - :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`) - Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`) - Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`) - +- Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`) .. 
_whatsnew_0230.docs: diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 0e92fc4edce85..a4c9848dca900 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -99,19 +99,15 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', if not dropna: from pandas import MultiIndex - try: + if table.index.nlevels > 1: m = MultiIndex.from_arrays(cartesian_product(table.index.levels), names=table.index.names) table = table.reindex(m, axis=0) - except AttributeError: - pass # it's a single level - try: + if table.columns.nlevels > 1: m = MultiIndex.from_arrays(cartesian_product(table.columns.levels), names=table.columns.names) table = table.reindex(m, axis=1) - except AttributeError: - pass # it's a single level or a series if isinstance(table, ABCDataFrame): table = table.sort_index(axis=1) diff --git a/pandas/core/window.py b/pandas/core/window.py index 4d6a1de60f59b..a3f19ef50459d 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -1863,25 +1863,38 @@ def dataframe_from_int_dict(data, frame_template): results[i][j] = f(*_prep_binary(arg1.iloc[:, i], arg2.iloc[:, j])) - # TODO: not the most efficient (perf-wise) - # though not bad code-wise - from pandas import Panel, MultiIndex, concat - - with warnings.catch_warnings(record=True): - p = Panel.from_dict(results).swapaxes('items', 'major') - if len(p.major_axis) > 0: - p.major_axis = arg1.columns[p.major_axis] - if len(p.minor_axis) > 0: - p.minor_axis = arg2.columns[p.minor_axis] - - if len(p.items): + from pandas import MultiIndex, concat + + result_index = arg1.index.union(arg2.index) + if len(result_index): + + # construct result frame result = concat( - [p.iloc[i].T for i in range(len(p.items))], - keys=p.items) + [concat([results[i][j] + for j, c in enumerate(arg2.columns)], + ignore_index=True) + for i, c in enumerate(arg1.columns)], + ignore_index=True, + axis=1) + result.columns = arg1.columns + + # set the index and reorder + if arg2.columns.nlevels > 1: + result.index = MultiIndex.from_product( + arg2.columns.levels + [result_index]) + result = result.reorder_levels([2, 0, 1]).sort_index() + else: + result.index = MultiIndex.from_product( + [range(len(arg2.columns)), + range(len(result_index))]) + result = result.swaplevel(1, 0).sort_index() + result.index = MultiIndex.from_product( + [result_index] + [arg2.columns]) else: + # empty result result = DataFrame( - index=MultiIndex(levels=[arg1.index, arg1.columns], + index=MultiIndex(levels=[arg1.index, arg2.columns], labels=[[], []]), columns=arg2.columns, dtype='float64') @@ -1890,9 +1903,9 @@ def dataframe_from_int_dict(data, frame_template): # reset our column names to arg2 names # careful not to mutate the original names result.columns = result.columns.set_names( - arg2.columns.names) + arg1.columns.names) result.index = result.index.set_names( - arg1.index.names + arg1.columns.names) + result_index.names + arg2.columns.names) return result diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 22526d14a7168..dabdb1e8e689c 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -14,6 +14,7 @@ import pandas.tseries.offsets as offsets from pandas.core.base import SpecificationError from pandas.errors import UnsupportedFunctionCall +from pandas.core.sorting import safe_sort import pandas.util.testing as tm import pandas.util._test_decorators as td from pandas.compat import range, zip @@ -1645,7 +1646,7 @@ def compare(self, result, expected): result = 
result.dropna().values expected = expected.dropna().values - tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result, expected, check_dtype=False) @pytest.mark.parametrize('f', [lambda x: x.cov(), lambda x: x.corr()]) def test_no_flex(self, f): @@ -1670,15 +1671,19 @@ def test_no_flex(self, f): def test_pairwise_with_self(self, f): # DataFrame with itself, pairwise=True - results = [f(df) for df in self.df1s] - for (df, result) in zip(self.df1s, results): + # note that we may construct the 1st level of the MI + # in a non-motononic way, so compare accordingly + results = [] + for i, df in enumerate(self.df1s): + result = f(df) tm.assert_index_equal(result.index.levels[0], df.index, check_names=False) - tm.assert_index_equal(result.index.levels[1], - df.columns, - check_names=False) + tm.assert_numpy_array_equal(safe_sort(result.index.levels[1]), + safe_sort(df.columns.unique())) tm.assert_index_equal(result.columns, df.columns) + results.append(df) + for i, result in enumerate(results): if i > 0: self.compare(result, results[0]) @@ -1716,9 +1721,8 @@ def test_pairwise_with_other(self, f): tm.assert_index_equal(result.index.levels[0], df.index, check_names=False) - tm.assert_index_equal(result.index.levels[1], - self.df2.columns, - check_names=False) + tm.assert_numpy_array_equal(safe_sort(result.index.levels[1]), + safe_sort(self.df2.columns.unique())) for i, result in enumerate(results): if i > 0: self.compare(result, results[0]) From 113f78886907a77fd4c73e1456833e83ee48594f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 1 Feb 2018 07:54:56 -0500 Subject: [PATCH 022/214] TST: fix up pandas_datareader downstream tests (#19490) closes #18935 --- pandas/tests/test_downstream.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 0f0abd8cd3400..b438d6a6137b0 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -89,7 +89,8 @@ def test_pandas_gbq(df): def test_pandas_datareader(): pandas_datareader = import_module('pandas_datareader') # noqa - pandas_datareader.get_data_google('AAPL') + pandas_datareader.DataReader( + 'F', 'quandl', '2017-01-01', '2017-02-01') def test_geopandas(): From d3851ac09d6a9121cea44aabbdc7e4f60f06b7d9 Mon Sep 17 00:00:00 2001 From: Matt Kirk Date: Thu, 1 Feb 2018 20:09:17 +0700 Subject: [PATCH 023/214] BUG: fix issue with concat creating SparseFrame if not all series are sparse. (#18924) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/dtypes/concat.py | 10 ++- pandas/core/dtypes/generic.py | 2 + pandas/tests/dtypes/test_generic.py | 2 + pandas/tests/reshape/test_reshape.py | 9 +++ pandas/tests/sparse/test_combine_concat.py | 85 +++++++++++++--------- 6 files changed, 71 insertions(+), 38 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 5db29cb76b106..6cbdc3be07f13 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -540,6 +540,7 @@ Reshaping - Bug in :func:`DataFrame.merge` in which merging using ``Index`` objects as vectors raised an Exception (:issue:`19038`) - Bug in :func:`DataFrame.stack`, :func:`DataFrame.unstack`, :func:`Series.unstack` which were not returning subclasses (:issue:`15563`) - Bug in timezone comparisons, manifesting as a conversion of the index to UTC in ``.concat()`` (:issue:`18523`) +- Bug in :func:`concat` when concatting sparse and dense series it returns only a ``SparseDataFrame``. 
Should be a ``DataFrame``. (:issue:`18914`, :issue:`18686`, and :issue:`16874`) - diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 3e54ce61cd5b2..ddecbe85087d8 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -19,7 +19,7 @@ _TD_DTYPE) from pandas.core.dtypes.generic import ( ABCDatetimeIndex, ABCTimedeltaIndex, - ABCPeriodIndex, ABCRangeIndex) + ABCPeriodIndex, ABCRangeIndex, ABCSparseDataFrame) def get_dtype_kinds(l): @@ -89,14 +89,16 @@ def _get_series_result_type(result, objs=None): def _get_frame_result_type(result, objs): """ return appropriate class of DataFrame-like concat - if any block is SparseBlock, return SparseDataFrame + if all blocks are SparseBlock, return SparseDataFrame otherwise, return 1st obj """ - if any(b.is_sparse for b in result.blocks): + + if result.blocks and all(b.is_sparse for b in result.blocks): from pandas.core.sparse.api import SparseDataFrame return SparseDataFrame else: - return objs[0] + return next(obj for obj in objs if not isinstance(obj, + ABCSparseDataFrame)) def _concat_compat(to_concat, axis=0): diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index 6fae09c43d2be..b032cb6f14d4c 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -43,6 +43,8 @@ def _check(cls, inst): ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series", )) ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe", )) +ABCSparseDataFrame = create_pandas_abc_type("ABCSparseDataFrame", "_subtyp", + ("sparse_frame", )) ABCPanel = create_pandas_abc_type("ABCPanel", "_typ", ("panel",)) ABCSparseSeries = create_pandas_abc_type("ABCSparseSeries", "_subtyp", ('sparse_series', diff --git a/pandas/tests/dtypes/test_generic.py b/pandas/tests/dtypes/test_generic.py index 58cb182e7d403..53f92b98f022e 100644 --- a/pandas/tests/dtypes/test_generic.py +++ b/pandas/tests/dtypes/test_generic.py @@ -18,6 +18,7 @@ class TestABCClasses(object): df = pd.DataFrame({'names': ['a', 'b', 'c']}, index=multi_index) sparse_series = pd.Series([1, 2, 3]).to_sparse() sparse_array = pd.SparseArray(np.random.randn(10)) + sparse_frame = pd.SparseDataFrame({'a': [1, -1, None]}) def test_abc_types(self): assert isinstance(pd.Index(['a', 'b', 'c']), gt.ABCIndex) @@ -37,6 +38,7 @@ def test_abc_types(self): assert isinstance(self.df.to_panel(), gt.ABCPanel) assert isinstance(self.sparse_series, gt.ABCSparseSeries) assert isinstance(self.sparse_array, gt.ABCSparseArray) + assert isinstance(self.sparse_frame, gt.ABCSparseDataFrame) assert isinstance(self.categorical, gt.ABCCategorical) assert isinstance(pd.Period('2012', freq='A-DEC'), gt.ABCPeriod) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 22925cceb30d1..c9d079421532f 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -454,6 +454,15 @@ def test_dataframe_dummies_preserve_categorical_dtype(self, dtype): tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize('sparse', [True, False]) + def test_get_dummies_dont_sparsify_all_columns(self, sparse): + # GH18914 + df = DataFrame.from_items([('GDP', [1, 2]), ('Nation', ['AB', 'CD'])]) + df = get_dummies(df, columns=['Nation'], sparse=sparse) + df2 = df.reindex(columns=['GDP']) + + tm.assert_frame_equal(df[['GDP']], df2) + class TestCategoricalReshape(object): diff --git a/pandas/tests/sparse/test_combine_concat.py b/pandas/tests/sparse/test_combine_concat.py index 
15639fbe156c6..70fd1da529d46 100644 --- a/pandas/tests/sparse/test_combine_concat.py +++ b/pandas/tests/sparse/test_combine_concat.py @@ -1,8 +1,10 @@ # pylint: disable-msg=E1101,W0612 +import pytest import numpy as np import pandas as pd import pandas.util.testing as tm +import itertools class TestSparseSeriesConcat(object): @@ -317,37 +319,52 @@ def test_concat_axis1(self): assert isinstance(res, pd.SparseDataFrame) tm.assert_frame_equal(res.to_dense(), exp) - def test_concat_sparse_dense(self): - sparse = self.dense1.to_sparse() - - res = pd.concat([sparse, self.dense2]) - exp = pd.concat([self.dense1, self.dense2]) - assert isinstance(res, pd.SparseDataFrame) - tm.assert_frame_equal(res.to_dense(), exp) - - res = pd.concat([self.dense2, sparse]) - exp = pd.concat([self.dense2, self.dense1]) - assert isinstance(res, pd.SparseDataFrame) - tm.assert_frame_equal(res.to_dense(), exp) - - sparse = self.dense1.to_sparse(fill_value=0) - - res = pd.concat([sparse, self.dense2]) - exp = pd.concat([self.dense1, self.dense2]) - assert isinstance(res, pd.SparseDataFrame) - tm.assert_frame_equal(res.to_dense(), exp) - - res = pd.concat([self.dense2, sparse]) - exp = pd.concat([self.dense2, self.dense1]) - assert isinstance(res, pd.SparseDataFrame) - tm.assert_frame_equal(res.to_dense(), exp) - - res = pd.concat([self.dense3, sparse], axis=1) - exp = pd.concat([self.dense3, self.dense1], axis=1) - assert isinstance(res, pd.SparseDataFrame) - tm.assert_frame_equal(res, exp) - - res = pd.concat([sparse, self.dense3], axis=1) - exp = pd.concat([self.dense1, self.dense3], axis=1) - assert isinstance(res, pd.SparseDataFrame) - tm.assert_frame_equal(res, exp) + @pytest.mark.parametrize('fill_value,sparse_idx,dense_idx', + itertools.product([None, 0, 1, np.nan], + [0, 1], + [1, 0])) + def test_concat_sparse_dense_rows(self, fill_value, sparse_idx, dense_idx): + frames = [self.dense1, self.dense2] + sparse_frame = [frames[dense_idx], + frames[sparse_idx].to_sparse(fill_value=fill_value)] + dense_frame = [frames[dense_idx], frames[sparse_idx]] + + # This will try both directions sparse + dense and dense + sparse + for _ in range(2): + res = pd.concat(sparse_frame) + exp = pd.concat(dense_frame) + + assert isinstance(res, pd.SparseDataFrame) + tm.assert_frame_equal(res.to_dense(), exp) + + sparse_frame = sparse_frame[::-1] + dense_frame = dense_frame[::-1] + + @pytest.mark.parametrize('fill_value,sparse_idx,dense_idx', + itertools.product([None, 0, 1, np.nan], + [0, 1], + [1, 0])) + def test_concat_sparse_dense_cols(self, fill_value, sparse_idx, dense_idx): + # See GH16874, GH18914 and #18686 for why this should be a DataFrame + + frames = [self.dense1, self.dense3] + + sparse_frame = [frames[dense_idx], + frames[sparse_idx].to_sparse(fill_value=fill_value)] + dense_frame = [frames[dense_idx], frames[sparse_idx]] + + # This will try both directions sparse + dense and dense + sparse + for _ in range(2): + res = pd.concat(sparse_frame, axis=1) + exp = pd.concat(dense_frame, axis=1) + + for column in frames[dense_idx].columns: + if dense_idx == sparse_idx: + tm.assert_frame_equal(res[column], exp[column]) + else: + tm.assert_series_equal(res[column], exp[column]) + + tm.assert_frame_equal(res, exp) + + sparse_frame = sparse_frame[::-1] + dense_frame = dense_frame[::-1] From c7688299e0621a072ae27ad480c9d35f223a08ce Mon Sep 17 00:00:00 2001 From: Mitch Negus <21086604+mitchnegus@users.noreply.github.com> Date: Thu, 1 Feb 2018 05:15:42 -0800 Subject: [PATCH 024/214] updated hist documentation (#19366) --- 
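The docstring change in this patch spells out that ``bins`` may be either an integer (a bin count) or a sequence of bin edges. A short sketch of the two forms, with made-up data and requiring matplotlib to actually draw anything::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'length': np.random.randn(100) + 5})

    # An integer asks for that many equal-width bins (bins + 1 edges are
    # computed from the data) ...
    df.hist(column='length', bins=10)

    # ... while a sequence is taken as the exact bin edges and used as-is.
    df.hist(column='length', bins=[0, 2, 4, 4.5, 5, 5.5, 6, 8, 10])
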
pandas/plotting/_core.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 8b03d6ddde4ec..88b899ad60313 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -2156,10 +2156,18 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, The size of the figure to create in inches by default layout : tuple, optional Tuple of (rows, columns) for the layout of the histograms - bins : integer, default 10 - Number of histogram bins to be used + bins : integer or sequence, default 10 + Number of histogram bins to be used. If an integer is given, bins + 1 + bin edges are calculated and returned. If bins is a sequence, gives + bin edges, including left edge of first bin and right edge of last + bin. In this case, bins is returned unmodified. `**kwds` : other plotting keyword arguments To be passed to hist function + + See Also + -------- + matplotlib.axes.Axes.hist : Plot a histogram using matplotlib. + """ _converter._WARN = False if by is not None: @@ -2219,14 +2227,19 @@ def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, rotation of y axis labels figsize : tuple, default None figure size in inches by default + bins : integer or sequence, default 10 + Number of histogram bins to be used. If an integer is given, bins + 1 + bin edges are calculated and returned. If bins is a sequence, gives + bin edges, including left edge of first bin and right edge of last + bin. In this case, bins is returned unmodified. bins: integer, default 10 Number of histogram bins to be used `**kwds` : keywords To be passed to the actual plotting function - Notes - ----- - See matplotlib documentation online for more on this + See Also + -------- + matplotlib.axes.Axes.hist : Plot a histogram using matplotlib. """ import matplotlib.pyplot as plt From 09307dd06a73b5702095987fb5868275d44cc1f7 Mon Sep 17 00:00:00 2001 From: Upkar Lidder Date: Thu, 1 Feb 2018 05:26:35 -0800 Subject: [PATCH 025/214] CLN: GH19404 Changing function signature to match logic (#19425) --- pandas/core/generic.py | 2 +- pandas/io/clipboards.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5a15d720c5790..48981a27f3c7e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1906,7 +1906,7 @@ def to_pickle(self, path, compression='infer', return to_pickle(self, path, compression=compression, protocol=protocol) - def to_clipboard(self, excel=None, sep=None, **kwargs): + def to_clipboard(self, excel=True, sep=None, **kwargs): """ Attempt to write text representation of object to the system clipboard This can be pasted into Excel, for example. diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index 347ec41baf0e1..dcc221ce978b3 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -63,7 +63,7 @@ def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover return read_table(StringIO(text), sep=sep, **kwargs) -def to_clipboard(obj, excel=None, sep=None, **kwargs): # pragma: no cover +def to_clipboard(obj, excel=True, sep=None, **kwargs): # pragma: no cover """ Attempt to write text representation of object to the system clipboard The clipboard can be then pasted into Excel for example. 
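A quick usage sketch of the ``to_clipboard`` signature clarified above, where ``excel=True`` is now spelled out as the default instead of ``None``. It needs a working system clipboard (``pbcopy``, ``xclip`` or similar), so treat it as illustrative rather than something a headless test run can execute::

    import pandas as pd

    df = pd.DataFrame({'A': [1, 2, 3], 'B': ['x', 'y', 'z']})

    # excel=True (the default) copies a tab-separated table that pastes into
    # a spreadsheet one value per cell; excel=False copies the plain text
    # representation of the frame instead.
    df.to_clipboard(excel=True, sep='\t')
    df.to_clipboard(excel=False)
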
From 35812eaaecebeeee0ddf07dee4b583c4eea07785 Mon Sep 17 00:00:00 2001 From: WBare Date: Thu, 1 Feb 2018 08:37:19 -0500 Subject: [PATCH 026/214] ENH limit_area added to interpolate1d closes #16284 --- doc/source/missing_data.rst | 53 +++++++++--- doc/source/whatsnew/v0.23.0.txt | 35 +++++++- pandas/core/generic.py | 10 ++- pandas/core/internals.py | 10 ++- pandas/core/missing.py | 130 +++++++++++++++++----------- pandas/core/resample.py | 4 +- pandas/tests/series/test_missing.py | 39 +++++++++ 7 files changed, 208 insertions(+), 73 deletions(-) diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index f56378b533909..ee0e2c7462f66 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -190,7 +190,7 @@ Sum/Prod of Empties/Nans .. warning:: This behavior is now standard as of v0.21.0; previously sum/prod would give different - results if the ``bottleneck`` package was installed. + results if the ``bottleneck`` package was installed. See the :ref:`v0.21.0 whatsnew `. With ``sum`` or ``prod`` on an empty or all-``NaN`` ``Series``, or columns of a ``DataFrame``, the result will be all-``NaN``. @@ -353,7 +353,11 @@ examined :ref:`in the API `. Interpolation ~~~~~~~~~~~~~ -Both Series and DataFrame objects have an :meth:`~DataFrame.interpolate` method +.. versionadded:: 0.21.0 + + The ``limit_area`` keyword argument was added. + +Both Series and DataFrame objects have an :meth:`~DataFrame.interpolate` method that, by default, performs linear interpolation at missing datapoints. .. ipython:: python @@ -477,33 +481,54 @@ at the new values. .. _documentation: http://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation .. _guide: http://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html +.. _missing_data.interp_limits: + Interpolation Limits ^^^^^^^^^^^^^^^^^^^^ Like other pandas fill methods, ``interpolate`` accepts a ``limit`` keyword -argument. Use this argument to limit the number of consecutive interpolations, -keeping ``NaN`` values for interpolations that are too far from the last valid -observation: +argument. Use this argument to limit the number of consecutive ``NaN`` values +filled since the last valid observation: .. ipython:: python - ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13]) - ser.interpolate(limit=2) + ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13, np.nan, np.nan]) -By default, ``limit`` applies in a forward direction, so that only ``NaN`` -values after a non-``NaN`` value can be filled. If you provide ``'backward'`` or -``'both'`` for the ``limit_direction`` keyword argument, you can fill ``NaN`` -values before non-``NaN`` values, or both before and after non-``NaN`` values, -respectively: + # fill all consecutive values in a forward direction + ser.interpolate() -.. ipython:: python + # fill one consecutive value in a forward direction + ser.interpolate(limit=1) + +By default, ``NaN`` values are filled in a ``forward`` direction. Use +``limit_direction`` parameter to fill ``backward`` or from ``both`` directions. - ser.interpolate(limit=1) # limit_direction == 'forward' +.. 
ipython:: python + # fill one consecutive value backwards ser.interpolate(limit=1, limit_direction='backward') + # fill one consecutive value in both directions ser.interpolate(limit=1, limit_direction='both') + # fill all consecutive values in both directions + ser.interpolate(limit_direction='both') + +By default, ``NaN`` values are filled whether they are inside (surrounded by) +existing valid values, or outside existing valid values. Introduced in v0.23 +the ``limit_area`` parameter restricts filling to either inside or outside values. + +.. ipython:: python + + # fill one consecutive inside value in both directions + ser.interpolate(limit_direction='both', limit_area='inside', limit=1) + + # fill all consecutive outside values backward + ser.interpolate(limit_direction='backward', limit_area='outside') + + # fill all consecutive outside values in both directions + ser.interpolate(limit_direction='both', limit_area='outside') + .. _missing_data.replace: Replacing Generic Values diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 6cbdc3be07f13..66e88e181ac0f 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -13,10 +13,38 @@ version. New features ~~~~~~~~~~~~ -- -- -- +.. _whatsnew_0210.enhancements.limit_area: + +``DataFrame.interpolate`` has gained the ``limit_area`` kwarg +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`DataFrame.interpolate` has gained a ``limit_area`` parameter to allow further control of which ``NaN`` s are replaced. +Use `limit_area='inside'` to fill only NaNs surrounded by valid values or use `limit_area='outside'` to fill only ``NaN`` s +outside the existing valid values while preserving those inside. (:issue:`16284`) See the :ref:`full documentation here `. + +.. ipython:: python + + ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13, np.nan, np.nan]) + ser + +Fill one consecutive inside value in both directions + +.. ipython:: python + + ser.interpolate(limit_direction='both', limit_area='inside', limit=1) + +Fill all consecutive outside values backward + +.. ipython:: python + + ser.interpolate(limit_direction='backward', limit_area='outside') + +Fill all consecutive outside values in both directions + +.. ipython:: python + + ser.interpolate(limit_direction='both', limit_area='outside') .. _whatsnew_0210.enhancements.get_dummies_dtype: @@ -207,6 +235,7 @@ Other Enhancements :func:`pandas.api.extensions.register_index_accessor`, accessor for libraries downstream of pandas to register custom accessors like ``.cat`` on pandas objects. See :ref:`Registering Custom Accessors ` for more (:issue:`14781`). + - ``IntervalIndex.astype`` now supports conversions between subtypes when passed an ``IntervalDtype`` (:issue:`19197`) - :class:`IntervalIndex` and its associated constructor methods (``from_arrays``, ``from_breaks``, ``from_tuples``) have gained a ``dtype`` parameter (:issue:`19262`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 48981a27f3c7e..d34a85b5b4388 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5085,6 +5085,12 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, limit : int, default None. Maximum number of consecutive NaNs to fill. Must be greater than 0. limit_direction : {'forward', 'backward', 'both'}, default 'forward' + limit_area : {'inside', 'outside'}, default None + * None: (default) no fill restriction + * 'inside' Only fill NaNs surrounded by valid values (interpolate). 
+ * 'outside' Only fill NaNs outside valid values (extrapolate). + .. versionadded:: 0.21.0 + If limit is specified, consecutive NaNs will be filled in this direction. inplace : bool, default False @@ -5118,7 +5124,8 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, @Appender(_shared_docs['interpolate'] % _shared_doc_kwargs) def interpolate(self, method='linear', axis=0, limit=None, inplace=False, - limit_direction='forward', downcast=None, **kwargs): + limit_direction='forward', limit_area=None, + downcast=None, **kwargs): """ Interpolate values according to different methods. """ @@ -5167,6 +5174,7 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, new_data = data.interpolate(method=method, axis=ax, index=index, values=_maybe_transposed_self, limit=limit, limit_direction=limit_direction, + limit_area=limit_area, inplace=inplace, downcast=downcast, **kwargs) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 22d38d3df071e..4b12d931ade35 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1073,8 +1073,8 @@ def coerce_to_target_dtype(self, other): def interpolate(self, method='pad', axis=0, index=None, values=None, inplace=False, limit=None, limit_direction='forward', - fill_value=None, coerce=False, downcast=None, mgr=None, - **kwargs): + limit_area=None, fill_value=None, coerce=False, + downcast=None, mgr=None, **kwargs): inplace = validate_bool_kwarg(inplace, 'inplace') @@ -1115,6 +1115,7 @@ def check_int_bool(self, inplace): return self._interpolate(method=m, index=index, values=values, axis=axis, limit=limit, limit_direction=limit_direction, + limit_area=limit_area, fill_value=fill_value, inplace=inplace, downcast=downcast, mgr=mgr, **kwargs) @@ -1148,8 +1149,8 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, def _interpolate(self, method=None, index=None, values=None, fill_value=None, axis=0, limit=None, - limit_direction='forward', inplace=False, downcast=None, - mgr=None, **kwargs): + limit_direction='forward', limit_area=None, + inplace=False, downcast=None, mgr=None, **kwargs): """ interpolate using scipy wrappers """ inplace = validate_bool_kwarg(inplace, 'inplace') @@ -1177,6 +1178,7 @@ def func(x): # i.e. not an arg to missing.interpolate_1d return missing.interpolate_1d(index, x, method=method, limit=limit, limit_direction=limit_direction, + limit_area=limit_area, fill_value=fill_value, bounds_error=False, **kwargs) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 74fa21fa4b53d..2eccc5777bca6 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -111,7 +111,7 @@ def clean_interp_method(method, **kwargs): def interpolate_1d(xvalues, yvalues, method='linear', limit=None, - limit_direction='forward', fill_value=None, + limit_direction='forward', limit_area=None, fill_value=None, bounds_error=False, order=None, **kwargs): """ Logic for the 1-d interpolation. The result should be 1-d, inputs @@ -151,28 +151,12 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, raise ValueError(msg.format(valid=valid_limit_directions, invalid=limit_direction)) - from pandas import Series - ys = Series(yvalues) - start_nans = set(range(ys.first_valid_index())) - end_nans = set(range(1 + ys.last_valid_index(), len(valid))) - - # violate_limit is a list of the indexes in the series whose yvalue is - # currently NaN, and should still be NaN after the interpolation. 
- # Specifically: - # - # If limit_direction='forward' or None then the list will contain NaNs at - # the beginning of the series, and NaNs that are more than 'limit' away - # from the prior non-NaN. - # - # If limit_direction='backward' then the list will contain NaNs at - # the end of the series, and NaNs that are more than 'limit' away - # from the subsequent non-NaN. - # - # If limit_direction='both' then the list will contain NaNs that - # are more than 'limit' away from any non-NaN. - # - # If limit=None, then use default behavior of filling an unlimited number - # of NaNs in the direction specified by limit_direction + if limit_area is not None: + valid_limit_areas = ['inside', 'outside'] + limit_area = limit_area.lower() + if limit_area not in valid_limit_areas: + raise ValueError('Invalid limit_area: expecting one of {}, got ' + '{}.'.format(valid_limit_areas, limit_area)) # default limit is unlimited GH #16282 if limit is None: @@ -183,22 +167,43 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, elif limit < 1: raise ValueError('Limit must be greater than 0') - # each possible limit_direction - # TODO: do we need sorted? - if limit_direction == 'forward' and limit is not None: - violate_limit = sorted(start_nans | - set(_interp_limit(invalid, limit, 0))) - elif limit_direction == 'forward': - violate_limit = sorted(start_nans) - elif limit_direction == 'backward' and limit is not None: - violate_limit = sorted(end_nans | - set(_interp_limit(invalid, 0, limit))) + from pandas import Series + ys = Series(yvalues) + + # These are sets of index pointers to invalid values... i.e. {0, 1, etc... + all_nans = set(np.flatnonzero(invalid)) + start_nans = set(range(ys.first_valid_index())) + end_nans = set(range(1 + ys.last_valid_index(), len(valid))) + mid_nans = all_nans - start_nans - end_nans + + # Like the sets above, preserve_nans contains indices of invalid values, + # but in this case, it is the final set of indices that need to be + # preserved as NaN after the interpolation. + + # For example if limit_direction='forward' then preserve_nans will + # contain indices of NaNs at the beginning of the series, and NaNs that + # are more than'limit' away from the prior non-NaN. + + # set preserve_nans based on direction using _interp_limit + if limit_direction == 'forward': + preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) elif limit_direction == 'backward': - violate_limit = sorted(end_nans) - elif limit_direction == 'both' and limit is not None: - violate_limit = sorted(_interp_limit(invalid, limit, limit)) + preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) else: - violate_limit = [] + # both directions... 
just use _interp_limit + preserve_nans = set(_interp_limit(invalid, limit, limit)) + + # if limit_area is set, add either mid or outside indices + # to preserve_nans GH #16284 + if limit_area == 'inside': + # preserve NaNs on the outside + preserve_nans |= start_nans | end_nans + elif limit_area == 'outside': + # preserve NaNs on the inside + preserve_nans |= mid_nans + + # sort preserve_nans and covert to list + preserve_nans = sorted(preserve_nans) xvalues = getattr(xvalues, 'values', xvalues) yvalues = getattr(yvalues, 'values', yvalues) @@ -215,7 +220,7 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, else: inds = xvalues result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid]) - result[violate_limit] = np.nan + result[preserve_nans] = np.nan return result sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic', @@ -234,7 +239,7 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, fill_value=fill_value, bounds_error=bounds_error, order=order, **kwargs) - result[violate_limit] = np.nan + result[preserve_nans] = np.nan return result @@ -646,8 +651,24 @@ def fill_zeros(result, x, y, name, fill): def _interp_limit(invalid, fw_limit, bw_limit): - """Get idx of values that won't be filled b/c they exceed the limits. + """ + Get indexers of values that won't be filled + because they exceed the limits. + + Parameters + ---------- + invalid : boolean ndarray + fw_limit : int or None + forward limit to index + bw_limit : int or None + backward limit to index + + Returns + ------- + set of indexers + Notes + ----- This is equivalent to the more readable, but slower .. code-block:: python @@ -660,6 +681,8 @@ def _interp_limit(invalid, fw_limit, bw_limit): # 1. operate on the reversed array # 2. subtract the returned indicies from N - 1 N = len(invalid) + f_idx = set() + b_idx = set() def inner(invalid, limit): limit = min(limit, N) @@ -668,18 +691,25 @@ def inner(invalid, limit): set(np.where((~invalid[:limit + 1]).cumsum() == 0)[0])) return idx - if fw_limit == 0: - f_idx = set(np.where(invalid)[0]) - else: - f_idx = inner(invalid, fw_limit) + if fw_limit is not None: - if bw_limit == 0: - # then we don't even need to care about backwards, just use forwards - return f_idx - else: - b_idx = set(N - 1 - np.asarray(list(inner(invalid[::-1], bw_limit)))) if fw_limit == 0: - return b_idx + f_idx = set(np.where(invalid)[0]) + else: + f_idx = inner(invalid, fw_limit) + + if bw_limit is not None: + + if bw_limit == 0: + # then we don't even need to care about backwards + # just use forwards + return f_idx + else: + b_idx = list(inner(invalid[::-1], bw_limit)) + b_idx = set(N - 1 - np.asarray(b_idx)) + if fw_limit == 0: + return b_idx + return f_idx & b_idx diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 961c8c004e9e3..df656092f476e 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -557,7 +557,8 @@ def fillna(self, method, limit=None): @Appender(_shared_docs['interpolate'] % _shared_docs_kwargs) def interpolate(self, method='linear', axis=0, limit=None, inplace=False, - limit_direction='forward', downcast=None, **kwargs): + limit_direction='forward', limit_area=None, + downcast=None, **kwargs): """ Interpolate values according to different methods. 
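To make the new keyword concrete, a short sketch of ``limit_area`` combined with ``limit_direction``, using the same series as the documentation examples above (assumes pandas >= 0.23)::

    import numpy as np
    import pandas as pd

    ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13,
                     np.nan, np.nan])

    # 'inside' fills only NaNs surrounded by valid values; with limit=1 one
    # value is filled from each side of the interior gap.
    ser.interpolate(limit_direction='both', limit_area='inside', limit=1)
    # [NaN, NaN, 5.0, 7.0, NaN, 11.0, 13.0, NaN, NaN]

    # 'outside' leaves the interior gap alone and only extends the edges.
    ser.interpolate(limit_direction='both', limit_area='outside')
    # [5.0, 5.0, 5.0, NaN, NaN, NaN, 13.0, 13.0, 13.0]
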
@@ -567,6 +568,7 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, return result.interpolate(method=method, axis=axis, limit=limit, inplace=inplace, limit_direction=limit_direction, + limit_area=limit_area, downcast=downcast, **kwargs) def asfreq(self, fill_value=None): diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 0dc5e23184af7..2bc44cb1c683f 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1079,6 +1079,45 @@ def test_interp_limit_bad_direction(self): pytest.raises(ValueError, s.interpolate, method='linear', limit_direction='abc') + # limit_area introduced GH #16284 + def test_interp_limit_area(self): + # These tests are for issue #9218 -- fill NaNs in both directions. + s = Series([nan, nan, 3, nan, nan, nan, 7, nan, nan]) + + expected = Series([nan, nan, 3., 4., 5., 6., 7., nan, nan]) + result = s.interpolate(method='linear', limit_area='inside') + assert_series_equal(result, expected) + + expected = Series([nan, nan, 3., 4., nan, nan, 7., nan, nan]) + result = s.interpolate(method='linear', limit_area='inside', + limit=1) + + expected = Series([nan, nan, 3., 4., nan, 6., 7., nan, nan]) + result = s.interpolate(method='linear', limit_area='inside', + limit_direction='both', limit=1) + assert_series_equal(result, expected) + + expected = Series([nan, nan, 3., nan, nan, nan, 7., 7., 7.]) + result = s.interpolate(method='linear', limit_area='outside') + assert_series_equal(result, expected) + + expected = Series([nan, nan, 3., nan, nan, nan, 7., 7., nan]) + result = s.interpolate(method='linear', limit_area='outside', + limit=1) + + expected = Series([nan, 3., 3., nan, nan, nan, 7., 7., nan]) + result = s.interpolate(method='linear', limit_area='outside', + limit_direction='both', limit=1) + assert_series_equal(result, expected) + + expected = Series([3., 3., 3., nan, nan, nan, 7., nan, nan]) + result = s.interpolate(method='linear', limit_area='outside', + direction='backward') + + # raises an error even if limit type is wrong. + pytest.raises(ValueError, s.interpolate, method='linear', + limit_area='abc') + def test_interp_limit_direction(self): # These tests are for issue #9218 -- fill NaNs in both directions. 
s = Series([1, 3, np.nan, np.nan, np.nan, 11]) From 78ba063f438924378f887d77fcc692186e6381a6 Mon Sep 17 00:00:00 2001 From: Matt Kirk Date: Fri, 2 Feb 2018 02:26:15 +0700 Subject: [PATCH 027/214] BUG: Fix problem with SparseDataFrame not persisting to csv (#19441) * BUG: Fix problem with SparseDataFrame not persisting to csv * FIX: Remove comment and move test with more coverage * FIX: Flake8 issues cleanup * Fix failing test due to blank lines * FIX: linting errors on whitespace * Use parametrize on test * Move bug description to sparse header * Add GH issue to test * Fix linting error --- doc/source/whatsnew/v0.23.0.txt | 3 +-- pandas/core/internals.py | 3 ++- pandas/tests/sparse/frame/test_to_csv.py | 20 ++++++++++++++++++++ 3 files changed, 23 insertions(+), 3 deletions(-) create mode 100644 pandas/tests/sparse/frame/test_to_csv.py diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 66e88e181ac0f..91362c7640575 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -529,7 +529,6 @@ I/O - Bug in :func:`DataFrame.to_parquet` where an exception was raised if the write destination is S3 (:issue:`19134`) - :class:`Interval` now supported in :func:`DataFrame.to_excel` for all Excel file types (:issue:`19242`) - :class:`Timedelta` now supported in :func:`DataFrame.to_excel` for xls file type (:issue:`19242`, :issue:`9155`) -- Plotting ^^^^^^^^ @@ -553,7 +552,7 @@ Sparse ^^^^^^ - Bug in which creating a ``SparseDataFrame`` from a dense ``Series`` or an unsupported type raised an uncontrolled exception (:issue:`19374`) -- +- Bug in :class:`SparseDataFrame.to_csv` causing exception (:issue:`19384`) - Reshaping diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 4b12d931ade35..52e8317f5209a 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -709,7 +709,8 @@ def to_native_types(self, slicer=None, na_rep='nan', quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ - values = self.values + values = self.get_values() + if slicer is not None: values = values[:, slicer] mask = isna(values) diff --git a/pandas/tests/sparse/frame/test_to_csv.py b/pandas/tests/sparse/frame/test_to_csv.py new file mode 100644 index 0000000000000..b0243dfde8d3f --- /dev/null +++ b/pandas/tests/sparse/frame/test_to_csv.py @@ -0,0 +1,20 @@ +import numpy as np +import pytest +from pandas import SparseDataFrame, read_csv +from pandas.util import testing as tm + + +class TestSparseDataFrameToCsv(object): + fill_values = [np.nan, 0, None, 1] + + @pytest.mark.parametrize('fill_value', fill_values) + def test_to_csv_sparse_dataframe(self, fill_value): + # GH19384 + sdf = SparseDataFrame({'a': type(self).fill_values}, + default_fill_value=fill_value) + + with tm.ensure_clean('sparse_df.csv') as path: + sdf.to_csv(path, index=False) + df = read_csv(path, skip_blank_lines=False) + + tm.assert_sp_frame_equal(df.to_sparse(fill_value=fill_value), sdf) From 6670dfcbbf95492cea0439a00e34d499298ce343 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 2 Feb 2018 05:03:29 -0600 Subject: [PATCH 028/214] Added E741 to flake8 config (#19496) --- setup.cfg | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 828ef80971f7b..942b2b0a1a0bf 100644 --- a/setup.cfg +++ b/setup.cfg @@ -12,7 +12,11 @@ tag_prefix = v parentdir_prefix = pandas- [flake8] -ignore = E731,E402,W503 +ignore = + E402, # module level import not at top of file + E731, # do not assign a lambda 
expression, use a def + E741, # do not use variables named 'l', 'O', or 'I' + W503 # line break before binary operator max-line-length = 79 [yapf] From f6b260bb8e048dc2efcedd76df5b878ba67fd43e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 2 Feb 2018 03:05:30 -0800 Subject: [PATCH 029/214] implement timedeltas.test_scalar_compat (#19503) --- .../indexes/timedeltas/test_arithmetic.py | 56 ++++++++++++++++- pandas/tests/indexes/timedeltas/test_ops.py | 46 +------------- .../indexes/timedeltas/test_scalar_compat.py | 63 +++++++++++++++++++ .../indexes/timedeltas/test_timedelta.py | 49 +-------------- 4 files changed, 120 insertions(+), 94 deletions(-) create mode 100644 pandas/tests/indexes/timedeltas/test_scalar_compat.py diff --git a/pandas/tests/indexes/timedeltas/test_arithmetic.py b/pandas/tests/indexes/timedeltas/test_arithmetic.py index ef6523a9eb270..3dc60ed33b958 100644 --- a/pandas/tests/indexes/timedeltas/test_arithmetic.py +++ b/pandas/tests/indexes/timedeltas/test_arithmetic.py @@ -10,7 +10,7 @@ to_timedelta, timedelta_range, date_range, Series, Timestamp, Timedelta) -from pandas.errors import PerformanceWarning +from pandas.errors import PerformanceWarning, NullFrequencyError @pytest.fixture(params=[pd.offsets.Hour(2), timedelta(hours=2), @@ -138,6 +138,60 @@ def test_tdi_add_str_invalid(self): with pytest.raises(TypeError): 'a' + tdi + # ------------------------------------------------------------- + # TimedeltaIndex.shift is used by __add__/__sub__ + + def test_tdi_shift_empty(self): + # GH#9903 + idx = pd.TimedeltaIndex([], name='xxx') + tm.assert_index_equal(idx.shift(0, freq='H'), idx) + tm.assert_index_equal(idx.shift(3, freq='H'), idx) + + def test_tdi_shift_hours(self): + # GH#9903 + idx = pd.TimedeltaIndex(['5 hours', '6 hours', '9 hours'], name='xxx') + tm.assert_index_equal(idx.shift(0, freq='H'), idx) + exp = pd.TimedeltaIndex(['8 hours', '9 hours', '12 hours'], name='xxx') + tm.assert_index_equal(idx.shift(3, freq='H'), exp) + exp = pd.TimedeltaIndex(['2 hours', '3 hours', '6 hours'], name='xxx') + tm.assert_index_equal(idx.shift(-3, freq='H'), exp) + + def test_tdi_shift_minutes(self): + # GH#9903 + idx = pd.TimedeltaIndex(['5 hours', '6 hours', '9 hours'], name='xxx') + tm.assert_index_equal(idx.shift(0, freq='T'), idx) + exp = pd.TimedeltaIndex(['05:03:00', '06:03:00', '9:03:00'], + name='xxx') + tm.assert_index_equal(idx.shift(3, freq='T'), exp) + exp = pd.TimedeltaIndex(['04:57:00', '05:57:00', '8:57:00'], + name='xxx') + tm.assert_index_equal(idx.shift(-3, freq='T'), exp) + + def test_tdi_shift_int(self): + # GH#8083 + trange = pd.to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) + result = trange.shift(1) + expected = TimedeltaIndex(['1 days 01:00:00', '2 days 01:00:00', + '3 days 01:00:00', + '4 days 01:00:00', '5 days 01:00:00'], + freq='D') + tm.assert_index_equal(result, expected) + + def test_tdi_shift_nonstandard_freq(self): + # GH#8083 + trange = pd.to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) + result = trange.shift(3, freq='2D 1s') + expected = TimedeltaIndex(['6 days 01:00:03', '7 days 01:00:03', + '8 days 01:00:03', '9 days 01:00:03', + '10 days 01:00:03'], freq='D') + tm.assert_index_equal(result, expected) + + def test_shift_no_freq(self): + # GH#19147 + tdi = TimedeltaIndex(['1 days 01:00:00', '2 days 01:00:00'], freq=None) + with pytest.raises(NullFrequencyError): + tdi.shift(2) + # ------------------------------------------------------------- @pytest.mark.parametrize('box', [np.array, pd.Index]) diff --git 
a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index 112c62b7e2f8d..e944aad13f8d5 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -98,32 +98,6 @@ def test_numpy_minmax(self): tm.assert_raises_regex( ValueError, errmsg, np.argmax, td, out=0) - def test_round(self): - td = pd.timedelta_range(start='16801 days', periods=5, freq='30Min') - elt = td[1] - - expected_rng = TimedeltaIndex([ - Timedelta('16801 days 00:00:00'), - Timedelta('16801 days 00:00:00'), - Timedelta('16801 days 01:00:00'), - Timedelta('16801 days 02:00:00'), - Timedelta('16801 days 02:00:00'), - ]) - expected_elt = expected_rng[1] - - tm.assert_index_equal(td.round(freq='H'), expected_rng) - assert elt.round(freq='H') == expected_elt - - msg = pd._libs.tslibs.frequencies._INVALID_FREQ_ERROR - with tm.assert_raises_regex(ValueError, msg): - td.round(freq='foo') - with tm.assert_raises_regex(ValueError, msg): - elt.round(freq='foo') - - msg = " is a non-fixed frequency" - tm.assert_raises_regex(ValueError, msg, td.round, freq='M') - tm.assert_raises_regex(ValueError, msg, elt.round, freq='M') - def test_representation(self): idx1 = TimedeltaIndex([], freq='D') idx2 = TimedeltaIndex(['1 days'], freq='D') @@ -387,25 +361,7 @@ def test_nat_new(self): tm.assert_numpy_array_equal(result, exp) def test_shift(self): - # GH 9903 - idx = pd.TimedeltaIndex([], name='xxx') - tm.assert_index_equal(idx.shift(0, freq='H'), idx) - tm.assert_index_equal(idx.shift(3, freq='H'), idx) - - idx = pd.TimedeltaIndex(['5 hours', '6 hours', '9 hours'], name='xxx') - tm.assert_index_equal(idx.shift(0, freq='H'), idx) - exp = pd.TimedeltaIndex(['8 hours', '9 hours', '12 hours'], name='xxx') - tm.assert_index_equal(idx.shift(3, freq='H'), exp) - exp = pd.TimedeltaIndex(['2 hours', '3 hours', '6 hours'], name='xxx') - tm.assert_index_equal(idx.shift(-3, freq='H'), exp) - - tm.assert_index_equal(idx.shift(0, freq='T'), idx) - exp = pd.TimedeltaIndex(['05:03:00', '06:03:00', '9:03:00'], - name='xxx') - tm.assert_index_equal(idx.shift(3, freq='T'), exp) - exp = pd.TimedeltaIndex(['04:57:00', '05:57:00', '8:57:00'], - name='xxx') - tm.assert_index_equal(idx.shift(-3, freq='T'), exp) + pass # handled in test_arithmetic.py def test_repeat(self): index = pd.timedelta_range('1 days', periods=2, freq='D') diff --git a/pandas/tests/indexes/timedeltas/test_scalar_compat.py b/pandas/tests/indexes/timedeltas/test_scalar_compat.py new file mode 100644 index 0000000000000..7d97e1fadea30 --- /dev/null +++ b/pandas/tests/indexes/timedeltas/test_scalar_compat.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- +""" +Tests for TimedeltaIndex methods behaving like their Timedelta counterparts +""" + +import numpy as np + +import pandas as pd +import pandas.util.testing as tm +from pandas import timedelta_range, Timedelta, TimedeltaIndex, Index, Series + + +class TestVectorizedTimedelta(object): + def test_tdi_total_seconds(self): + # GH#10939 + # test index + rng = timedelta_range('1 days, 10:11:12.100123456', periods=2, + freq='s') + expt = [1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456. / 1e9, + 1 * 86400 + 10 * 3600 + 11 * 60 + 13 + 100123456. / 1e9] + tm.assert_almost_equal(rng.total_seconds(), Index(expt)) + + # test Series + ser = Series(rng) + s_expt = Series(expt, index=[0, 1]) + tm.assert_series_equal(ser.dt.total_seconds(), s_expt) + + # with nat + ser[1] = np.nan + s_expt = Series([1 * 86400 + 10 * 3600 + 11 * 60 + + 12 + 100123456. 
/ 1e9, np.nan], index=[0, 1]) + tm.assert_series_equal(ser.dt.total_seconds(), s_expt) + + # with both nat + ser = Series([np.nan, np.nan], dtype='timedelta64[ns]') + tm.assert_series_equal(ser.dt.total_seconds(), + Series([np.nan, np.nan], index=[0, 1])) + + def test_tdi_round(self): + td = pd.timedelta_range(start='16801 days', periods=5, freq='30Min') + elt = td[1] + + expected_rng = TimedeltaIndex([Timedelta('16801 days 00:00:00'), + Timedelta('16801 days 00:00:00'), + Timedelta('16801 days 01:00:00'), + Timedelta('16801 days 02:00:00'), + Timedelta('16801 days 02:00:00')]) + expected_elt = expected_rng[1] + + tm.assert_index_equal(td.round(freq='H'), expected_rng) + assert elt.round(freq='H') == expected_elt + + msg = pd._libs.tslibs.frequencies._INVALID_FREQ_ERROR + with tm.assert_raises_regex(ValueError, msg): + td.round(freq='foo') + with tm.assert_raises_regex(ValueError, msg): + elt.round(freq='foo') + + msg = " is a non-fixed frequency" + with tm.assert_raises_regex(ValueError, msg): + td.round(freq='M') + with tm.assert_raises_regex(ValueError, msg): + elt.round(freq='M') diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 1af971e8a4326..32157a9a44e04 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -4,7 +4,6 @@ from datetime import timedelta import pandas as pd -from pandas.errors import NullFrequencyError import pandas.util.testing as tm from pandas import (timedelta_range, date_range, Series, Timedelta, TimedeltaIndex, Index, DataFrame, @@ -34,28 +33,7 @@ def test_numeric_compat(self): pass def test_shift(self): - # test shift for TimedeltaIndex - # err8083 - - drange = self.create_index() - result = drange.shift(1) - expected = TimedeltaIndex(['1 days 01:00:00', '2 days 01:00:00', - '3 days 01:00:00', - '4 days 01:00:00', '5 days 01:00:00'], - freq='D') - tm.assert_index_equal(result, expected) - - result = drange.shift(3, freq='2D 1s') - expected = TimedeltaIndex(['6 days 01:00:03', '7 days 01:00:03', - '8 days 01:00:03', '9 days 01:00:03', - '10 days 01:00:03'], freq='D') - tm.assert_index_equal(result, expected) - - def test_shift_no_freq(self): - # GH#19147 - tdi = TimedeltaIndex(['1 days 01:00:00', '2 days 01:00:00'], freq=None) - with pytest.raises(NullFrequencyError): - tdi.shift(2) + pass # this is handled in test_arithmetic.py def test_pickle_compat_construction(self): pass @@ -203,31 +181,6 @@ def test_map(self): exp = Int64Index([f(x) for x in rng]) tm.assert_index_equal(result, exp) - def test_total_seconds(self): - # GH 10939 - # test index - rng = timedelta_range('1 days, 10:11:12.100123456', periods=2, - freq='s') - expt = [1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456. / 1e9, - 1 * 86400 + 10 * 3600 + 11 * 60 + 13 + 100123456. / 1e9] - tm.assert_almost_equal(rng.total_seconds(), Index(expt)) - - # test Series - s = Series(rng) - s_expt = Series(expt, index=[0, 1]) - tm.assert_series_equal(s.dt.total_seconds(), s_expt) - - # with nat - s[1] = np.nan - s_expt = Series([1 * 86400 + 10 * 3600 + 11 * 60 + - 12 + 100123456. 
/ 1e9, np.nan], index=[0, 1]) - tm.assert_series_equal(s.dt.total_seconds(), s_expt) - - # with both nat - s = Series([np.nan, np.nan], dtype='timedelta64[ns]') - tm.assert_series_equal(s.dt.total_seconds(), - Series([np.nan, np.nan], index=[0, 1])) - def test_pass_TimedeltaIndex_to_index(self): rng = timedelta_range('1 days', '10 days') From 7db4beae22562d9845d74971d93ffea3910d8a60 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 2 Feb 2018 03:29:53 -0800 Subject: [PATCH 030/214] Continue de-nesting core.ops (#19448) --- pandas/core/ops.py | 142 ++++++++++++++++++----------------- pandas/core/sparse/series.py | 2 +- 2 files changed, 74 insertions(+), 70 deletions(-) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index ba8a15b60ba56..6ea4a81cb52a1 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -39,8 +39,7 @@ ABCSeries, ABCDataFrame, ABCIndex, - ABCPeriodIndex, - ABCSparseSeries) + ABCSparseSeries, ABCSparseArray) def _gen_eval_kwargs(name): @@ -445,8 +444,14 @@ def names(x): return new_methods -def add_methods(cls, new_methods, force): +def add_methods(cls, new_methods): for name, method in new_methods.items(): + # For most methods, if we find that the class already has a method + # of the same name, it is OK to over-write it. The exception is + # inplace methods (__iadd__, __isub__, ...) for SparseArray, which + # retain the np.ndarray versions. + force = not (issubclass(cls, ABCSparseArray) and + name.startswith('__i')) if force or name not in cls.__dict__: bind_method(cls, name, method) @@ -454,8 +459,7 @@ def add_methods(cls, new_methods, force): # ---------------------------------------------------------------------- # Arithmetic def add_special_arithmetic_methods(cls, arith_method=None, - comp_method=None, bool_method=None, - force=False): + comp_method=None, bool_method=None): """ Adds the full suite of special arithmetic methods (``__add__``, ``__sub__``, etc.) to the class. @@ -469,9 +473,6 @@ def add_special_arithmetic_methods(cls, arith_method=None, factory for rich comparison - signature: f(op, name, str_rep) bool_method : function (optional) factory for boolean methods - signature: f(op, name, str_rep) - force : bool, default False - if False, checks whether function is defined **on ``cls.__dict__``** - before defining if True, always defines functions on class base """ new_methods = _create_methods(cls, arith_method, comp_method, bool_method, special=True) @@ -512,12 +513,11 @@ def f(self, other): __ior__=_wrap_inplace_method(new_methods["__or__"]), __ixor__=_wrap_inplace_method(new_methods["__xor__"]))) - add_methods(cls, new_methods=new_methods, force=force) + add_methods(cls, new_methods=new_methods) def add_flex_arithmetic_methods(cls, flex_arith_method, - flex_comp_method=None, flex_bool_method=None, - force=False): + flex_comp_method=None, flex_bool_method=None): """ Adds the full suite of flex arithmetic methods (``pow``, ``mul``, ``add``) to the class. 
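The "flex" wrappers attached here are the named binary methods (``Series.add``, ``Series.sub``, ``Series.mul``, ...) rather than the dunder operators. A brief sketch of what they offer over the plain operators; this is standard pandas behaviour, not anything specific to this diff::

    import pandas as pd

    a = pd.Series([1.0, 2.0], index=['x', 'y'])
    b = pd.Series([10.0, 20.0], index=['y', 'z'])

    # The operator aligns on the union of labels and leaves NaN where only
    # one operand has a value ...
    a + b                    # x: NaN,  y: 12.0, z: NaN

    # ... while the flex method exposes fill_value for those gaps.
    a.add(b, fill_value=0)   # x: 1.0,  y: 12.0, z: 20.0
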
@@ -529,9 +529,6 @@ def add_flex_arithmetic_methods(cls, flex_arith_method, f(op, name, str_rep) flex_comp_method : function, optional, factory for rich comparison - signature: f(op, name, str_rep) - force : bool, default False - if False, checks whether function is defined **on ``cls.__dict__``** - before defining if True, always defines functions on class base """ new_methods = _create_methods(cls, flex_arith_method, flex_comp_method, flex_bool_method, @@ -544,7 +541,7 @@ def add_flex_arithmetic_methods(cls, flex_arith_method, if k in new_methods: new_methods.pop(k) - add_methods(cls, new_methods=new_methods, force=force) + add_methods(cls, new_methods=new_methods) # ----------------------------------------------------------------------------- @@ -614,14 +611,11 @@ def na_op(x, y): result = np.empty(x.size, dtype=dtype) mask = notna(x) & notna(y) result[mask] = op(x[mask], com._values_from_object(y[mask])) - elif isinstance(x, np.ndarray): + else: + assert isinstance(x, np.ndarray) result = np.empty(len(x), dtype=x.dtype) mask = notna(x) result[mask] = op(x[mask], y) - else: - raise TypeError("{typ} cannot perform the operation " - "{op}".format(typ=type(x).__name__, - op=str_rep)) result, changed = maybe_upcast_putmask(result, ~mask, np.nan) @@ -658,6 +652,10 @@ def wrapper(left, right, name=name, na_op=na_op): index=left.index, name=res_name, dtype=result.dtype) + elif is_categorical_dtype(left): + raise TypeError("{typ} cannot perform the operation " + "{op}".format(typ=type(left).__name__, op=str_rep)) + lvalues = left.values rvalues = right if isinstance(rvalues, ABCSeries): @@ -745,8 +743,12 @@ def na_op(x, y): elif is_categorical_dtype(y) and not is_scalar(y): return op(y, x) - if is_object_dtype(x.dtype): + elif is_object_dtype(x.dtype): result = _comp_method_OBJECT_ARRAY(op, x, y) + + elif is_datetimelike_v_numeric(x, y): + raise TypeError("invalid type comparison") + else: # we want to compare like types @@ -754,15 +756,6 @@ def na_op(x, y): # we are not NotImplemented, otherwise # we would allow datetime64 (but viewed as i8) against # integer comparisons - if is_datetimelike_v_numeric(x, y): - raise TypeError("invalid type comparison") - - # numpy does not like comparisons vs None - if is_scalar(y) and isna(y): - if name == '__ne__': - return np.ones(len(x), dtype=bool) - else: - return np.zeros(len(x), dtype=bool) # we have a datetime/timedelta and may need to convert mask = None @@ -795,15 +788,18 @@ def wrapper(self, other, axis=None): if axis is not None: self._get_axis_number(axis) - if isinstance(other, ABCSeries): + if isinstance(other, ABCDataFrame): # pragma: no cover + # Defer to DataFrame implementation; fail early + return NotImplemented + + elif isinstance(other, ABCSeries): name = com._maybe_match_name(self, other) if not self._indexed_same(other): msg = 'Can only compare identically-labeled Series objects' raise ValueError(msg) - return self._constructor(na_op(self.values, other.values), - index=self.index, name=name) - elif isinstance(other, ABCDataFrame): # pragma: no cover - return NotImplemented + res_values = na_op(self.values, other.values) + return self._constructor(res_values, index=self.index, name=name) + elif isinstance(other, (np.ndarray, pd.Index)): # do not check length of zerodim array # as it will broadcast @@ -811,23 +807,25 @@ def wrapper(self, other, axis=None): len(self) != len(other)): raise ValueError('Lengths must match to compare') - if isinstance(other, ABCPeriodIndex): - # temp workaround until fixing GH 13637 - # tested in 
test_nat_comparisons - # (pandas.tests.series.test_operators.TestSeriesOperators) - return self._constructor(na_op(self.values, - other.astype(object).values), - index=self.index) - - return self._constructor(na_op(self.values, np.asarray(other)), + res_values = na_op(self.values, np.asarray(other)) + return self._constructor(res_values, index=self.index).__finalize__(self) - elif isinstance(other, pd.Categorical): - if not is_categorical_dtype(self): - msg = ("Cannot compare a Categorical for op {op} with Series " - "of dtype {typ}.\nIf you want to compare values, use " - "'series np.asarray(other)'.") - raise TypeError(msg.format(op=op, typ=self.dtype)) + elif (isinstance(other, pd.Categorical) and + not is_categorical_dtype(self)): + raise TypeError("Cannot compare a Categorical for op {op} with " + "Series of dtype {typ}.\nIf you want to compare " + "values, use 'series np.asarray(other)'." + .format(op=op, typ=self.dtype)) + + elif is_scalar(other) and isna(other): + # numpy does not like comparisons vs None + if op is operator.ne: + res_values = np.ones(len(self), dtype=bool) + else: + res_values = np.zeros(len(self), dtype=bool) + return self._constructor(res_values, index=self.index, + name=self.name, dtype='bool') if is_categorical_dtype(self): # cats are a special case as get_values() would return an ndarray, @@ -877,11 +875,10 @@ def na_op(x, y): y = _ensure_object(y) result = lib.vec_binop(x, y, op) else: + # let null fall thru + if not isna(y): + y = bool(y) try: - - # let null fall thru - if not isna(y): - y = bool(y) result = lib.scalar_binop(x, y, op) except: msg = ("cannot compare a dtyped [{dtype}] array " @@ -899,26 +896,31 @@ def wrapper(self, other): self, other = _align_method_SERIES(self, other, align_asobject=True) - if isinstance(other, ABCSeries): + if isinstance(other, ABCDataFrame): + # Defer to DataFrame implementation; fail early + return NotImplemented + + elif isinstance(other, ABCSeries): name = com._maybe_match_name(self, other) is_other_int_dtype = is_integer_dtype(other.dtype) other = fill_int(other) if is_other_int_dtype else fill_bool(other) filler = (fill_int if is_self_int_dtype and is_other_int_dtype else fill_bool) - return filler(self._constructor(na_op(self.values, other.values), - index=self.index, name=name)) - elif isinstance(other, ABCDataFrame): - return NotImplemented + res_values = na_op(self.values, other.values) + unfilled = self._constructor(res_values, + index=self.index, name=name) + return filler(unfilled) else: # scalars, list, tuple, np.array filler = (fill_int if is_self_int_dtype and is_integer_dtype(np.asarray(other)) else fill_bool) - return filler(self._constructor( - na_op(self.values, other), - index=self.index)).__finalize__(self) + + res_values = na_op(self.values, other) + unfilled = self._constructor(res_values, index=self.index) + return filler(unfilled).__finalize__(self) return wrapper @@ -1023,21 +1025,23 @@ def na_op(x, y): mask = notna(xrav) & notna(yrav) xrav = xrav[mask] - # we may need to manually - # broadcast a 1 element array if yrav.shape != mask.shape: - yrav = np.empty(mask.shape, dtype=yrav.dtype) - yrav.fill(yrav.item()) + # FIXME: GH#5284, GH#5035, GH#19448 + # Without specifically raising here we get mismatched + # errors in Py3 (TypeError) vs Py2 (ValueError) + raise ValueError('Cannot broadcast operands together.') yrav = yrav[mask] - if np.prod(xrav.shape) and np.prod(yrav.shape): + if xrav.size: with np.errstate(all='ignore'): result[mask] = op(xrav, yrav) - elif hasattr(x, 'size'): + + elif 
isinstance(x, np.ndarray): + # mask is only meaningful for x result = np.empty(x.size, dtype=x.dtype) mask = notna(xrav) xrav = xrav[mask] - if np.prod(xrav.shape): + if xrav.size: with np.errstate(all='ignore'): result[mask] = op(xrav, y) else: diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 1c23527cf57c4..62a467bec2683 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -819,4 +819,4 @@ def from_coo(cls, A, dense_index=False): ops.add_special_arithmetic_methods(SparseSeries, ops._arith_method_SPARSE_SERIES, comp_method=ops._arith_method_SPARSE_SERIES, - bool_method=None, force=True) + bool_method=None) From 601b8c9c45b3cb06ee4ceaf34456bbfd3f5e5d1d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 2 Feb 2018 03:32:49 -0800 Subject: [PATCH 031/214] Make DateOffset.kwds a property (#19403) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/_libs/tslibs/offsets.pyx | 8 ++ pandas/core/indexes/datetimes.py | 8 +- pandas/tests/tseries/offsets/test_offsets.py | 2 +- pandas/tseries/offsets.py | 131 +++++++++---------- 5 files changed, 78 insertions(+), 72 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 91362c7640575..818b17baa38aa 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -341,6 +341,7 @@ Other API Changes - :func:`DatetimeIndex.shift` and :func:`TimedeltaIndex.shift` will now raise ``NullFrequencyError`` (which subclasses ``ValueError``, which was raised in older versions) when the index object frequency is ``None`` (:issue:`19147`) - Addition and subtraction of ``NaN`` from a :class:`Series` with ``dtype='timedelta64[ns]'`` will raise a ``TypeError` instead of treating the ``NaN`` as ``NaT`` (:issue:`19274`) - Set operations (union, difference...) on :class:`IntervalIndex` with incompatible index types will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`19329`) +- :class:`DateOffset` objects render more simply, e.g. "" instead of "" (:issue:`19403`) .. 
_whatsnew_0230.deprecations: diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index e02818dd818df..8caf9ea0e0389 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -302,6 +302,14 @@ class _BaseOffset(object): _normalize_cache = True _cacheable = False _day_opt = None + _attributes = frozenset(['n', 'normalize']) + + @property + def kwds(self): + # for backwards-compatibility + kwds = {name: getattr(self, name, None) for name in self._attributes + if name not in ['n', 'normalize']} + return {name: kwds[name] for name in kwds if kwds[name] is not None} def __call__(self, other): return self.apply(other) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 8dd41c022d163..76219a07f4943 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -71,9 +71,11 @@ def f(self): if field in ['is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end', 'is_year_start', 'is_year_end']: - month_kw = (self.freq.kwds.get('startingMonth', - self.freq.kwds.get('month', 12)) - if self.freq else 12) + freq = self.freq + month_kw = 12 + if freq: + kwds = freq.kwds + month_kw = kwds.get('startingMonth', kwds.get('month', 12)) result = fields.get_start_end_field(values, field, self.freqstr, month_kw) diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index b086884ecd250..d96ebab615d12 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -218,7 +218,7 @@ def test_offset_freqstr(self, offset_types): freqstr = offset.freqstr if freqstr not in ('', - "", + "", 'LWOM-SAT', ): code = get_offset(freqstr) assert offset.rule_code == code diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index ec206e0997d0b..2e4be7fbdeebf 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -185,6 +185,8 @@ def __add__(date): """ _use_relativedelta = False _adjust_dst = False + _attributes = frozenset(['n', 'normalize'] + + list(liboffsets.relativedelta_kwds)) # default for prior pickles normalize = False @@ -192,9 +194,9 @@ def __add__(date): def __init__(self, n=1, normalize=False, **kwds): self.n = self._validate_n(n) self.normalize = normalize - self.kwds = kwds self._offset, self._use_relativedelta = _determine_offset(kwds) + self.__dict__.update(kwds) @apply_wraps def apply(self, other): @@ -238,30 +240,31 @@ def apply_index(self, i): y : DatetimeIndex """ - if not type(self) is DateOffset: + if type(self) is not DateOffset: raise NotImplementedError("DateOffset subclass {name} " "does not have a vectorized " "implementation".format( name=self.__class__.__name__)) + kwds = self.kwds relativedelta_fast = set(['years', 'months', 'weeks', 'days', 'hours', 'minutes', 'seconds', 'microseconds']) # relativedelta/_offset path only valid for base DateOffset if (self._use_relativedelta and - set(self.kwds).issubset(relativedelta_fast)): + set(kwds).issubset(relativedelta_fast)): - months = ((self.kwds.get('years', 0) * 12 + - self.kwds.get('months', 0)) * self.n) + months = ((kwds.get('years', 0) * 12 + + kwds.get('months', 0)) * self.n) if months: shifted = liboffsets.shift_months(i.asi8, months) i = i._shallow_copy(shifted) - weeks = (self.kwds.get('weeks', 0)) * self.n + weeks = (kwds.get('weeks', 0)) * self.n if weeks: i = (i.to_period('W') + weeks).to_timestamp() + \ i.to_perioddelta('W') - timedelta_kwds = {k: v for k, v in self.kwds.items() + 
timedelta_kwds = {k: v for k, v in kwds.items() if k in ['days', 'hours', 'minutes', 'seconds', 'microseconds']} if timedelta_kwds: @@ -273,7 +276,7 @@ def apply_index(self, i): return i + (self._offset * self.n) else: # relativedelta with other keywords - kwd = set(self.kwds) - relativedelta_fast + kwd = set(kwds) - relativedelta_fast raise NotImplementedError("DateOffset with relativedelta " "keyword(s) {kwd} not able to be " "applied vectorized".format(kwd=kwd)) @@ -284,7 +287,7 @@ def isAnchored(self): return (self.n == 1) def _params(self): - all_paras = dict(list(vars(self).items()) + list(self.kwds.items())) + all_paras = self.__dict__.copy() if 'holidays' in all_paras and not all_paras['holidays']: all_paras.pop('holidays') exclude = ['kwds', 'name', 'normalize', 'calendar'] @@ -301,15 +304,8 @@ def _repr_attrs(self): exclude = set(['n', 'inc', 'normalize']) attrs = [] for attr in sorted(self.__dict__): - if attr.startswith('_'): + if attr.startswith('_') or attr == 'kwds': continue - elif attr == 'kwds': # TODO: get rid of this - kwds_new = {} - for key in self.kwds: - if not hasattr(self, key): - kwds_new[key] = self.kwds[key] - if len(kwds_new) > 0: - attrs.append('kwds={kwds_new}'.format(kwds_new=kwds_new)) elif attr not in exclude: value = getattr(self, attr) attrs.append('{attr}={value}'.format(attr=attr, value=value)) @@ -427,6 +423,30 @@ def _offset_str(self): def nanos(self): raise ValueError("{name} is a non-fixed frequency".format(name=self)) + def __setstate__(self, state): + """Reconstruct an instance from a pickled state""" + if 'offset' in state: + # Older (<0.22.0) versions have offset attribute instead of _offset + if '_offset' in state: # pragma: no cover + raise AssertionError('Unexpected key `_offset`') + state['_offset'] = state.pop('offset') + state['kwds']['offset'] = state['_offset'] + + if '_offset' in state and not isinstance(state['_offset'], timedelta): + # relativedelta, we need to populate using its kwds + offset = state['_offset'] + odict = offset.__dict__ + kwds = {key: odict[key] for key in odict if odict[key]} + state.update(kwds) + + self.__dict__ = state + if 'weekmask' in state and 'holidays' in state: + calendar, holidays = _get_calendar(weekmask=self.weekmask, + holidays=self.holidays, + calendar=None) + self.calendar = calendar + self.holidays = holidays + class SingleConstructorOffset(DateOffset): @classmethod @@ -450,10 +470,9 @@ def __init__(self, weekmask, holidays, calendar): # following two attributes. 
See DateOffset._params() # holidays, weekmask - # assumes self.kwds already exists - self.kwds['weekmask'] = self.weekmask = weekmask - self.kwds['holidays'] = self.holidays = holidays - self.kwds['calendar'] = self.calendar = calendar + self.weekmask = weekmask + self.holidays = holidays + self.calendar = calendar class BusinessMixin(object): @@ -490,23 +509,6 @@ def __getstate__(self): return state - def __setstate__(self, state): - """Reconstruct an instance from a pickled state""" - if 'offset' in state: - # Older versions have offset attribute instead of _offset - if '_offset' in state: # pragma: no cover - raise ValueError('Unexpected key `_offset`') - state['_offset'] = state.pop('offset') - state['kwds']['offset'] = state['_offset'] - self.__dict__ = state - if 'weekmask' in state and 'holidays' in state: - calendar, holidays = _get_calendar(weekmask=self.weekmask, - holidays=self.holidays, - calendar=None) - self.kwds['calendar'] = self.calendar = calendar - self.kwds['holidays'] = self.holidays = holidays - self.kwds['weekmask'] = state['weekmask'] - class BusinessDay(BusinessMixin, SingleConstructorOffset): """ @@ -514,11 +516,11 @@ class BusinessDay(BusinessMixin, SingleConstructorOffset): """ _prefix = 'B' _adjust_dst = True + _attributes = frozenset(['n', 'normalize', 'offset']) def __init__(self, n=1, normalize=False, offset=timedelta(0)): self.n = self._validate_n(n) self.normalize = normalize - self.kwds = {'offset': offset} self._offset = offset def _offset_str(self): @@ -615,10 +617,8 @@ class BusinessHourMixin(BusinessMixin): def __init__(self, start='09:00', end='17:00', offset=timedelta(0)): # must be validated here to equality check - kwds = {'offset': offset} - self.start = kwds['start'] = liboffsets._validate_business_time(start) - self.end = kwds['end'] = liboffsets._validate_business_time(end) - self.kwds.update(kwds) + self.start = liboffsets._validate_business_time(start) + self.end = liboffsets._validate_business_time(end) self._offset = offset @cache_readonly @@ -843,12 +843,12 @@ class BusinessHour(BusinessHourMixin, SingleConstructorOffset): """ _prefix = 'BH' _anchor = 0 + _attributes = frozenset(['n', 'normalize', 'start', 'end', 'offset']) def __init__(self, n=1, normalize=False, start='09:00', end='17:00', offset=timedelta(0)): self.n = self._validate_n(n) self.normalize = normalize - self.kwds = {} super(BusinessHour, self).__init__(start=start, end=end, offset=offset) @@ -872,13 +872,14 @@ class CustomBusinessDay(_CustomMixin, BusinessDay): """ _cacheable = False _prefix = 'C' + _attributes = frozenset(['n', 'normalize', + 'weekmask', 'holidays', 'calendar', 'offset']) def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', holidays=None, calendar=None, offset=timedelta(0)): self.n = self._validate_n(n) self.normalize = normalize self._offset = offset - self.kwds = {'offset': offset} _CustomMixin.__init__(self, weekmask, holidays, calendar) @@ -930,6 +931,9 @@ class CustomBusinessHour(_CustomMixin, BusinessHourMixin, """ _prefix = 'CBH' _anchor = 0 + _attributes = frozenset(['n', 'normalize', + 'weekmask', 'holidays', 'calendar', + 'start', 'end', 'offset']) def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', holidays=None, calendar=None, @@ -937,7 +941,6 @@ def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', self.n = self._validate_n(n) self.normalize = normalize self._offset = offset - self.kwds = {'offset': offset} _CustomMixin.__init__(self, weekmask, holidays, calendar) 
BusinessHourMixin.__init__(self, start=start, end=end, offset=offset) @@ -949,11 +952,11 @@ def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', class MonthOffset(SingleConstructorOffset): _adjust_dst = True + _attributes = frozenset(['n', 'normalize']) def __init__(self, n=1, normalize=False): self.n = self._validate_n(n) self.normalize = normalize - self.kwds = {} @property def name(self): @@ -1024,6 +1027,8 @@ class _CustomBusinessMonth(_CustomMixin, BusinessMixin, MonthOffset): calendar : pd.HolidayCalendar or np.busdaycalendar """ _cacheable = False + _attributes = frozenset(['n', 'normalize', + 'weekmask', 'holidays', 'calendar', 'offset']) onOffset = DateOffset.onOffset # override MonthOffset method apply_index = DateOffset.apply_index # override MonthOffset method @@ -1033,7 +1038,6 @@ def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', self.n = self._validate_n(n) self.normalize = normalize self._offset = offset - self.kwds = {'offset': offset} _CustomMixin.__init__(self, weekmask, holidays, calendar) @@ -1102,6 +1106,7 @@ class SemiMonthOffset(DateOffset): _adjust_dst = True _default_day_of_month = 15 _min_day_of_month = 2 + _attributes = frozenset(['n', 'normalize', 'day_of_month']) def __init__(self, n=1, normalize=False, day_of_month=None): if day_of_month is None: @@ -1115,7 +1120,6 @@ def __init__(self, n=1, normalize=False, day_of_month=None): self.n = self._validate_n(n) self.normalize = normalize - self.kwds = {'day_of_month': self.day_of_month} @classmethod def _from_name(cls, suffix=None): @@ -1319,6 +1323,7 @@ class Week(DateOffset): _adjust_dst = True _inc = timedelta(weeks=1) _prefix = 'W' + _attributes = frozenset(['n', 'normalize', 'weekday']) def __init__(self, n=1, normalize=False, weekday=None): self.n = self._validate_n(n) @@ -1330,8 +1335,6 @@ def __init__(self, n=1, normalize=False, weekday=None): raise ValueError('Day must be 0<=day<=6, got {day}' .format(day=self.weekday)) - self.kwds = {'weekday': weekday} - def isAnchored(self): return (self.n == 1 and self.weekday is not None) @@ -1450,6 +1453,7 @@ class WeekOfMonth(_WeekOfMonthMixin, DateOffset): """ _prefix = 'WOM' _adjust_dst = True + _attributes = frozenset(['n', 'normalize', 'week', 'weekday']) def __init__(self, n=1, normalize=False, week=0, weekday=0): self.n = self._validate_n(n) @@ -1467,8 +1471,6 @@ def __init__(self, n=1, normalize=False, week=0, weekday=0): raise ValueError('Week must be 0<=week<=3, got {week}' .format(week=self.week)) - self.kwds = {'weekday': weekday, 'week': week} - def _get_offset_day(self, other): """ Find the day in the same month as other that has the same @@ -1526,6 +1528,7 @@ class LastWeekOfMonth(_WeekOfMonthMixin, DateOffset): """ _prefix = 'LWOM' _adjust_dst = True + _attributes = frozenset(['n', 'normalize', 'weekday']) def __init__(self, n=1, normalize=False, weekday=0): self.n = self._validate_n(n) @@ -1539,8 +1542,6 @@ def __init__(self, n=1, normalize=False, weekday=0): raise ValueError('Day must be 0<=day<=6, got {day}' .format(day=self.weekday)) - self.kwds = {'weekday': weekday} - def _get_offset_day(self, other): """ Find the day in the same month as other that has the same @@ -1584,6 +1585,7 @@ class QuarterOffset(DateOffset): _default_startingMonth = None _from_name_startingMonth = None _adjust_dst = True + _attributes = frozenset(['n', 'normalize', 'startingMonth']) # TODO: Consider combining QuarterOffset and YearOffset __init__ at some # point. 
Also apply_index, onOffset, rule_code if # startingMonth vs month attr names are resolved @@ -1595,8 +1597,6 @@ def __init__(self, n=1, normalize=False, startingMonth=None): startingMonth = self._default_startingMonth self.startingMonth = startingMonth - self.kwds = {'startingMonth': startingMonth} - def isAnchored(self): return (self.n == 1 and self.startingMonth is not None) @@ -1690,6 +1690,7 @@ class QuarterBegin(QuarterOffset): class YearOffset(DateOffset): """DateOffset that just needs a month""" _adjust_dst = True + _attributes = frozenset(['n', 'normalize', 'month']) def _get_offset_day(self, other): # override BaseOffset method to use self.month instead of other.month @@ -1725,8 +1726,6 @@ def __init__(self, n=1, normalize=False, month=None): if self.month < 1 or self.month > 12: raise ValueError('Month must go from 1 to 12') - self.kwds = {'month': month} - @classmethod def _from_name(cls, suffix=None): kwargs = {} @@ -1811,6 +1810,7 @@ class FY5253(DateOffset): """ _prefix = 'RE' _adjust_dst = True + _attributes = frozenset(['weekday', 'startingMonth', 'variation']) def __init__(self, n=1, normalize=False, weekday=0, startingMonth=1, variation="nearest"): @@ -1821,9 +1821,6 @@ def __init__(self, n=1, normalize=False, weekday=0, startingMonth=1, self.variation = variation - self.kwds = {'weekday': weekday, 'startingMonth': startingMonth, - 'variation': variation} - if self.n == 0: raise ValueError('N cannot be 0') @@ -2012,6 +2009,8 @@ class FY5253Quarter(DateOffset): _prefix = 'REQ' _adjust_dst = True + _attributes = frozenset(['weekday', 'startingMonth', 'qtr_with_extra_week', + 'variation']) def __init__(self, n=1, normalize=False, weekday=0, startingMonth=1, qtr_with_extra_week=1, variation="nearest"): @@ -2023,10 +2022,6 @@ def __init__(self, n=1, normalize=False, weekday=0, startingMonth=1, self.qtr_with_extra_week = qtr_with_extra_week self.variation = variation - self.kwds = {'weekday': weekday, 'startingMonth': startingMonth, - 'qtr_with_extra_week': qtr_with_extra_week, - 'variation': variation} - if self.n == 0: raise ValueError('N cannot be 0') @@ -2170,11 +2165,11 @@ class Easter(DateOffset): 1583-4099. """ _adjust_dst = True + _attributes = frozenset(['n', 'normalize']) def __init__(self, n=1, normalize=False): self.n = self._validate_n(n) self.normalize = normalize - self.kwds = {} @apply_wraps def apply(self, other): @@ -2217,12 +2212,12 @@ def f(self, other): class Tick(SingleConstructorOffset): _inc = Timedelta(microseconds=1000) _prefix = 'undefined' + _attributes = frozenset(['n', 'normalize']) def __init__(self, n=1, normalize=False): # TODO: do Tick classes with normalize=True make sense? 
self.n = self._validate_n(n) self.normalize = normalize - self.kwds = {} __gt__ = _tick_comp(operator.gt) __ge__ = _tick_comp(operator.ge) From cd6510d36ccddae19808cb02b60b15dfa42a47b1 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 2 Feb 2018 03:38:05 -0800 Subject: [PATCH 032/214] Fix DTI comparison with None, datetime.date (#19301) --- doc/source/whatsnew/v0.23.0.txt | 4 +- pandas/core/indexes/datetimes.py | 18 +- .../indexes/datetimes/test_arithmetic.py | 208 ++++++++++++------ 3 files changed, 156 insertions(+), 74 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 818b17baa38aa..b28378f13057b 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -453,6 +453,8 @@ Datetimelike - Bug in subtracting :class:`Series` from ``NaT`` incorrectly returning ``NaT`` (:issue:`19158`) - Bug in :func:`Series.truncate` which raises ``TypeError`` with a monotonic ``PeriodIndex`` (:issue:`17717`) - Bug in :func:`~DataFrame.pct_change` using ``periods`` and ``freq`` returned different length outputs (:issue:`7292`) +- Bug in comparison of :class:`DatetimeIndex` against ``None`` or ``datetime.date`` objects raising ``TypeError`` for ``==`` and ``!=`` comparisons instead of all-``False`` and all-``True``, respectively (:issue:`19301`) +- Timezones ^^^^^^^^^ @@ -484,8 +486,6 @@ Numeric - Bug in the :class:`DataFrame` constructor in which data containing very large positive or very large negative numbers was causing ``OverflowError`` (:issue:`18584`) - Bug in :class:`Index` constructor with ``dtype='uint64'`` where int-like floats were not coerced to :class:`UInt64Index` (:issue:`18400`) -- - Indexing ^^^^^^^^ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 76219a07f4943..e09fa87477122 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -120,8 +120,16 @@ def wrapper(self, other): else: if isinstance(other, list): other = DatetimeIndex(other) - elif not isinstance(other, (np.ndarray, Index, ABCSeries)): - other = _ensure_datetime64(other) + elif not isinstance(other, (np.datetime64, np.ndarray, + Index, ABCSeries)): + # Following Timestamp convention, __eq__ is all-False + # and __ne__ is all True, others raise TypeError. 
+ if opname == '__eq__': + return np.zeros(shape=self.shape, dtype=bool) + elif opname == '__ne__': + return np.ones(shape=self.shape, dtype=bool) + raise TypeError('%s type object %s' % + (type(other), str(other))) if is_datetimelike(other): self._assert_tzawareness_compat(other) @@ -148,12 +156,6 @@ def wrapper(self, other): return compat.set_function_name(wrapper, opname, cls) -def _ensure_datetime64(other): - if isinstance(other, np.datetime64): - return other - raise TypeError('%s type object %s' % (type(other), str(other))) - - _midnight = time(0, 0) diff --git a/pandas/tests/indexes/datetimes/test_arithmetic.py b/pandas/tests/indexes/datetimes/test_arithmetic.py index 671071b5e4945..09a6b35a0ff0e 100644 --- a/pandas/tests/indexes/datetimes/test_arithmetic.py +++ b/pandas/tests/indexes/datetimes/test_arithmetic.py @@ -14,6 +14,7 @@ from pandas import (Timestamp, Timedelta, Series, DatetimeIndex, TimedeltaIndex, date_range) +from pandas._libs import tslib @pytest.fixture(params=[None, 'UTC', 'Asia/Tokyo', @@ -44,7 +45,83 @@ def addend(request): class TestDatetimeIndexComparisons(object): - # TODO: De-duplicate with test_comparisons_nat below + @pytest.mark.parametrize('other', [datetime(2016, 1, 1), + Timestamp('2016-01-01'), + np.datetime64('2016-01-01')]) + def test_dti_cmp_datetimelike(self, other, tz): + dti = pd.date_range('2016-01-01', periods=2, tz=tz) + if tz is not None: + if isinstance(other, np.datetime64): + # no tzaware version available + return + elif isinstance(other, Timestamp): + other = other.tz_localize(dti.tzinfo) + else: + other = tslib._localize_pydatetime(other, dti.tzinfo) + + result = dti == other + expected = np.array([True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = dti > other + expected = np.array([False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = dti >= other + expected = np.array([True, True]) + tm.assert_numpy_array_equal(result, expected) + + result = dti < other + expected = np.array([False, False]) + tm.assert_numpy_array_equal(result, expected) + + result = dti <= other + expected = np.array([True, False]) + tm.assert_numpy_array_equal(result, expected) + + def dti_cmp_non_datetime(self, tz): + # GH#19301 by convention datetime.date is not considered comparable + # to Timestamp or DatetimeIndex. This may change in the future. 
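# A short illustration of the convention exercised below (editorial sketch,
# not a line of this patch). With this change, equality comparisons of a
# DatetimeIndex against an object that is not datetime-like no longer raise:
#
#   >>> dti = pd.date_range('2016-01-01', periods=2)
#   >>> dti == datetime(2016, 1, 1).date()
#   array([False, False])
#   >>> dti != datetime(2016, 1, 1).date()
#   array([ True,  True])
#
# Ordering comparisons (<, <=, >, >=) against such objects still raise
# TypeError, mirroring Timestamp's behaviour.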
+ dti = pd.date_range('2016-01-01', periods=2, tz=tz) + + other = datetime(2016, 1, 1).date() + assert not (dti == other).any() + assert (dti != other).all() + with pytest.raises(TypeError): + dti < other + with pytest.raises(TypeError): + dti <= other + with pytest.raises(TypeError): + dti > other + with pytest.raises(TypeError): + dti >= other + + @pytest.mark.parametrize('other', [None, np.nan, pd.NaT]) + def test_dti_eq_null_scalar(self, other, tz): + # GH#19301 + dti = pd.date_range('2016-01-01', periods=2, tz=tz) + assert not (dti == other).any() + + @pytest.mark.parametrize('other', [None, np.nan, pd.NaT]) + def test_dti_ne_null_scalar(self, other, tz): + # GH#19301 + dti = pd.date_range('2016-01-01', periods=2, tz=tz) + assert (dti != other).all() + + @pytest.mark.parametrize('other', [None, np.nan]) + def test_dti_cmp_null_scalar_inequality(self, tz, other): + # GH#19301 + dti = pd.date_range('2016-01-01', periods=2, tz=tz) + + with pytest.raises(TypeError): + dti < other + with pytest.raises(TypeError): + dti <= other + with pytest.raises(TypeError): + dti > other + with pytest.raises(TypeError): + dti >= other + def test_dti_cmp_nat(self): left = pd.DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')]) @@ -72,69 +149,7 @@ def test_dti_cmp_nat(self): tm.assert_numpy_array_equal(lhs < pd.NaT, expected) tm.assert_numpy_array_equal(pd.NaT > lhs, expected) - @pytest.mark.parametrize('op', [operator.eq, operator.ne, - operator.gt, operator.ge, - operator.lt, operator.le]) - def test_comparison_tzawareness_compat(self, op): - # GH#18162 - dr = pd.date_range('2016-01-01', periods=6) - dz = dr.tz_localize('US/Pacific') - - with pytest.raises(TypeError): - op(dr, dz) - with pytest.raises(TypeError): - op(dr, list(dz)) - with pytest.raises(TypeError): - op(dz, dr) - with pytest.raises(TypeError): - op(dz, list(dr)) - - # Check that there isn't a problem aware-aware and naive-naive do not - # raise - assert (dr == dr).all() - assert (dr == list(dr)).all() - assert (dz == dz).all() - assert (dz == list(dz)).all() - - # Check comparisons against scalar Timestamps - ts = pd.Timestamp('2000-03-14 01:59') - ts_tz = pd.Timestamp('2000-03-14 01:59', tz='Europe/Amsterdam') - - assert (dr > ts).all() - with pytest.raises(TypeError): - op(dr, ts_tz) - - assert (dz > ts_tz).all() - with pytest.raises(TypeError): - op(dz, ts) - - @pytest.mark.parametrize('op', [operator.eq, operator.ne, - operator.gt, operator.ge, - operator.lt, operator.le]) - def test_nat_comparison_tzawareness(self, op): - # GH#19276 - # tzaware DatetimeIndex should not raise when compared to NaT - dti = pd.DatetimeIndex(['2014-01-01', pd.NaT, '2014-03-01', pd.NaT, - '2014-05-01', '2014-07-01']) - expected = np.array([op == operator.ne] * len(dti)) - result = op(dti, pd.NaT) - tm.assert_numpy_array_equal(result, expected) - - result = op(dti.tz_localize('US/Pacific'), pd.NaT) - tm.assert_numpy_array_equal(result, expected) - - def test_comparisons_coverage(self): - rng = date_range('1/1/2000', periods=10) - - # raise TypeError for now - pytest.raises(TypeError, rng.__lt__, rng[3].value) - - result = rng == list(rng) - exp = rng == rng - tm.assert_numpy_array_equal(result, exp) - - def test_comparisons_nat(self): - + def test_dti_cmp_nat_behaves_like_float_cmp_nan(self): fidx1 = pd.Index([1.0, np.nan, 3.0, np.nan, 5.0, 7.0]) fidx2 = pd.Index([2.0, 3.0, np.nan, np.nan, 6.0, 7.0]) @@ -223,6 +238,71 @@ def test_comparisons_nat(self): expected = np.array([True, True, False, True, True, True]) 
tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize('op', [operator.eq, operator.ne, + operator.gt, operator.ge, + operator.lt, operator.le]) + def test_comparison_tzawareness_compat(self, op): + # GH#18162 + dr = pd.date_range('2016-01-01', periods=6) + dz = dr.tz_localize('US/Pacific') + + with pytest.raises(TypeError): + op(dr, dz) + with pytest.raises(TypeError): + op(dr, list(dz)) + with pytest.raises(TypeError): + op(dz, dr) + with pytest.raises(TypeError): + op(dz, list(dr)) + + # Check that there isn't a problem aware-aware and naive-naive do not + # raise + assert (dr == dr).all() + assert (dr == list(dr)).all() + assert (dz == dz).all() + assert (dz == list(dz)).all() + + # Check comparisons against scalar Timestamps + ts = pd.Timestamp('2000-03-14 01:59') + ts_tz = pd.Timestamp('2000-03-14 01:59', tz='Europe/Amsterdam') + + assert (dr > ts).all() + with pytest.raises(TypeError): + op(dr, ts_tz) + + assert (dz > ts_tz).all() + with pytest.raises(TypeError): + op(dz, ts) + + @pytest.mark.parametrize('op', [operator.eq, operator.ne, + operator.gt, operator.ge, + operator.lt, operator.le]) + def test_nat_comparison_tzawareness(self, op): + # GH#19276 + # tzaware DatetimeIndex should not raise when compared to NaT + dti = pd.DatetimeIndex(['2014-01-01', pd.NaT, '2014-03-01', pd.NaT, + '2014-05-01', '2014-07-01']) + expected = np.array([op == operator.ne] * len(dti)) + result = op(dti, pd.NaT) + tm.assert_numpy_array_equal(result, expected) + + result = op(dti.tz_localize('US/Pacific'), pd.NaT) + tm.assert_numpy_array_equal(result, expected) + + def test_dti_cmp_int_raises(self): + rng = date_range('1/1/2000', periods=10) + + # raise TypeError for now + with pytest.raises(TypeError): + rng < rng[3].value + + def test_dti_cmp_list(self): + rng = date_range('1/1/2000', periods=10) + + result = rng == list(rng) + expected = rng == rng + tm.assert_numpy_array_equal(result, expected) + class TestDatetimeIndexArithmetic(object): From 69cd5fbfd8b57033b0eb280ebc6dd502652f269c Mon Sep 17 00:00:00 2001 From: Tommy <10076072+tommyod@users.noreply.github.com> Date: Fri, 2 Feb 2018 13:50:46 +0100 Subject: [PATCH 033/214] DOC: Exposed arguments in plot.kde (#19229) * Exposed arguments in plot.kde, added number of sample points as option * Added a test for plot.kde with as an integer * Added whatsnew. Fixed flake8 errors. Used is_integer to infer type. * Updated scipy reference * Added test, rewrote whatsnew, removed import * Changed from Series to DataFrame in doc * Fixed PEP8 errors in test file * Fixed typo which made tests crash --- doc/source/whatsnew/v0.23.0.txt | 2 +- pandas/plotting/_core.py | 32 ++++++++++++++++++++++++---- pandas/tests/plotting/test_series.py | 14 ++++++------ 3 files changed, 37 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index b28378f13057b..26a7a78bb5c55 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -536,7 +536,7 @@ Plotting - :func: `DataFrame.plot` now raises a ``ValueError`` when the ``x`` or ``y`` argument is improperly formed (:issue:`18671`) - Bug in formatting tick labels with ``datetime.time()`` and fractional seconds (:issue:`18478`). -- +- :meth:`Series.plot.kde` has exposed the args ``ind`` and ``bw_method`` in the docstring (:issue:`18461`). The argument ``ind`` may now also be an integer (number of sample points). 
- Groupby/Resample/Rolling diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 88b899ad60313..b15c5271ae321 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1398,6 +1398,10 @@ def _get_ind(self, y): sample_range = np.nanmax(y) - np.nanmin(y) ind = np.linspace(np.nanmin(y) - 0.5 * sample_range, np.nanmax(y) + 0.5 * sample_range, 1000) + elif is_integer(self.ind): + sample_range = np.nanmax(y) - np.nanmin(y) + ind = np.linspace(np.nanmin(y) - 0.5 * sample_range, + np.nanmax(y) + 0.5 * sample_range, self.ind) else: ind = self.ind return ind @@ -2598,12 +2602,22 @@ def hist(self, bins=10, **kwds): """ return self(kind='hist', bins=bins, **kwds) - def kde(self, **kwds): + def kde(self, bw_method=None, ind=None, **kwds): """ Kernel Density Estimate plot Parameters ---------- + bw_method: str, scalar or callable, optional + The method used to calculate the estimator bandwidth. This can be + 'scott', 'silverman', a scalar constant or a callable. + If None (default), 'scott' is used. + See :class:`scipy.stats.gaussian_kde` for more information. + ind : NumPy array or integer, optional + Evaluation points. If None (default), 1000 equally spaced points + are used. If `ind` is a NumPy array, the kde is evaluated at the + points passed. If `ind` is an integer, `ind` number of equally + spaced points are used. `**kwds` : optional Keyword arguments to pass on to :py:meth:`pandas.Series.plot`. @@ -2611,7 +2625,7 @@ def kde(self, **kwds): ------- axes : matplotlib.AxesSubplot or np.array of them """ - return self(kind='kde', **kwds) + return self(kind='kde', bw_method=bw_method, ind=ind, **kwds) density = kde @@ -2766,12 +2780,22 @@ def hist(self, by=None, bins=10, **kwds): """ return self(kind='hist', by=by, bins=bins, **kwds) - def kde(self, **kwds): + def kde(self, bw_method=None, ind=None, **kwds): """ Kernel Density Estimate plot Parameters ---------- + bw_method: str, scalar or callable, optional + The method used to calculate the estimator bandwidth. This can be + 'scott', 'silverman', a scalar constant or a callable. + If None (default), 'scott' is used. + See :class:`scipy.stats.gaussian_kde` for more information. + ind : NumPy array or integer, optional + Evaluation points. If None (default), 1000 equally spaced points + are used. If `ind` is a NumPy array, the kde is evaluated at the + points passed. If `ind` is an integer, `ind` number of equally + spaced points are used. `**kwds` : optional Keyword arguments to pass on to :py:meth:`pandas.DataFrame.plot`. 
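# Editorial sketch (not part of the patch): how the newly exposed kde()
# arguments can be used once this change is in place. Both Series.plot.kde
# and DataFrame.plot.kde accept them; scipy and matplotlib are assumed to be
# installed, and the data below are made up for illustration.
import numpy as np
import pandas as pd

s = pd.Series(np.random.randn(200))

# `ind` as an integer: evaluate the density at 50 equally spaced points.
ax = s.plot.kde(bw_method='scott', ind=50)

# `ind` as an array: evaluate the density at explicitly chosen points.
ax = s.plot.kde(bw_method=0.5, ind=np.linspace(-3, 3, 25))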
@@ -2779,7 +2803,7 @@ def kde(self, **kwds): ------- axes : matplotlib.AxesSubplot or np.array of them """ - return self(kind='kde', **kwds) + return self(kind='kde', bw_method=bw_method, ind=ind, **kwds) density = kde diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 2458fc0dc992c..278be433183fa 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -621,14 +621,16 @@ def test_kde_kwargs(self): if not self.mpl_ge_1_5_0: pytest.skip("mpl is not supported") - from numpy import linspace - _check_plot_works(self.ts.plot.kde, bw_method=.5, - ind=linspace(-100, 100, 20)) + sample_points = np.linspace(-100, 100, 20) + _check_plot_works(self.ts.plot.kde, bw_method='scott', ind=20) + _check_plot_works(self.ts.plot.kde, bw_method=None, ind=20) + _check_plot_works(self.ts.plot.kde, bw_method=None, ind=np.int(20)) + _check_plot_works(self.ts.plot.kde, bw_method=.5, ind=sample_points) _check_plot_works(self.ts.plot.density, bw_method=.5, - ind=linspace(-100, 100, 20)) + ind=sample_points) _, ax = self.plt.subplots() - ax = self.ts.plot.kde(logy=True, bw_method=.5, - ind=linspace(-100, 100, 20), ax=ax) + ax = self.ts.plot.kde(logy=True, bw_method=.5, ind=sample_points, + ax=ax) self._check_ax_scales(ax, yaxis='log') self._check_text_labels(ax.yaxis.get_label(), 'Density') From e8620abc12a4c468a75adb8607fd8e0eb1c472e7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 2 Feb 2018 15:34:20 -0600 Subject: [PATCH 034/214] ENH: Array Interface and Categorical internals Refactor (#19268) * REF: Define extension base classes * Updated for comments * removed take_nd * Changed to_dense to return get_values * Fixed docstrings, types * Removed is_sparse * Remove metaclasses from PeriodDtype and IntervalDtype * Fixup form_blocks rebase * Restore concat casting cat -> object * Remove _slice, clarify semantics around __getitem__ * Document and use take. * Clarify type, kind, init * Remove base * API: Remove unused __iter__ and get_values * API: Implement repr and str * Remove default value_counts for now * Fixed merge conflicts * Remove implementation of construct_from_string * Example implementation of take * Cleanup ExtensionBlock * Pass through ndim * Use series._values * Removed repr, updated take doc * Various cleanups * Handle get_values, to_dense, is_view * Docs * Remove is_extension, is_bool Remove inherited convert * Sparse formatter * Revert "Sparse formatter" This reverts commit ab2f0457839fece3b3ef067f29994b42908bd037. * Unbox SparseSeries * Added test for sparse consolidation * Docs * Moved to errors * Handle classmethods, properties * Use our AbstractMethodError * Lint * Cleanup * Move ndim validation to a method. * Try this * Make ExtensionBlock._holder a property Removed ExtensionBlock.__init__ * Make _holder a property for all * Refactored validate_ndim * fixup! 
Refactored validate_ndim * lint --- pandas/core/arrays/__init__.py | 1 + pandas/core/arrays/base.py | 247 +++++++++++++ pandas/core/arrays/categorical.py | 18 +- pandas/core/common.py | 16 +- pandas/core/dtypes/base.py | 129 +++++++ pandas/core/dtypes/common.py | 29 ++ pandas/core/dtypes/dtypes.py | 16 +- pandas/core/internals.py | 329 ++++++++++++++---- pandas/errors/__init__.py | 23 ++ pandas/tests/dtypes/test_dtypes.py | 32 +- pandas/tests/internals/test_external_block.py | 4 +- pandas/tests/internals/test_internals.py | 24 +- pandas/tests/sparse/frame/test_frame.py | 9 + pandas/tests/test_errors.py | 29 ++ 14 files changed, 803 insertions(+), 103 deletions(-) create mode 100644 pandas/core/arrays/base.py create mode 100644 pandas/core/dtypes/base.py diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index ee32b12f0e712..f8adcf520c15b 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -1 +1,2 @@ +from .base import ExtensionArray # noqa from .categorical import Categorical # noqa diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py new file mode 100644 index 0000000000000..1556b653819a6 --- /dev/null +++ b/pandas/core/arrays/base.py @@ -0,0 +1,247 @@ +"""An interface for extending pandas with custom arrays.""" +from pandas.errors import AbstractMethodError + +_not_implemented_message = "{} does not implement {}." + + +class ExtensionArray(object): + """Abstract base class for custom 1-D array types. + + pandas will recognize instances of this class as proper arrays + with a custom type and will not attempt to coerce them to objects. They + may be stored directly inside a :class:`DataFrame` or :class:`Series`. + + Notes + ----- + The interface includes the following abstract methods that must be + implemented by subclasses: + + * __getitem__ + * __len__ + * dtype + * nbytes + * isna + * take + * copy + * _formatting_values + * _concat_same_type + + Some additional methods are required to satisfy pandas' internal, private + block API. + + * _concat_same_type + * _can_hold_na + + This class does not inherit from 'abc.ABCMeta' for performance reasons. + Methods and properties required by the interface raise + ``pandas.errors.AbstractMethodError`` and no ``register`` method is + provided for registering virtual subclasses. + + ExtensionArrays are limited to 1 dimension. + + They may be backed by none, one, or many NumPy ararys. For example, + ``pandas.Categorical`` is an extension array backed by two arrays, + one for codes and one for categories. An array of IPv6 address may + be backed by a NumPy structured array with two fields, one for the + lower 64 bits and one for the upper 64 bits. Or they may be backed + by some other storage type, like Python lists. Pandas makes no + assumptions on how the data are stored, just that it can be converted + to a NumPy array. + + Extension arrays should be able to be constructed with instances of + the class, i.e. ``ExtensionArray(extension_array)`` should return + an instance, not error. + + Additionally, certain methods and interfaces are required for proper + this array to be properly stored inside a ``DataFrame`` or ``Series``. + """ + # ------------------------------------------------------------------------ + # Must be a Sequence + # ------------------------------------------------------------------------ + def __getitem__(self, item): + # type (Any) -> Any + """Select a subset of self. 
+ + Parameters + ---------- + item : int, slice, or ndarray + * int: The position in 'self' to get. + + * slice: A slice object, where 'start', 'stop', and 'step' are + integers or None + + * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + + Returns + ------- + item : scalar or ExtensionArray + + Notes + ----- + For scalar ``item``, return a scalar value suitable for the array's + type. This should be an instance of ``self.dtype.type``. + + For slice ``key``, return an instance of ``ExtensionArray``, even + if the slice is length 0 or 1. + + For a boolean mask, return an instance of ``ExtensionArray``, filtered + to the values where ``item`` is True. + """ + raise AbstractMethodError(self) + + def __setitem__(self, key, value): + # type: (Any, Any) -> None + raise NotImplementedError(_not_implemented_message.format( + type(self), '__setitem__') + ) + + def __len__(self): + """Length of this array + + Returns + ------- + length : int + """ + # type: () -> int + raise AbstractMethodError(self) + + # ------------------------------------------------------------------------ + # Required attributes + # ------------------------------------------------------------------------ + @property + def dtype(self): + # type: () -> ExtensionDtype + """An instance of 'ExtensionDtype'.""" + raise AbstractMethodError(self) + + @property + def shape(self): + # type: () -> Tuple[int, ...] + return (len(self),) + + @property + def ndim(self): + # type: () -> int + """Extension Arrays are only allowed to be 1-dimensional.""" + return 1 + + @property + def nbytes(self): + # type: () -> int + """The number of bytes needed to store this object in memory. + + If this is expensive to compute, return an approximate lower bound + on the number of bytes needed. + """ + raise AbstractMethodError(self) + + # ------------------------------------------------------------------------ + # Additional Methods + # ------------------------------------------------------------------------ + def isna(self): + # type: () -> np.ndarray + """Boolean NumPy array indicating if each value is missing. + + This should return a 1-D array the same length as 'self'. + """ + raise AbstractMethodError(self) + + # ------------------------------------------------------------------------ + # Indexing methods + # ------------------------------------------------------------------------ + def take(self, indexer, allow_fill=True, fill_value=None): + # type: (Sequence[int], bool, Optional[Any]) -> ExtensionArray + """Take elements from an array. + + Parameters + ---------- + indexer : sequence of integers + indices to be taken. -1 is used to indicate values + that are missing. + allow_fill : bool, default True + If False, indexer is assumed to contain no -1 values so no filling + will be done. This short-circuits computation of a mask. Result is + undefined if allow_fill == False and -1 is present in indexer. + fill_value : any, default None + Fill value to replace -1 values with. By default, this uses + the missing value sentinel for this type, ``self._fill_value``. + + Notes + ----- + This should follow pandas' semantics where -1 indicates missing values. + Positions where indexer is ``-1`` should be filled with the missing + value for this type. + + This is called by ``Series.__getitem__``, ``.loc``, ``iloc``, when the + indexer is a sequence of values. + + Examples + -------- + Suppose the extension array somehow backed by a NumPy structured array + and that the underlying structured array is stored as ``self.data``. 
+ Then ``take`` may be written as + + .. code-block:: python + + def take(self, indexer, allow_fill=True, fill_value=None): + mask = indexer == -1 + result = self.data.take(indexer) + result[mask] = self._fill_value + return type(self)(result) + """ + raise AbstractMethodError(self) + + def copy(self, deep=False): + # type: (bool) -> ExtensionArray + """Return a copy of the array. + + Parameters + ---------- + deep : bool, default False + Also copy the underlying data backing this array. + + Returns + ------- + ExtensionArray + """ + raise AbstractMethodError(self) + + # ------------------------------------------------------------------------ + # Block-related methods + # ------------------------------------------------------------------------ + @property + def _fill_value(self): + # type: () -> Any + """The missing value for this type, e.g. np.nan""" + return None + + def _formatting_values(self): + # type: () -> np.ndarray + # At the moment, this has to be an array since we use result.dtype + """An array of values to be printed in, e.g. the Series repr""" + raise AbstractMethodError(self) + + @classmethod + def _concat_same_type(cls, to_concat): + # type: (Sequence[ExtensionArray]) -> ExtensionArray + """Concatenate multiple array + + Parameters + ---------- + to_concat : sequence of this type + + Returns + ------- + ExtensionArray + """ + raise AbstractMethodError(cls) + + def _can_hold_na(self): + # type: () -> bool + """Whether your array can hold missing values. True by default. + + Notes + ----- + Setting this to false will optimize some operations like fillna. + """ + return True diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b50e01b0fb55a..62c6a6b16cbe9 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -43,6 +43,8 @@ from pandas.util._validators import validate_bool_kwarg from pandas.core.config import get_option +from .base import ExtensionArray + def _cat_compare_op(op): def f(self, other): @@ -148,7 +150,7 @@ def _maybe_to_categorical(array): """ -class Categorical(PandasObject): +class Categorical(ExtensionArray, PandasObject): """ Represents a categorical variable in classic R / S-plus fashion @@ -2130,6 +2132,20 @@ def repeat(self, repeats, *args, **kwargs): return self._constructor(values=codes, categories=self.categories, ordered=self.ordered, fastpath=True) + # Implement the ExtensionArray interface + @property + def _can_hold_na(self): + return True + + @classmethod + def _concat_same_type(self, to_concat): + from pandas.core.dtypes.concat import _concat_categorical + + return _concat_categorical(to_concat) + + def _formatting_values(self): + return self + # The Series.cat accessor diff --git a/pandas/core/common.py b/pandas/core/common.py index e606be3cc2a23..6748db825acf0 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -25,7 +25,8 @@ # compat from pandas.errors import ( # noqa - PerformanceWarning, UnsupportedFunctionCall, UnsortedIndexError) + PerformanceWarning, UnsupportedFunctionCall, UnsortedIndexError, + AbstractMethodError) # back-compat of public API # deprecate these functions @@ -88,19 +89,6 @@ class SettingWithCopyWarning(Warning): pass -class AbstractMethodError(NotImplementedError): - """Raise this error instead of NotImplementedError for abstract methods - while keeping compatibility with Python 2 and Python 3. 
- """ - - def __init__(self, class_instance): - self.class_instance = class_instance - - def __str__(self): - msg = "This method must be defined in the concrete class of {name}" - return (msg.format(name=self.class_instance.__class__.__name__)) - - def flatten(l): """Flatten an arbitrarily nested sequence. diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py new file mode 100644 index 0000000000000..c7c5378801f02 --- /dev/null +++ b/pandas/core/dtypes/base.py @@ -0,0 +1,129 @@ +"""Extend pandas with custom array types""" +from pandas.errors import AbstractMethodError + + +class ExtensionDtype(object): + """A custom data type, to be paired with an ExtensionArray. + + Notes + ----- + The interface includes the following abstract methods that must + be implemented by subclasses: + + * type + * name + * construct_from_string + + This class does not inherit from 'abc.ABCMeta' for performance reasons. + Methods and properties required by the interface raise + ``pandas.errors.AbstractMethodError`` and no ``register`` method is + provided for registering virtual subclasses. + """ + + def __str__(self): + return self.name + + @property + def type(self): + # type: () -> type + """The scalar type for the array, e.g. ``int`` + + It's expected ``ExtensionArray[item]`` returns an instance + of ``ExtensionDtype.type`` for scalar ``item``. + """ + raise AbstractMethodError(self) + + @property + def kind(self): + # type () -> str + """A character code (one of 'biufcmMOSUV'), default 'O' + + This should match the NumPy dtype used when the array is + converted to an ndarray, which is probably 'O' for object if + the extension type cannot be represented as a built-in NumPy + type. + + See Also + -------- + numpy.dtype.kind + """ + return 'O' + + @property + def name(self): + # type: () -> str + """A string identifying the data type. + + Will be used for display in, e.g. ``Series.dtype`` + """ + raise AbstractMethodError(self) + + @property + def names(self): + # type: () -> Optional[List[str]] + """Ordered list of field names, or None if there are no fields. + + This is for compatibility with NumPy arrays, and may be removed in the + future. + """ + return None + + @classmethod + def construct_from_string(cls, string): + """Attempt to construct this type from a string. + + Parameters + ---------- + string : str + + Returns + ------- + self : instance of 'cls' + + Raises + ------ + TypeError + If a class cannot be constructed from this 'string'. + + Examples + -------- + If the extension dtype can be constructed without any arguments, + the following may be an adequate implementation. + + >>> @classmethod + ... def construct_from_string(cls, string) + ... if string == cls.name: + ... return cls() + ... else: + ... raise TypeError("Cannot construct a '{}' from " + ... "'{}'".format(cls, string)) + """ + raise AbstractMethodError(cls) + + @classmethod + def is_dtype(cls, dtype): + """Check if we match 'dtype' + + Parameters + ---------- + dtype : str or dtype + + Returns + ------- + is_dtype : bool + + Notes + ----- + The default implementation is True if + + 1. ``cls.construct_from_string(dtype)`` is an instance + of ``cls``. + 2. 'dtype' is ``cls`` or a subclass of ``cls``. 
+ """ + if isinstance(dtype, str): + try: + return isinstance(cls.construct_from_string(dtype), cls) + except TypeError: + return False + else: + return issubclass(dtype, cls) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index dca9a5fde0d74..c66e7fcfc6978 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1685,6 +1685,35 @@ def is_extension_type(arr): return False +def is_extension_array_dtype(arr_or_dtype): + """Check if an object is a pandas extension array type. + + Parameters + ---------- + arr_or_dtype : object + + Returns + ------- + bool + + Notes + ----- + This checks whether an object implements the pandas extension + array interface. In pandas, this includes: + + * Categorical + + Third-party libraries may implement arrays or types satisfying + this interface as well. + """ + from pandas.core.arrays import ExtensionArray + + # we want to unpack series, anything else? + if isinstance(arr_or_dtype, ABCSeries): + arr_or_dtype = arr_or_dtype._values + return isinstance(arr_or_dtype, (ExtensionDtype, ExtensionArray)) + + def is_complex_dtype(arr_or_dtype): """ Check whether the provided array or dtype is of a complex dtype. diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 1eb87aa99fd1e..d8d3a96992757 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -5,15 +5,15 @@ from pandas import compat from pandas.core.dtypes.generic import ABCIndexClass, ABCCategoricalIndex +from .base import ExtensionDtype -class ExtensionDtype(object): + +class PandasExtensionDtype(ExtensionDtype): """ A np.dtype duck-typed class, suitable for holding a custom dtype. THIS IS NOT A REAL NUMPY DTYPE """ - name = None - names = None type = None subdtype = None kind = None @@ -108,7 +108,7 @@ class CategoricalDtypeType(type): pass -class CategoricalDtype(ExtensionDtype): +class CategoricalDtype(PandasExtensionDtype): """ Type for categorical data with the categories and orderedness @@ -387,7 +387,7 @@ class DatetimeTZDtypeType(type): pass -class DatetimeTZDtype(ExtensionDtype): +class DatetimeTZDtype(PandasExtensionDtype): """ A np.dtype duck-typed class, suitable for holding a custom datetime with tz @@ -501,8 +501,7 @@ class PeriodDtypeType(type): pass -class PeriodDtype(ExtensionDtype): - __metaclass__ = PeriodDtypeType +class PeriodDtype(PandasExtensionDtype): """ A Period duck-typed class, suitable for holding a period with freq dtype. 
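# Editorial sketch (not part of the patch): roughly what a third-party dtype
# built on the new ExtensionDtype interface could look like. The name
# 'IPAddressDtype' and its details are hypothetical; the shape follows the
# construct_from_string example in the new pandas/core/dtypes/base.py.
from pandas.core.dtypes.base import ExtensionDtype


class IPAddressDtype(ExtensionDtype):
    type = str          # scalar type a paired ExtensionArray would return
    name = 'ipaddress'  # used for display, e.g. in Series.dtype

    @classmethod
    def construct_from_string(cls, string):
        if string == cls.name:
            return cls()
        raise TypeError("Cannot construct an 'IPAddressDtype' from "
                        "'{}'".format(string))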
@@ -619,8 +618,7 @@ class IntervalDtypeType(type): pass -class IntervalDtype(ExtensionDtype): - __metaclass__ = IntervalDtypeType +class IntervalDtype(PandasExtensionDtype): """ A Interval duck-typed class, suitable for holding an interval diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 52e8317f5209a..f553e1a02c9d6 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -33,6 +33,7 @@ is_datetimelike_v_numeric, is_float_dtype, is_numeric_dtype, is_numeric_v_string_like, is_extension_type, + is_extension_array_dtype, is_list_like, is_re, is_re_compilable, @@ -61,8 +62,9 @@ from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import maybe_convert_indices, length_of_indexer -from pandas.core.arrays.categorical import Categorical, _maybe_to_categorical +from pandas.core.arrays import Categorical from pandas.core.indexes.datetimes import DatetimeIndex +from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.io.formats.printing import pprint_thing import pandas.core.missing as missing @@ -103,24 +105,58 @@ class Block(PandasObject): _verify_integrity = True _validate_ndim = True _ftype = 'dense' - _holder = None _concatenator = staticmethod(np.concatenate) def __init__(self, values, placement, ndim=None): - if ndim is None: - ndim = values.ndim - elif values.ndim != ndim: - raise ValueError('Wrong number of dimensions') - self.ndim = ndim - + self.ndim = self._check_ndim(values, ndim) self.mgr_locs = placement self.values = values - if ndim and len(self.mgr_locs) != len(self.values): + if (self._validate_ndim and self.ndim and + len(self.mgr_locs) != len(self.values)): raise ValueError( 'Wrong number of items passed {val}, placement implies ' '{mgr}'.format(val=len(self.values), mgr=len(self.mgr_locs))) + def _check_ndim(self, values, ndim): + """ndim inference and validation. + + Infers ndim from 'values' if not provided to __init__. + Validates that values.ndim and ndim are consistent if and only if + the class variable '_validate_ndim' is True. + + Parameters + ---------- + values : array-like + ndim : int or None + + Returns + ------- + ndim : int + + Raises + ------ + ValueError : the number of dimensions do not match + """ + if ndim is None: + ndim = values.ndim + + if self._validate_ndim and values.ndim != ndim: + msg = ("Wrong number of dimensions. values.ndim != ndim " + "[{} != {}]") + raise ValueError(msg.format(values.ndim, ndim)) + + return ndim + + @property + def _holder(self): + """The array-like that can hold the underlying values. + + None for 'Block', overridden by subclasses that don't + use an ndarray. + """ + return None + @property def _consolidate_key(self): return (self._can_consolidate, self.dtype.name) @@ -279,7 +315,6 @@ def reshape_nd(self, labels, shape, ref_items, mgr=None): return a new block that is transformed to a nd block """ - return _block2d_to_blocknd(values=self.get_values().T, placement=self.mgr_locs, shape=shape, labels=labels, ref_items=ref_items) @@ -535,15 +570,20 @@ def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs): def _astype(self, dtype, copy=False, errors='raise', values=None, klass=None, mgr=None, **kwargs): - """ - Coerce to the new type + """Coerce to the new type + Parameters + ---------- dtype : str, dtype convertible copy : boolean, default False copy if indicated errors : str, {'raise', 'ignore'}, default 'ignore' - ``raise`` : allow exceptions to be raised - ``ignore`` : suppress exceptions. 
On error return original object + + Returns + ------- + Block """ errors_legal_values = ('raise', 'ignore') @@ -1674,27 +1714,28 @@ class NonConsolidatableMixIn(object): _can_consolidate = False _verify_integrity = False _validate_ndim = False - _holder = None def __init__(self, values, placement, ndim=None): + """Initialize a non-consolidatable block. - # Placement must be converted to BlockPlacement via property setter - # before ndim logic, because placement may be a slice which doesn't - # have a length. - self.mgr_locs = placement + 'ndim' may be inferred from 'placement'. - # kludgetastic + This will call continue to call __init__ for the other base + classes mixed in with this Mixin. + """ + # Placement must be converted to BlockPlacement so that we can check + # its length + if not isinstance(placement, BlockPlacement): + placement = BlockPlacement(placement) + + # Maybe infer ndim from placement if ndim is None: - if len(self.mgr_locs) != 1: + if len(placement) != 1: ndim = 1 else: ndim = 2 - self.ndim = ndim - - if not isinstance(values, self._holder): - raise TypeError("values must be {0}".format(self._holder.__name__)) - - self.values = values + super(NonConsolidatableMixIn, self).__init__(values, placement, + ndim=ndim) @property def shape(self): @@ -1745,7 +1786,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, Returns ------- - a new block(s), the result of the putmask + a new block, the result of the putmask """ inplace = validate_bool_kwarg(inplace, 'inplace') @@ -1803,6 +1844,92 @@ def _unstack(self, unstacker_func, new_columns): return blocks, mask +class ExtensionBlock(NonConsolidatableMixIn, Block): + """Block for holding extension types. + + Notes + ----- + This holds all 3rd-party extension array types. It's also the immediate + parent class for our internal extension types' blocks, CategoricalBlock. + + ExtensionArrays are limited to 1-D. + """ + @property + def _holder(self): + # For extension blocks, the holder is values-dependent. + return type(self.values) + + @property + def is_view(self): + """Extension arrays are never treated as views.""" + return False + + def get_values(self, dtype=None): + # ExtensionArrays must be iterable, so this works. + values = np.asarray(self.values) + if values.ndim == self.ndim - 1: + values = values.reshape((1,) + values.shape) + return values + + def to_dense(self): + return np.asarray(self.values) + + def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): + """ + Take values according to indexer and return them as a block. + """ + if fill_tuple is None: + fill_value = None + else: + fill_value = fill_tuple[0] + + # axis doesn't matter; we are really a single-dim object + # but are passed the axis depending on the calling routing + # if its REALLY axis 0, then this will be a reindex and not a take + new_values = self.values.take(indexer, fill_value=fill_value) + + # if we are a 1-dim object, then always place at 0 + if self.ndim == 1: + new_mgr_locs = [0] + else: + if new_mgr_locs is None: + new_mgr_locs = self.mgr_locs + + return self.make_block_same_class(new_values, new_mgr_locs) + + def _can_hold_element(self, element): + # XXX: We may need to think about pushing this onto the array. + # We're doing the same as CategoricalBlock here. 
+ return True + + def _slice(self, slicer): + """ return a slice of my values """ + + # slice the category + # return same dims as we currently have + + if isinstance(slicer, tuple) and len(slicer) == 2: + if not com.is_null_slice(slicer[0]): + raise AssertionError("invalid slicing for a 1-ndim " + "categorical") + slicer = slicer[1] + + return self.values[slicer] + + def formatting_values(self): + return self.values._formatting_values() + + def concat_same_type(self, to_concat, placement=None): + """ + Concatenate list of single blocks of the same type. + """ + values = self._holder._concat_same_type( + [blk.values for blk in to_concat]) + placement = placement or slice(0, len(values), 1) + return self.make_block_same_class(values, ndim=self.ndim, + placement=placement) + + class NumericBlock(Block): __slots__ = () is_numeric = True @@ -1908,6 +2035,11 @@ def should_store(self, value): class DatetimeLikeBlockMixin(object): + """Mixin class for DatetimeBlock and DatetimeTZBlock.""" + + @property + def _holder(self): + return DatetimeIndex @property def _na_value(self): @@ -1940,6 +2072,10 @@ def __init__(self, values, placement, ndim=None): super(TimeDeltaBlock, self).__init__(values, placement=placement, ndim=ndim) + @property + def _holder(self): + return TimedeltaIndex + @property def _box_func(self): return lambda x: tslib.Timedelta(x, unit='ns') @@ -2315,30 +2451,24 @@ def re_replacer(s): return block -class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock): +class CategoricalBlock(ExtensionBlock): __slots__ = () is_categorical = True _verify_integrity = True _can_hold_na = True - _holder = Categorical _concatenator = staticmethod(_concat._concat_categorical) def __init__(self, values, placement, ndim=None): + from pandas.core.arrays.categorical import _maybe_to_categorical # coerce to categorical if we can super(CategoricalBlock, self).__init__(_maybe_to_categorical(values), - placement=placement, ndim=ndim) + placement=placement, + ndim=ndim) @property - def is_view(self): - """ I am never a view """ - return False - - def to_dense(self): - return self.values.to_dense().view() - - def convert(self, copy=True, **kwargs): - return self.copy() if copy else self + def _holder(self): + return Categorical @property def array_dtype(self): @@ -2347,13 +2477,6 @@ def array_dtype(self): """ return np.object_ - def _slice(self, slicer): - """ return a slice of my values """ - - # slice the category - # return same dims as we currently have - return self.values._slice(slicer) - def _try_coerce_result(self, result): """ reverse of try_coerce_args """ @@ -2390,28 +2513,11 @@ def shift(self, periods, axis=0, mgr=None): return self.make_block_same_class(values=self.values.shift(periods), placement=self.mgr_locs) - def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): - """ - Take values according to indexer and return them as a block.bb - """ - if fill_tuple is None: - fill_value = None - else: - fill_value = fill_tuple[0] - - # axis doesn't matter; we are really a single-dim object - # but are passed the axis depending on the calling routing - # if its REALLY axis 0, then this will be a reindex and not a take - new_values = self.values.take_nd(indexer, fill_value=fill_value) - - # if we are a 1-dim object, then always place at 0 - if self.ndim == 1: - new_mgr_locs = [0] - else: - if new_mgr_locs is None: - new_mgr_locs = self.mgr_locs - - return self.make_block_same_class(new_values, new_mgr_locs) + def to_dense(self): + # Categorical.get_values returns a DatetimeIndex for 
datetime + # categories, so we can't simply use `np.asarray(self.values)` like + # other types. + return self.values.get_values() def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ @@ -2430,6 +2536,15 @@ def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): def concat_same_type(self, to_concat, placement=None): """ Concatenate list of single blocks of the same type. + + Note that this CategoricalBlock._concat_same_type *may* not + return a CategoricalBlock. When the categories in `to_concat` + differ, this will return an object ndarray. + + If / when we decide we don't like that behavior: + + 1. Change Categorical._concat_same_type to use union_categoricals + 2. Delete this method. """ values = self._concatenator([blk.values for blk in to_concat], axis=self.ndim - 1) @@ -2445,12 +2560,29 @@ class DatetimeBlock(DatetimeLikeBlockMixin, Block): _can_hold_na = True def __init__(self, values, placement, ndim=None): - if values.dtype != _NS_DTYPE: - values = conversion.ensure_datetime64ns(values) - + values = self._maybe_coerce_values(values) super(DatetimeBlock, self).__init__(values, placement=placement, ndim=ndim) + def _maybe_coerce_values(self, values): + """Input validation for values passed to __init__. Ensure that + we have datetime64ns, coercing if nescessary. + + Parametetrs + ----------- + values : array-like + Must be convertable to datetime64 + + Returns + ------- + values : ndarray[datetime64ns] + + Overridden by DatetimeTZBlock. + """ + if values.dtype != _NS_DTYPE: + values = conversion.ensure_datetime64ns(values) + return values + def _astype(self, dtype, mgr=None, **kwargs): """ these automatically copy, so copy=True has no effect @@ -2576,12 +2708,37 @@ def set(self, locs, values, check=False): class DatetimeTZBlock(NonConsolidatableMixIn, DatetimeBlock): """ implement a datetime64 block with a tz attribute """ __slots__ = () - _holder = DatetimeIndex _concatenator = staticmethod(_concat._concat_datetime) is_datetimetz = True def __init__(self, values, placement, ndim=2, dtype=None): + # XXX: This will end up calling _maybe_coerce_values twice + # when dtype is not None. It's relatively cheap (just an isinstance) + # but it'd nice to avoid. + # + # If we can remove dtype from __init__, and push that conversion + # push onto the callers, then we can remove this entire __init__ + # and just use DatetimeBlock's. + if dtype is not None: + values = self._maybe_coerce_values(values, dtype=dtype) + super(DatetimeTZBlock, self).__init__(values, placement=placement, + ndim=ndim) + + def _maybe_coerce_values(self, values, dtype=None): + """Input validation for values passed to __init__. Ensure that + we have datetime64TZ, coercing if nescessary. 
+ Parametetrs + ----------- + values : array-like + Must be convertable to datetime64 + dtype : string or DatetimeTZDtype, optional + Does a shallow copy to this tz + + Returns + ------- + values : ndarray[datetime64ns] + """ if not isinstance(values, self._holder): values = self._holder(values) @@ -2593,8 +2750,7 @@ def __init__(self, values, placement, ndim=2, dtype=None): if values.tz is None: raise ValueError("cannot create a DatetimeTZBlock without a tz") - super(DatetimeTZBlock, self).__init__(values, placement=placement, - ndim=ndim) + return values def copy(self, deep=True, mgr=None): """ copy constructor """ @@ -2734,9 +2890,19 @@ class SparseBlock(NonConsolidatableMixIn, Block): _box_to_block_values = False _can_hold_na = True _ftype = 'sparse' - _holder = SparseArray _concatenator = staticmethod(_concat._concat_sparse) + def __init__(self, values, placement, ndim=None): + # Ensure that we have the underlying SparseArray here... + if isinstance(values, ABCSeries): + values = values.values + assert isinstance(values, SparseArray) + super(SparseBlock, self).__init__(values, placement, ndim=ndim) + + @property + def _holder(self): + return SparseArray + @property def shape(self): return (len(self.mgr_locs), self.sp_index.length) @@ -2910,6 +3076,8 @@ def get_block_type(values, dtype=None): cls = BoolBlock elif is_categorical(values): cls = CategoricalBlock + elif is_extension_array_dtype(values): + cls = ExtensionBlock else: cls = ObjectBlock return cls @@ -4663,6 +4831,19 @@ def form_blocks(arrays, names, axes): for i, _, array in items_dict['CategoricalBlock']] blocks.extend(cat_blocks) + if len(items_dict['ExtensionBlock']): + + external_blocks = [] + for i, _, array in items_dict['ExtensionBlock']: + if isinstance(array, ABCSeries): + array = array.values + # Allow our internal arrays to chose their block type. + block_type = getattr(array, '_block_type', ExtensionBlock) + external_blocks.append( + make_block(array, klass=block_type, + fastpath=True, placement=[i])) + blocks.extend(external_blocks) + if len(extra_locs): shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:]) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 22b6d33be9d38..af4e83f506257 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -77,3 +77,26 @@ class NullFrequencyError(ValueError): class AccessorRegistrationWarning(Warning): """Warning for attribute conflicts in accessor registration.""" + + +class AbstractMethodError(NotImplementedError): + """Raise this error instead of NotImplementedError for abstract methods + while keeping compatibility with Python 2 and Python 3. 
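A small sketch of how the new ``AbstractMethodError`` is meant to be used by base classes. The ``Base`` class below is hypothetical and only illustrates the intended pattern.

.. code-block:: python

    from pandas.errors import AbstractMethodError

    class Base(object):
        def method(self):
            raise AbstractMethodError(self)

        @classmethod
        def create(cls):
            raise AbstractMethodError(cls, methodtype='classmethod')

    # Calling either on a subclass that forgot to override them raises
    # "This method must be defined in the concrete class ..." and is still
    # caught by ``except NotImplementedError``.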
+ """ + + def __init__(self, class_instance, methodtype='method'): + types = {'method', 'classmethod', 'staticmethod', 'property'} + if methodtype not in types: + msg = 'methodtype must be one of {}, got {} instead.'.format( + methodtype, types) + raise ValueError(msg) + self.methodtype = methodtype + self.class_instance = class_instance + + def __str__(self): + if self.methodtype == 'classmethod': + name = self.class_instance.__name__ + else: + name = self.class_instance.__class__.__name__ + msg = "This {methodtype} must be defined in the concrete class {name}" + return (msg.format(methodtype=self.methodtype, name=name)) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index d800a7b92b559..eca4dd4cf2106 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -10,12 +10,14 @@ Series, Categorical, CategoricalIndex, IntervalIndex, date_range) from pandas.compat import string_types +from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.dtypes import ( DatetimeTZDtype, PeriodDtype, - IntervalDtype, CategoricalDtype) + IntervalDtype, CategoricalDtype, ExtensionDtype) from pandas.core.dtypes.common import ( is_categorical_dtype, is_categorical, is_datetime64tz_dtype, is_datetimetz, + is_extension_array_dtype, is_period_dtype, is_period, is_dtype_equal, is_datetime64_ns_dtype, is_datetime64_dtype, is_interval_dtype, @@ -742,3 +744,31 @@ def test_categorical_categories(self): tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) c1 = CategoricalDtype(CategoricalIndex(['a', 'b'])) tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) + + +class DummyArray(ExtensionArray): + pass + + +class DummyDtype(ExtensionDtype): + pass + + +class TestExtensionArrayDtype(object): + + @pytest.mark.parametrize('values', [ + pd.Categorical([]), + pd.Categorical([]).dtype, + pd.Series(pd.Categorical([])), + DummyDtype(), + DummyArray(), + ]) + def test_is_extension_array_dtype(self, values): + assert is_extension_array_dtype(values) + + @pytest.mark.parametrize('values', [ + np.array([]), + pd.Series(np.array([])), + ]) + def test_is_not_extension_array_dtype(self, values): + assert not is_extension_array_dtype(values) diff --git a/pandas/tests/internals/test_external_block.py b/pandas/tests/internals/test_external_block.py index 729ee0093b6dc..2487363df8f99 100644 --- a/pandas/tests/internals/test_external_block.py +++ b/pandas/tests/internals/test_external_block.py @@ -5,12 +5,12 @@ import pandas as pd from pandas.core.internals import ( - Block, BlockManager, SingleBlockManager, NonConsolidatableMixIn) + BlockManager, SingleBlockManager, ExtensionBlock) import pytest -class CustomBlock(NonConsolidatableMixIn, Block): +class CustomBlock(ExtensionBlock): _holder = np.ndarray diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index e3490f465b24a..9338aba90d7cb 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -11,9 +11,8 @@ from distutils.version import LooseVersion import itertools from pandas import (Index, MultiIndex, DataFrame, DatetimeIndex, - Series, Categorical) + Series, Categorical, TimedeltaIndex, SparseArray) from pandas.compat import OrderedDict, lrange -from pandas.core.sparse.array import SparseArray from pandas.core.internals import (BlockPlacement, SingleBlockManager, make_block, BlockManager) import pandas.core.algorithms as algos @@ -1263,9 +1262,30 @@ def test_binop_other(self, op, value, dtype): 
assert_series_equal(result, expected) +@pytest.mark.parametrize('typestr, holder', [ + ('category', Categorical), + ('M8[ns]', DatetimeIndex), + ('M8[ns, US/Central]', DatetimeIndex), + ('m8[ns]', TimedeltaIndex), + ('sparse', SparseArray), +]) +def test_holder(typestr, holder): + blk = create_block(typestr, [1]) + assert blk._holder is holder + + def test_deprecated_fastpath(): # GH#19265 values = np.random.rand(3, 3) with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): make_block(values, placement=np.arange(3), fastpath=True) + + +def test_validate_ndim(): + values = np.array([1.0, 2.0]) + placement = slice(2) + msg = "Wrong number of dimensions. values.ndim != ndim \[1 != 2\]" + + with tm.assert_raises_regex(ValueError, msg): + make_block(values, placement, ndim=2) diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 0b7948cc32d24..54f567bcd2a8c 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -574,6 +574,15 @@ def test_setitem_array(self): self.frame['F'].reindex(index), check_names=False) + def test_setitem_chained_no_consolidate(self): + # https://github.com/pandas-dev/pandas/pull/19268 + # issuecomment-361696418 + # chained setitem used to cause consolidation + sdf = pd.SparseDataFrame([[np.nan, 1], [2, np.nan]]) + with pd.option_context('mode.chained_assignment', None): + sdf[0][1] = 2 + assert len(sdf._data.blocks) == 2 + def test_delitem(self): A = self.frame['A'] C = self.frame['C'] diff --git a/pandas/tests/test_errors.py b/pandas/tests/test_errors.py index babf88ef1df8d..e2a142366a89e 100644 --- a/pandas/tests/test_errors.py +++ b/pandas/tests/test_errors.py @@ -4,6 +4,8 @@ from warnings import catch_warnings import pandas # noqa import pandas as pd +from pandas.errors import AbstractMethodError +import pandas.util.testing as tm @pytest.mark.parametrize( @@ -50,3 +52,30 @@ def test_error_rename(): raise ParserError() except pd.parser.CParserError: pass + + +class Foo: + @classmethod + def classmethod(cls): + raise AbstractMethodError(cls, methodtype='classmethod') + + @property + def property(self): + raise AbstractMethodError(self, methodtype='property') + + def method(self): + raise AbstractMethodError(self) + + +def test_AbstractMethodError_classmethod(): + xpr = "This classmethod must be defined in the concrete class Foo" + with tm.assert_raises_regex(AbstractMethodError, xpr): + Foo.classmethod() + + xpr = "This property must be defined in the concrete class Foo" + with tm.assert_raises_regex(AbstractMethodError, xpr): + Foo().property + + xpr = "This method must be defined in the concrete class Foo" + with tm.assert_raises_regex(AbstractMethodError, xpr): + Foo().method() From ed3afc4afac1e46f1da8e24ca7848f2420b8ef9e Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Sat, 3 Feb 2018 20:30:29 +0000 Subject: [PATCH 035/214] ERR: Better error msg when merging on tz-aware and tz-naive columns (#19525) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/reshape/merge.py | 27 +++++++++--------------- pandas/tests/reshape/merge/test_merge.py | 4 +++- 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 26a7a78bb5c55..69965f44d87a8 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -342,6 +342,7 @@ Other API Changes - Addition and subtraction of ``NaN`` from a :class:`Series` with ``dtype='timedelta64[ns]'`` will raise a ``TypeError` 
instead of treating the ``NaN`` as ``NaT`` (:issue:`19274`) - Set operations (union, difference...) on :class:`IntervalIndex` with incompatible index types will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`19329`) - :class:`DateOffset` objects render more simply, e.g. "" instead of "" (:issue:`19403`) +- :func:`pandas.merge` provides a more informative error message when trying to merge on timezone-aware and timezone-naive columns (:issue:`15800`) .. _whatsnew_0230.deprecations: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 99ea2c4fe4688..3ec78ce52c6e5 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -940,6 +940,11 @@ def _maybe_coerce_merge_keys(self): elif is_dtype_equal(lk.dtype, rk.dtype): continue + msg = ("You are trying to merge on {lk_dtype} and " + "{rk_dtype} columns. If you wish to proceed " + "you should use pd.concat".format(lk_dtype=lk.dtype, + rk_dtype=rk.dtype)) + # if we are numeric, then allow differing # kinds to proceed, eg. int64 and int8, int and float # further if we are object, but we infer to @@ -968,30 +973,18 @@ def _maybe_coerce_merge_keys(self): pass # Check if we are trying to merge on obviously - # incompatible dtypes GH 9780 + # incompatible dtypes GH 9780, GH 15800 elif is_numeric_dtype(lk) and not is_numeric_dtype(rk): - msg = ("You are trying to merge on {lk_dtype} and " - "{rk_dtype} columns. If you wish to proceed " - "you should use pd.concat".format(lk_dtype=lk.dtype, - rk_dtype=rk.dtype)) raise ValueError(msg) elif not is_numeric_dtype(lk) and is_numeric_dtype(rk): - msg = ("You are trying to merge on {lk_dtype} and " - "{rk_dtype} columns. If you wish to proceed " - "you should use pd.concat".format(lk_dtype=lk.dtype, - rk_dtype=rk.dtype)) raise ValueError(msg) elif is_datetimelike(lk) and not is_datetimelike(rk): - msg = ("You are trying to merge on {lk_dtype} and " - "{rk_dtype} columns. If you wish to proceed " - "you should use pd.concat".format(lk_dtype=lk.dtype, - rk_dtype=rk.dtype)) raise ValueError(msg) elif not is_datetimelike(lk) and is_datetimelike(rk): - msg = ("You are trying to merge on {lk_dtype} and " - "{rk_dtype} columns. If you wish to proceed " - "you should use pd.concat".format(lk_dtype=lk.dtype, - rk_dtype=rk.dtype)) + raise ValueError(msg) + elif is_datetime64tz_dtype(lk) and not is_datetime64tz_dtype(rk): + raise ValueError(msg) + elif not is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk): raise ValueError(msg) # Houston, we have a problem! 
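The effect of the new merge check can be sketched from the public API (column names below are illustrative):

.. code-block:: python

    import pandas as pd

    left = pd.DataFrame({'key': pd.date_range('20130101', periods=3),
                         'lval': [1, 2, 3]})
    right = pd.DataFrame({'key': pd.date_range('20130101', periods=3,
                                                tz='US/Eastern'),
                          'rval': [4, 5, 6]})

    # Raises ValueError: "You are trying to merge on datetime64[ns] and
    # datetime64[ns, US/Eastern] columns. If you wish to proceed you should
    # use pd.concat"
    pd.merge(left, right, on='key')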
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index a8319339c6435..f63c206c0c407 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1512,11 +1512,13 @@ def test_merge_on_ints_floats_warning(self): '2011-01-02']), (pd.date_range('1/1/2011', periods=2, freq='D'), [0, 1]), (pd.date_range('1/1/2011', periods=2, freq='D'), [0.0, 1.0]), + (pd.date_range('20130101', periods=3), + pd.date_range('20130101', periods=3, tz='US/Eastern')), ([0, 1, 2], Series(['a', 'b', 'a']).astype('category')), ([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')), ]) def test_merge_incompat_dtypes(self, df1_vals, df2_vals): - # GH 9780 + # GH 9780, GH 15800 # Raise a ValueError when a user tries to merge on # dtypes that are incompatible (e.g., obj and int/float) From 3f3b4e0bc15107153167ec605b45676113ffb9c1 Mon Sep 17 00:00:00 2001 From: Tommy <10076072+tommyod@users.noreply.github.com> Date: Sat, 3 Feb 2018 21:32:56 +0100 Subject: [PATCH 036/214] DOC: Spellcheck of enhancingperf.rst (#19516) * Spellchecked enhancingperf, sparse * Uppercased 'cython' to 'Cython' * Typeset variants of numba as 'Numba', as on their page * Updated reference to Numba docs to latest version --- doc/source/enhancingperf.rst | 81 +++++++++++++++++++++++------------- doc/source/sparse.rst | 8 ++-- 2 files changed, 55 insertions(+), 34 deletions(-) diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst index 7afa852262a38..b786b1d0c134a 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -19,6 +19,13 @@ Enhancing Performance ********************* +In this part of the tutorial, we will investigate how to speed up certain +functions operating on pandas ``DataFrames`` using three different techniques: +Cython, Numba and :func:`pandas.eval`. We will see a speed improvement of ~200 +when we use Cython and Numba on a test function operating row-wise on the +``DataFrame``. Using :func:`pandas.eval` we will speed up a sum by an order of +~2. + .. _enhancingperf.cython: Cython (Writing C extensions for pandas) @@ -29,20 +36,20 @@ computationally heavy applications however, it can be possible to achieve sizeab speed-ups by offloading work to `cython `__. This tutorial assumes you have refactored as much as possible in Python, for example -trying to remove for loops and making use of NumPy vectorization, it's always worth +by trying to remove for-loops and making use of NumPy vectorization. It's always worth optimising in Python first. This tutorial walks through a "typical" process of cythonizing a slow computation. -We use an `example from the cython documentation `__ +We use an `example from the Cython documentation `__ but in the context of pandas. Our final cythonized solution is around 100 times -faster than the pure Python. +faster than the pure Python solution. .. _enhancingperf.pure: Pure python ~~~~~~~~~~~ -We have a DataFrame to which we want to apply a function row-wise. +We have a ``DataFrame`` to which we want to apply a function row-wise. .. ipython:: python @@ -91,10 +98,10 @@ hence we'll concentrate our efforts cythonizing these two functions. .. _enhancingperf.plain: -Plain cython +Plain Cython ~~~~~~~~~~~~ -First we're going to need to import the cython magic function to ipython: +First we're going to need to import the Cython magic function to ipython: .. 
ipython:: python :okwarning: @@ -102,7 +109,7 @@ First we're going to need to import the cython magic function to ipython: %load_ext Cython -Now, let's simply copy our functions over to cython as is (the suffix +Now, let's simply copy our functions over to Cython as is (the suffix is here to distinguish between function versions): .. ipython:: @@ -177,8 +184,8 @@ in Python, so maybe we could minimize these by cythonizing the apply part. .. note:: - We are now passing ndarrays into the cython function, fortunately cython plays - very nicely with numpy. + We are now passing ndarrays into the Cython function, fortunately Cython plays + very nicely with NumPy. .. ipython:: @@ -213,9 +220,9 @@ the rows, applying our ``integrate_f_typed``, and putting this in the zeros arra .. warning:: You can **not pass** a ``Series`` directly as a ``ndarray`` typed parameter - to a cython function. Instead pass the actual ``ndarray`` using the - ``.values`` attribute of the Series. The reason is that the cython - definition is specific to an ndarray and not the passed Series. + to a Cython function. Instead pass the actual ``ndarray`` using the + ``.values`` attribute of the ``Series``. The reason is that the Cython + definition is specific to an ndarray and not the passed ``Series``. So, do not do this: @@ -223,7 +230,7 @@ the rows, applying our ``integrate_f_typed``, and putting this in the zeros arra apply_integrate_f(df['a'], df['b'], df['N']) - But rather, use ``.values`` to get the underlying ``ndarray`` + But rather, use ``.values`` to get the underlying ``ndarray``: .. code-block:: python @@ -255,7 +262,7 @@ More advanced techniques ~~~~~~~~~~~~~~~~~~~~~~~~ There is still hope for improvement. Here's an example of using some more -advanced cython techniques: +advanced Cython techniques: .. ipython:: @@ -289,16 +296,17 @@ advanced cython techniques: In [4]: %timeit apply_integrate_f_wrap(df['a'].values, df['b'].values, df['N'].values) 1000 loops, best of 3: 987 us per loop -Even faster, with the caveat that a bug in our cython code (an off-by-one error, +Even faster, with the caveat that a bug in our Cython code (an off-by-one error, for example) might cause a segfault because memory access isn't checked. - +For more about ``boundscheck`` and ``wraparound``, see the Cython docs on +`compiler directives `__. .. _enhancingperf.numba: -Using numba +Using Numba ----------- -A recent alternative to statically compiling cython code, is to use a *dynamic jit-compiler*, ``numba``. +A recent alternative to statically compiling Cython code, is to use a *dynamic jit-compiler*, Numba. Numba gives you the power to speed up your applications with high performance functions written directly in Python. With a few annotations, array-oriented and math-heavy Python code can be just-in-time compiled to native machine instructions, similar in performance to C, C++ and Fortran, without having to switch languages or Python interpreters. @@ -306,16 +314,17 @@ Numba works by generating optimized machine code using the LLVM compiler infrast .. note:: - You will need to install ``numba``. This is easy with ``conda``, by using: ``conda install numba``, see :ref:`installing using miniconda`. + You will need to install Numba. This is easy with ``conda``, by using: ``conda install numba``, see :ref:`installing using miniconda`. .. note:: - As of ``numba`` version 0.20, pandas objects cannot be passed directly to numba-compiled functions. 
Instead, one must pass the ``numpy`` array underlying the ``pandas`` object to the numba-compiled function as demonstrated below. + As of Numba version 0.20, pandas objects cannot be passed directly to Numba-compiled functions. Instead, one must pass the NumPy array underlying the pandas object to the Numba-compiled function as demonstrated below. Jit ~~~ -Using ``numba`` to just-in-time compile your code. We simply take the plain Python code from above and annotate with the ``@jit`` decorator. +We demonstrate how to use Numba to just-in-time compile our code. We simply +take the plain Python code from above and annotate with the ``@jit`` decorator. .. code-block:: python @@ -346,17 +355,19 @@ Using ``numba`` to just-in-time compile your code. We simply take the plain Pyth result = apply_integrate_f_numba(df['a'].values, df['b'].values, df['N'].values) return pd.Series(result, index=df.index, name='result') -Note that we directly pass ``numpy`` arrays to the numba function. ``compute_numba`` is just a wrapper that provides a nicer interface by passing/returning pandas objects. +Note that we directly pass NumPy arrays to the Numba function. ``compute_numba`` is just a wrapper that provides a nicer interface by passing/returning pandas objects. .. code-block:: ipython In [4]: %timeit compute_numba(df) 1000 loops, best of 3: 798 us per loop +In this example, using Numba was faster than Cython. + Vectorize ~~~~~~~~~ -``numba`` can also be used to write vectorized functions that do not require the user to explicitly +Numba can also be used to write vectorized functions that do not require the user to explicitly loop over the observations of a vector; a vectorized function will be applied to each row automatically. Consider the following toy example of doubling each observation: @@ -389,13 +400,23 @@ Caveats .. note:: - ``numba`` will execute on any function, but can only accelerate certain classes of functions. + Numba will execute on any function, but can only accelerate certain classes of functions. -``numba`` is best at accelerating functions that apply numerical functions to NumPy arrays. When passed a function that only uses operations it knows how to accelerate, it will execute in ``nopython`` mode. +Numba is best at accelerating functions that apply numerical functions to NumPy +arrays. When passed a function that only uses operations it knows how to +accelerate, it will execute in ``nopython`` mode. -If ``numba`` is passed a function that includes something it doesn't know how to work with -- a category that currently includes sets, lists, dictionaries, or string functions -- it will revert to ``object mode``. In ``object mode``, numba will execute but your code will not speed up significantly. If you would prefer that ``numba`` throw an error if it cannot compile a function in a way that speeds up your code, pass numba the argument ``nopython=True`` (e.g. ``@numba.jit(nopython=True)``). For more on troubleshooting ``numba`` modes, see the `numba troubleshooting page `__. +If Numba is passed a function that includes something it doesn't know how to +work with -- a category that currently includes sets, lists, dictionaries, or +string functions -- it will revert to ``object mode``. In ``object mode``, +Numba will execute but your code will not speed up significantly. If you would +prefer that Numba throw an error if it cannot compile a function in a way that +speeds up your code, pass Numba the argument +``nopython=True`` (e.g. ``@numba.jit(nopython=True)``). 
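A minimal sketch of the ``nopython=True`` pattern discussed in this section; the function itself is made up for illustration and only uses operations Numba can compile.

.. code-block:: python

    import numba
    import numpy as np

    @numba.jit(nopython=True)
    def sum_of_squares(arr):
        # plain numerical loop over a NumPy array: compiles in nopython mode
        total = 0.0
        for x in arr:
            total += x * x
        return total

    sum_of_squares(np.arange(10, dtype=np.float64))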
For more on +troubleshooting Numba modes, see the `Numba troubleshooting page +`__. -Read more in the `numba docs `__. +Read more in the `Numba docs `__. .. _enhancingperf.eval: @@ -448,7 +469,7 @@ These operations are supported by :func:`pandas.eval`: - Attribute access, e.g., ``df.a`` - Subscript expressions, e.g., ``df[0]`` - Simple variable evaluation, e.g., ``pd.eval('df')`` (this is not very useful) -- Math functions, `sin`, `cos`, `exp`, `log`, `expm1`, `log1p`, +- Math functions: `sin`, `cos`, `exp`, `log`, `expm1`, `log1p`, `sqrt`, `sinh`, `cosh`, `tanh`, `arcsin`, `arccos`, `arctan`, `arccosh`, `arcsinh`, `arctanh`, `abs` and `arctan2`. @@ -581,7 +602,7 @@ on the original ``DataFrame`` or return a copy with the new column. For backwards compatibility, ``inplace`` defaults to ``True`` if not specified. This will change in a future version of pandas - if your code depends on an inplace assignment you should update to explicitly - set ``inplace=True`` + set ``inplace=True``. .. ipython:: python @@ -780,7 +801,7 @@ Technical Minutia Regarding Expression Evaluation Expressions that would result in an object dtype or involve datetime operations (because of ``NaT``) must be evaluated in Python space. The main reason for this behavior is to maintain backwards compatibility with versions of NumPy < -1.7. In those versions of ``numpy`` a call to ``ndarray.astype(str)`` will +1.7. In those versions of NumPy a call to ``ndarray.astype(str)`` will truncate any strings that are more than 60 characters in length. Second, we can't pass ``object`` arrays to ``numexpr`` thus string comparisons must be evaluated in Python space. diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst index 2e224f103a95e..260d8aa32ef52 100644 --- a/doc/source/sparse.rst +++ b/doc/source/sparse.rst @@ -17,11 +17,11 @@ Sparse data structures .. note:: The ``SparsePanel`` class has been removed in 0.19.0 -We have implemented "sparse" versions of Series and DataFrame. These are not sparse +We have implemented "sparse" versions of ``Series`` and ``DataFrame``. These are not sparse in the typical "mostly 0". Rather, you can view these objects as being "compressed" where any data matching a specific value (``NaN`` / missing value, though any value can be chosen) is omitted. A special ``SparseIndex`` object tracks where data has been -"sparsified". This will make much more sense in an example. All of the standard pandas +"sparsified". This will make much more sense with an example. All of the standard pandas data structures have a ``to_sparse`` method: .. ipython:: python @@ -32,7 +32,7 @@ data structures have a ``to_sparse`` method: sts The ``to_sparse`` method takes a ``kind`` argument (for the sparse index, see -below) and a ``fill_value``. So if we had a mostly zero Series, we could +below) and a ``fill_value``. So if we had a mostly zero ``Series``, we could convert it to sparse with ``fill_value=0``: .. ipython:: python @@ -40,7 +40,7 @@ convert it to sparse with ``fill_value=0``: ts.fillna(0).to_sparse(fill_value=0) The sparse objects exist for memory efficiency reasons. Suppose you had a -large, mostly NA DataFrame: +large, mostly NA ``DataFrame``: .. 
ipython:: python From 1bf7688d5bb4d70bf5e8b7e74abe055472c9adfe Mon Sep 17 00:00:00 2001 From: jschendel Date: Sun, 4 Feb 2018 08:54:14 -0700 Subject: [PATCH 037/214] TST: Remove duplicate TimdeltaIndex tests (#19509) --- .../tests/indexes/timedeltas/test_astype.py | 49 +++---------------- 1 file changed, 6 insertions(+), 43 deletions(-) diff --git a/pandas/tests/indexes/timedeltas/test_astype.py b/pandas/tests/indexes/timedeltas/test_astype.py index af16fe71edcf3..c3bd857036efc 100644 --- a/pandas/tests/indexes/timedeltas/test_astype.py +++ b/pandas/tests/indexes/timedeltas/test_astype.py @@ -2,36 +2,20 @@ import numpy as np -import pandas as pd import pandas.util.testing as tm from pandas import (TimedeltaIndex, timedelta_range, Int64Index, Float64Index, - Index, Timedelta) + Index, Timedelta, NaT) -from ..datetimelike import DatetimeLike - -class TestTimedeltaIndex(DatetimeLike): - _holder = TimedeltaIndex +class TestTimedeltaIndex(object): _multiprocess_can_split_ = True - def test_numeric_compat(self): - # Dummy method to override super's version; this test is now done - # in test_arithmetic.py - pass - - def setup_method(self, method): - self.indices = dict(index=tm.makeTimedeltaIndex(10)) - self.setup_indices() - - def create_index(self): - return pd.to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) - def test_astype(self): # GH 13149, GH 13209 - idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN]) + idx = TimedeltaIndex([1e14, 'NaT', NaT, np.NaN]) result = idx.astype(object) - expected = Index([Timedelta('1 days 03:46:40')] + [pd.NaT] * 3, + expected = Index([Timedelta('1 days 03:46:40')] + [NaT] * 3, dtype=object) tm.assert_index_equal(result, expected) @@ -51,7 +35,7 @@ def test_astype(self): def test_astype_timedelta64(self): # GH 13149, GH 13209 - idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN]) + idx = TimedeltaIndex([1e14, 'NaT', NaT, np.NaN]) result = idx.astype('timedelta64') expected = Float64Index([1e+14] + [np.NaN] * 3, dtype='float64') @@ -69,28 +53,7 @@ def test_astype_timedelta64(self): float, 'datetime64', 'datetime64[ns]']) def test_astype_raises(self, dtype): # GH 13149, GH 13209 - idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN]) + idx = TimedeltaIndex([1e14, 'NaT', NaT, np.NaN]) msg = 'Cannot cast TimedeltaIndex to dtype' with tm.assert_raises_regex(TypeError, msg): idx.astype(dtype) - - def test_pickle_compat_construction(self): - pass - - def test_shift(self): - # test shift for TimedeltaIndex - # err8083 - - drange = self.create_index() - result = drange.shift(1) - expected = TimedeltaIndex(['1 days 01:00:00', '2 days 01:00:00', - '3 days 01:00:00', - '4 days 01:00:00', '5 days 01:00:00'], - freq='D') - tm.assert_index_equal(result, expected) - - result = drange.shift(3, freq='2D 1s') - expected = TimedeltaIndex(['6 days 01:00:03', '7 days 01:00:03', - '8 days 01:00:03', '9 days 01:00:03', - '10 days 01:00:03'], freq='D') - tm.assert_index_equal(result, expected) From 58f2a4c99a358cd172aded6a59bba8fb6333e7a2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 4 Feb 2018 08:05:30 -0800 Subject: [PATCH 038/214] Frame specific parts of #19504 (#19512) --- pandas/tests/frame/test_timezones.py | 135 +++++++++++++++++++++++++ pandas/tests/tseries/test_timezones.py | 123 ++-------------------- 2 files changed, 144 insertions(+), 114 deletions(-) create mode 100644 pandas/tests/frame/test_timezones.py diff --git a/pandas/tests/frame/test_timezones.py b/pandas/tests/frame/test_timezones.py new file mode 100644 index 0000000000000..fa589a0aa4817 --- /dev/null 
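The behaviour exercised by these ``TimedeltaIndex.astype`` tests, sketched with the public API:

.. code-block:: python

    import numpy as np
    import pandas as pd

    idx = pd.TimedeltaIndex([1e14, 'NaT', pd.NaT, np.nan])

    idx.astype(object)            # Index([Timedelta('1 days 03:46:40'), NaT, NaT, NaT], dtype=object)
    idx.astype('timedelta64')     # Float64Index([1e14, nan, nan, nan]) -- generic unit becomes float64
    idx.astype('datetime64[ns]')  # raises TypeError: Cannot cast TimedeltaIndex to dtype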
+++ b/pandas/tests/frame/test_timezones.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- +""" +Tests for DataFrame timezone-related methods +""" +from datetime import datetime + +import pytest +import pytz +import numpy as np + +import pandas.util.testing as tm +from pandas.compat import lrange +from pandas.core.indexes.datetimes import date_range +from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas import Series, DataFrame + + +class TestDataFrameTimezones(object): + def test_frame_from_records_utc(self): + rec = {'datum': 1.5, + 'begin_time': datetime(2006, 4, 27, tzinfo=pytz.utc)} + + # it works + DataFrame.from_records([rec], index='begin_time') + + def test_frame_tz_localize(self): + rng = date_range('1/1/2011', periods=100, freq='H') + + df = DataFrame({'a': 1}, index=rng) + result = df.tz_localize('utc') + expected = DataFrame({'a': 1}, rng.tz_localize('UTC')) + assert result.index.tz.zone == 'UTC' + tm.assert_frame_equal(result, expected) + + df = df.T + result = df.tz_localize('utc', axis=1) + assert result.columns.tz.zone == 'UTC' + tm.assert_frame_equal(result, expected.T) + + def test_frame_tz_convert(self): + rng = date_range('1/1/2011', periods=200, freq='D', tz='US/Eastern') + + df = DataFrame({'a': 1}, index=rng) + result = df.tz_convert('Europe/Berlin') + expected = DataFrame({'a': 1}, rng.tz_convert('Europe/Berlin')) + assert result.index.tz.zone == 'Europe/Berlin' + tm.assert_frame_equal(result, expected) + + df = df.T + result = df.tz_convert('Europe/Berlin', axis=1) + assert result.columns.tz.zone == 'Europe/Berlin' + tm.assert_frame_equal(result, expected.T) + + def test_frame_join_tzaware(self): + test1 = DataFrame(np.zeros((6, 3)), + index=date_range("2012-11-15 00:00:00", periods=6, + freq="100L", tz="US/Central")) + test2 = DataFrame(np.zeros((3, 3)), + index=date_range("2012-11-15 00:00:00", periods=3, + freq="250L", tz="US/Central"), + columns=lrange(3, 6)) + + result = test1.join(test2, how='outer') + ex_index = test1.index.union(test2.index) + + tm.assert_index_equal(result.index, ex_index) + assert result.index.tz.zone == 'US/Central' + + def test_frame_add_tz_mismatch_converts_to_utc(self): + rng = date_range('1/1/2011', periods=10, freq='H', tz='US/Eastern') + df = DataFrame(np.random.randn(len(rng)), index=rng, columns=['a']) + + df_moscow = df.tz_convert('Europe/Moscow') + result = df + df_moscow + assert result.index.tz is pytz.utc + + result = df_moscow + df + assert result.index.tz is pytz.utc + + def test_frame_align_aware(self): + idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern') + idx2 = date_range('2001', periods=5, freq='2H', tz='US/Eastern') + df1 = DataFrame(np.random.randn(len(idx1), 3), idx1) + df2 = DataFrame(np.random.randn(len(idx2), 3), idx2) + new1, new2 = df1.align(df2) + assert df1.index.tz == new1.index.tz + assert df2.index.tz == new2.index.tz + + # different timezones convert to UTC + + # frame with frame + df1_central = df1.tz_convert('US/Central') + new1, new2 = df1.align(df1_central) + assert new1.index.tz == pytz.UTC + assert new2.index.tz == pytz.UTC + + # frame with Series + new1, new2 = df1.align(df1_central[0], axis=0) + assert new1.index.tz == pytz.UTC + assert new2.index.tz == pytz.UTC + + df1[0].align(df1_central, axis=0) + assert new1.index.tz == pytz.UTC + assert new2.index.tz == pytz.UTC + + @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern']) + def test_frame_no_datetime64_dtype(self, tz): + # after GH#7822 + # these retain the timezones on dict construction + dr = 
date_range('2011/1/1', '2012/1/1', freq='W-FRI') + dr_tz = dr.tz_localize(tz) + df = DataFrame({'A': 'foo', 'B': dr_tz}, index=dr) + tz_expected = DatetimeTZDtype('ns', dr_tz.tzinfo) + assert df['B'].dtype == tz_expected + + # GH#2810 (with timezones) + datetimes_naive = [ts.to_pydatetime() for ts in dr] + datetimes_with_tz = [ts.to_pydatetime() for ts in dr_tz] + df = DataFrame({'dr': dr, + 'dr_tz': dr_tz, + 'datetimes_naive': datetimes_naive, + 'datetimes_with_tz': datetimes_with_tz}) + result = df.get_dtype_counts().sort_index() + expected = Series({'datetime64[ns]': 2, + str(tz_expected): 2}).sort_index() + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern']) + def test_frame_reset_index(self, tz): + dr = date_range('2012-06-02', periods=10, tz=tz) + df = DataFrame(np.random.randn(len(dr)), dr) + roundtripped = df.reset_index().set_index('index') + xp = df.index.tz + rs = roundtripped.index.tz + assert xp == rs diff --git a/pandas/tests/tseries/test_timezones.py b/pandas/tests/tseries/test_timezones.py index cc5f4d30f9aaf..e47be69b79feb 100644 --- a/pandas/tests/tseries/test_timezones.py +++ b/pandas/tests/tseries/test_timezones.py @@ -16,13 +16,11 @@ import pandas.tseries.offsets as offsets from pandas.compat import lrange, zip from pandas.core.indexes.datetimes import bdate_range, date_range -from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas._libs import tslib from pandas._libs.tslibs import timezones, conversion -from pandas import (Index, Series, DataFrame, isna, Timestamp, NaT, +from pandas import (Index, Series, isna, Timestamp, NaT, DatetimeIndex, to_datetime) -from pandas.util.testing import (assert_frame_equal, assert_series_equal, - set_timezone) +from pandas.util.testing import assert_series_equal, set_timezone class FixedOffset(tzinfo): @@ -786,29 +784,6 @@ def test_to_datetime_tzlocal(self): result = to_datetime(arr, utc=True) assert result.tz is pytz.utc - def test_frame_no_datetime64_dtype(self): - - # after 7822 - # these retain the timezones on dict construction - - dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI') - dr_tz = dr.tz_localize(self.tzstr('US/Eastern')) - e = DataFrame({'A': 'foo', 'B': dr_tz}, index=dr) - tz_expected = DatetimeTZDtype('ns', dr_tz.tzinfo) - assert e['B'].dtype == tz_expected - - # GH 2810 (with timezones) - datetimes_naive = [ts.to_pydatetime() for ts in dr] - datetimes_with_tz = [ts.to_pydatetime() for ts in dr_tz] - df = DataFrame({'dr': dr, - 'dr_tz': dr_tz, - 'datetimes_naive': datetimes_naive, - 'datetimes_with_tz': datetimes_with_tz}) - result = df.get_dtype_counts().sort_index() - expected = Series({'datetime64[ns]': 2, - str(tz_expected): 2}).sort_index() - assert_series_equal(result, expected) - def test_hongkong_tz_convert(self): # #1673 dr = date_range('2012-01-01', '2012-01-10', freq='D', tz='Hongkong') @@ -872,21 +847,6 @@ def test_convert_datetime_list(self): assert dr.tz == dr2.tz assert dr2.name == 'foo' - def test_frame_from_records_utc(self): - rec = {'datum': 1.5, - 'begin_time': datetime(2006, 4, 27, tzinfo=pytz.utc)} - - # it works - DataFrame.from_records([rec], index='begin_time') - - def test_frame_reset_index(self): - dr = date_range('2012-06-02', periods=10, tz=self.tzstr('US/Eastern')) - df = DataFrame(np.random.randn(len(dr)), dr) - roundtripped = df.reset_index().set_index('index') - xp = df.index.tz - rs = roundtripped.index.tz - assert xp == rs - def test_dateutil_tzoffset_support(self): values = [188.5, 328.25] tzinfo = 
tzoffset(None, 7200) @@ -1289,7 +1249,7 @@ def test_tz_localize_roundtrip(self): tm.assert_index_equal(reset, idx) assert reset.tzinfo is None - def test_series_frame_tz_localize(self): + def test_series_tz_localize(self): rng = date_range('1/1/2011', periods=100, freq='H') ts = Series(1, index=rng) @@ -1297,41 +1257,19 @@ def test_series_frame_tz_localize(self): result = ts.tz_localize('utc') assert result.index.tz.zone == 'UTC' - df = DataFrame({'a': 1}, index=rng) - result = df.tz_localize('utc') - expected = DataFrame({'a': 1}, rng.tz_localize('UTC')) - assert result.index.tz.zone == 'UTC' - assert_frame_equal(result, expected) - - df = df.T - result = df.tz_localize('utc', axis=1) - assert result.columns.tz.zone == 'UTC' - assert_frame_equal(result, expected.T) - # Can't localize if already tz-aware rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') ts = Series(1, index=rng) tm.assert_raises_regex(TypeError, 'Already tz-aware', ts.tz_localize, 'US/Eastern') - def test_series_frame_tz_convert(self): + def test_series_tz_convert(self): rng = date_range('1/1/2011', periods=200, freq='D', tz='US/Eastern') ts = Series(1, index=rng) result = ts.tz_convert('Europe/Berlin') assert result.index.tz.zone == 'Europe/Berlin' - df = DataFrame({'a': 1}, index=rng) - result = df.tz_convert('Europe/Berlin') - expected = DataFrame({'a': 1}, rng.tz_convert('Europe/Berlin')) - assert result.index.tz.zone == 'Europe/Berlin' - assert_frame_equal(result, expected) - - df = df.T - result = df.tz_convert('Europe/Berlin', axis=1) - assert result.columns.tz.zone == 'Europe/Berlin' - assert_frame_equal(result, expected.T) - # can't convert tz-naive rng = date_range('1/1/2011', periods=200, freq='D') ts = Series(1, index=rng) @@ -1389,20 +1327,6 @@ def test_join_aware(self): pytest.raises(Exception, ts.__add__, ts_utc) pytest.raises(Exception, ts_utc.__add__, ts) - test1 = DataFrame(np.zeros((6, 3)), - index=date_range("2012-11-15 00:00:00", periods=6, - freq="100L", tz="US/Central")) - test2 = DataFrame(np.zeros((3, 3)), - index=date_range("2012-11-15 00:00:00", periods=3, - freq="250L", tz="US/Central"), - columns=lrange(3, 6)) - - result = test1.join(test2, how='outer') - ex_index = test1.index.union(test2.index) - - tm.assert_index_equal(result.index, ex_index) - assert result.index.tz.zone == 'US/Central' - # non-overlapping rng = date_range("2012-11-15 00:00:00", periods=6, freq="H", tz="US/Central") @@ -1413,34 +1337,13 @@ def test_join_aware(self): result = rng.union(rng2) assert result.tz.zone == 'UTC' - def test_align_aware(self): + def test_series_align_aware(self): idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern') - idx2 = date_range('2001', periods=5, freq='2H', tz='US/Eastern') - df1 = DataFrame(np.random.randn(len(idx1), 3), idx1) - df2 = DataFrame(np.random.randn(len(idx2), 3), idx2) - new1, new2 = df1.align(df2) - assert df1.index.tz == new1.index.tz - assert df2.index.tz == new2.index.tz - + ser = Series(np.random.randn(len(idx1)), index=idx1) + ser_central = ser.tz_convert('US/Central') # # different timezones convert to UTC - # frame - df1_central = df1.tz_convert('US/Central') - new1, new2 = df1.align(df1_central) - assert new1.index.tz == pytz.UTC - assert new2.index.tz == pytz.UTC - - # series - new1, new2 = df1[0].align(df1_central[0]) - assert new1.index.tz == pytz.UTC - assert new2.index.tz == pytz.UTC - - # combination - new1, new2 = df1.align(df1_central[0], axis=0) - assert new1.index.tz == pytz.UTC - assert new2.index.tz == pytz.UTC - - 
df1[0].align(df1_central, axis=0) + new1, new2 = ser.align(ser_central) assert new1.index.tz == pytz.UTC assert new2.index.tz == pytz.UTC @@ -1523,7 +1426,7 @@ def test_append_aware_naive(self): assert ts_result.index.equals(ts1.index.astype(object).append( ts2.index)) - def test_equal_join_ensure_utc(self): + def test_series_add_tz_mismatch_converts_to_utc(self): rng = date_range('1/1/2011', periods=10, freq='H', tz='US/Eastern') ts = Series(np.random.randn(len(rng)), index=rng) @@ -1535,14 +1438,6 @@ def test_equal_join_ensure_utc(self): result = ts_moscow + ts assert result.index.tz is pytz.utc - df = DataFrame({'a': ts}) - df_moscow = df.tz_convert('Europe/Moscow') - result = df + df_moscow - assert result.index.tz is pytz.utc - - result = df_moscow + df - assert result.index.tz is pytz.utc - def test_arith_utc_convert(self): rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') From a44f1c1b1d5944946c3fa6b15c7f962e015f2444 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 4 Feb 2018 08:06:51 -0800 Subject: [PATCH 039/214] split Timestamp tests off of 19504 (#19511) --- .../tests/scalar/timestamp/test_timezones.py | 189 +++++++++++++++++ pandas/tests/tseries/test_timezones.py | 195 +----------------- 2 files changed, 190 insertions(+), 194 deletions(-) diff --git a/pandas/tests/scalar/timestamp/test_timezones.py b/pandas/tests/scalar/timestamp/test_timezones.py index eeec70cc234f5..7a5c6feb8b651 100644 --- a/pandas/tests/scalar/timestamp/test_timezones.py +++ b/pandas/tests/scalar/timestamp/test_timezones.py @@ -2,11 +2,18 @@ """ Tests for Timestamp timezone-related methods """ +from datetime import date, timedelta +from distutils.version import LooseVersion import pytest +import pytz from pytz.exceptions import AmbiguousTimeError, NonExistentTimeError +import dateutil +from dateutil.tz import gettz, tzoffset import pandas.util.testing as tm +import pandas.util._test_decorators as td + from pandas import Timestamp, NaT @@ -14,6 +21,22 @@ class TestTimestampTZOperations(object): # -------------------------------------------------------------- # Timestamp.tz_localize + def test_tz_localize_ambiguous_bool(self): + # make sure that we are correctly accepting bool values as ambiguous + # GH#14402 + ts = Timestamp('2015-11-01 01:00:03') + expected0 = Timestamp('2015-11-01 01:00:03-0500', tz='US/Central') + expected1 = Timestamp('2015-11-01 01:00:03-0600', tz='US/Central') + + with pytest.raises(pytz.AmbiguousTimeError): + ts.tz_localize('US/Central') + + result = ts.tz_localize('US/Central', ambiguous=True) + assert result == expected0 + + result = ts.tz_localize('US/Central', ambiguous=False) + assert result == expected1 + def test_tz_localize_ambiguous(self): ts = Timestamp('2014-11-02 01:00') ts_dst = ts.tz_localize('US/Eastern', ambiguous=True) @@ -70,6 +93,55 @@ def test_tz_localize_roundtrip(self, stamp, tz): assert reset == ts assert reset.tzinfo is None + def test_tz_localize_ambiguous_compat(self): + # validate that pytz and dateutil are compat for dst + # when the transition happens + naive = Timestamp('2013-10-27 01:00:00') + + pytz_zone = 'Europe/London' + dateutil_zone = 'dateutil/Europe/London' + result_pytz = naive.tz_localize(pytz_zone, ambiguous=0) + result_dateutil = naive.tz_localize(dateutil_zone, ambiguous=0) + assert result_pytz.value == result_dateutil.value + assert result_pytz.value == 1382835600000000000 + + if LooseVersion(dateutil.__version__) < LooseVersion('2.6.0'): + # dateutil 2.6 buggy w.r.t. 
ambiguous=0 + # see gh-14621 + # see https://github.com/dateutil/dateutil/issues/321 + assert (result_pytz.to_pydatetime().tzname() == + result_dateutil.to_pydatetime().tzname()) + assert str(result_pytz) == str(result_dateutil) + elif LooseVersion(dateutil.__version__) > LooseVersion('2.6.0'): + # fixed ambiguous behavior + assert result_pytz.to_pydatetime().tzname() == 'GMT' + assert result_dateutil.to_pydatetime().tzname() == 'BST' + assert str(result_pytz) != str(result_dateutil) + + # 1 hour difference + result_pytz = naive.tz_localize(pytz_zone, ambiguous=1) + result_dateutil = naive.tz_localize(dateutil_zone, ambiguous=1) + assert result_pytz.value == result_dateutil.value + assert result_pytz.value == 1382832000000000000 + + # dateutil < 2.6 is buggy w.r.t. ambiguous timezones + if LooseVersion(dateutil.__version__) > LooseVersion('2.5.3'): + # see gh-14621 + assert str(result_pytz) == str(result_dateutil) + assert (result_pytz.to_pydatetime().tzname() == + result_dateutil.to_pydatetime().tzname()) + + @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), + gettz('US/Eastern'), + 'US/Eastern', 'dateutil/US/Eastern']) + def test_timestamp_tz_localize(self, tz): + stamp = Timestamp('3/11/2012 04:00') + + result = stamp.tz_localize(tz) + expected = Timestamp('3/11/2012 04:00', tz=tz) + assert result.hour == expected.hour + assert result == expected + # ------------------------------------------------------------------ # Timestamp.tz_convert @@ -85,3 +157,120 @@ def test_tz_convert_roundtrip(self, stamp, tz): assert reset == Timestamp(stamp) assert reset.tzinfo is None assert reset == converted.tz_convert('UTC').tz_localize(None) + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_astimezone(self, tzstr): + # astimezone is an alias for tz_convert, so keep it with + # the tz_convert tests + utcdate = Timestamp('3/11/2012 22:00', tz='UTC') + expected = utcdate.tz_convert(tzstr) + result = utcdate.astimezone(tzstr) + assert expected == result + assert isinstance(result, Timestamp) + + @td.skip_if_windows + def test_tz_convert_utc_with_system_utc(self): + from pandas._libs.tslibs.timezones import maybe_get_tz + + # from system utc to real utc + ts = Timestamp('2001-01-05 11:56', tz=maybe_get_tz('dateutil/UTC')) + # check that the time hasn't changed. + assert ts == ts.tz_convert(dateutil.tz.tzutc()) + + # from system utc to real utc + ts = Timestamp('2001-01-05 11:56', tz=maybe_get_tz('dateutil/UTC')) + # check that the time hasn't changed. 
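What the ``ambiguous`` flag does for a wall time that occurs twice during the DST transition, as covered by ``test_tz_localize_ambiguous_bool`` above:

.. code-block:: python

    import pandas as pd

    ts = pd.Timestamp('2015-11-01 01:00:03')

    ts.tz_localize('US/Central')                   # raises pytz.AmbiguousTimeError
    ts.tz_localize('US/Central', ambiguous=True)   # Timestamp('2015-11-01 01:00:03-0500', tz='US/Central')
    ts.tz_localize('US/Central', ambiguous=False)  # Timestamp('2015-11-01 01:00:03-0600', tz='US/Central')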
+ assert ts == ts.tz_convert(dateutil.tz.tzutc()) + + # ------------------------------------------------------------------ + # Timestamp.__init__ with tz str or tzinfo + + def test_timestamp_constructor_tz_utc(self): + utc_stamp = Timestamp('3/11/2012 05:00', tz='utc') + assert utc_stamp.tzinfo is pytz.utc + assert utc_stamp.hour == 5 + + utc_stamp = Timestamp('3/11/2012 05:00').tz_localize('utc') + assert utc_stamp.hour == 5 + + def test_timestamp_to_datetime_tzoffset(self): + tzinfo = tzoffset(None, 7200) + expected = Timestamp('3/11/2012 04:00', tz=tzinfo) + result = Timestamp(expected.to_pydatetime()) + assert expected == result + + def test_timestamp_constructor_near_dst_boundary(self): + # GH#11481 & GH#15777 + # Naive string timestamps were being localized incorrectly + # with tz_convert_single instead of tz_localize_to_utc + + for tz in ['Europe/Brussels', 'Europe/Prague']: + result = Timestamp('2015-10-25 01:00', tz=tz) + expected = Timestamp('2015-10-25 01:00').tz_localize(tz) + assert result == expected + + with pytest.raises(pytz.AmbiguousTimeError): + Timestamp('2015-10-25 02:00', tz=tz) + + result = Timestamp('2017-03-26 01:00', tz='Europe/Paris') + expected = Timestamp('2017-03-26 01:00').tz_localize('Europe/Paris') + assert result == expected + + with pytest.raises(pytz.NonExistentTimeError): + Timestamp('2017-03-26 02:00', tz='Europe/Paris') + + # GH#11708 + naive = Timestamp('2015-11-18 10:00:00') + result = naive.tz_localize('UTC').tz_convert('Asia/Kolkata') + expected = Timestamp('2015-11-18 15:30:00+0530', tz='Asia/Kolkata') + assert result == expected + + # GH#15823 + result = Timestamp('2017-03-26 00:00', tz='Europe/Paris') + expected = Timestamp('2017-03-26 00:00:00+0100', tz='Europe/Paris') + assert result == expected + + result = Timestamp('2017-03-26 01:00', tz='Europe/Paris') + expected = Timestamp('2017-03-26 01:00:00+0100', tz='Europe/Paris') + assert result == expected + + with pytest.raises(pytz.NonExistentTimeError): + Timestamp('2017-03-26 02:00', tz='Europe/Paris') + + result = Timestamp('2017-03-26 02:00:00+0100', tz='Europe/Paris') + naive = Timestamp(result.value) + expected = naive.tz_localize('UTC').tz_convert('Europe/Paris') + assert result == expected + + result = Timestamp('2017-03-26 03:00', tz='Europe/Paris') + expected = Timestamp('2017-03-26 03:00:00+0200', tz='Europe/Paris') + assert result == expected + + @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), + gettz('US/Eastern'), + 'US/Eastern', 'dateutil/US/Eastern']) + def test_timestamp_constructed_by_date_and_tz(self, tz): + # GH#2993, Timestamp cannot be constructed by datetime.date + # and tz correctly + + result = Timestamp(date(2012, 3, 11), tz=tz) + + expected = Timestamp('3/11/2012', tz=tz) + assert result.hour == expected.hour + assert result == expected + + @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), + gettz('US/Eastern'), + 'US/Eastern', 'dateutil/US/Eastern']) + def test_timestamp_add_timedelta_push_over_dst_boundary(self, tz): + # GH#1389 + + # 4 hours before DST transition + stamp = Timestamp('3/10/2012 22:00', tz=tz) + + result = stamp + timedelta(hours=6) + + # spring forward, + "7" hours + expected = Timestamp('3/11/2012 05:00', tz=tz) + + assert result == expected diff --git a/pandas/tests/tseries/test_timezones.py b/pandas/tests/tseries/test_timezones.py index e47be69b79feb..2630984a70807 100644 --- a/pandas/tests/tseries/test_timezones.py +++ b/pandas/tests/tseries/test_timezones.py @@ -9,7 +9,7 @@ from pytz import NonExistentTimeError from 
distutils.version import LooseVersion from dateutil.tz import tzlocal, tzoffset -from datetime import datetime, timedelta, tzinfo, date +from datetime import datetime, timedelta, tzinfo import pandas.util.testing as tm import pandas.util._test_decorators as td @@ -119,120 +119,6 @@ def test_localize_utc_conversion_explicit(self): pytest.raises(NonExistentTimeError, rng.tz_localize, self.tz('US/Eastern')) - def test_timestamp_tz_localize(self): - stamp = Timestamp('3/11/2012 04:00') - - result = stamp.tz_localize(self.tzstr('US/Eastern')) - expected = Timestamp('3/11/2012 04:00', tz=self.tzstr('US/Eastern')) - assert result.hour == expected.hour - assert result == expected - - def test_timestamp_tz_localize_explicit(self): - stamp = Timestamp('3/11/2012 04:00') - - result = stamp.tz_localize(self.tz('US/Eastern')) - expected = Timestamp('3/11/2012 04:00', tz=self.tz('US/Eastern')) - assert result.hour == expected.hour - assert result == expected - - def test_timestamp_constructed_by_date_and_tz(self): - # Fix Issue 2993, Timestamp cannot be constructed by datetime.date - # and tz correctly - - result = Timestamp(date(2012, 3, 11), tz=self.tzstr('US/Eastern')) - - expected = Timestamp('3/11/2012', tz=self.tzstr('US/Eastern')) - assert result.hour == expected.hour - assert result == expected - - def test_timestamp_constructed_by_date_and_tz_explicit(self): - # Fix Issue 2993, Timestamp cannot be constructed by datetime.date - # and tz correctly - - result = Timestamp(date(2012, 3, 11), tz=self.tz('US/Eastern')) - - expected = Timestamp('3/11/2012', tz=self.tz('US/Eastern')) - assert result.hour == expected.hour - assert result == expected - - def test_timestamp_constructor_near_dst_boundary(self): - # GH 11481 & 15777 - # Naive string timestamps were being localized incorrectly - # with tz_convert_single instead of tz_localize_to_utc - - for tz in ['Europe/Brussels', 'Europe/Prague']: - result = Timestamp('2015-10-25 01:00', tz=tz) - expected = Timestamp('2015-10-25 01:00').tz_localize(tz) - assert result == expected - - with pytest.raises(pytz.AmbiguousTimeError): - Timestamp('2015-10-25 02:00', tz=tz) - - result = Timestamp('2017-03-26 01:00', tz='Europe/Paris') - expected = Timestamp('2017-03-26 01:00').tz_localize('Europe/Paris') - assert result == expected - - with pytest.raises(pytz.NonExistentTimeError): - Timestamp('2017-03-26 02:00', tz='Europe/Paris') - - # GH 11708 - result = to_datetime("2015-11-18 15:30:00+05:30").tz_localize( - 'UTC').tz_convert('Asia/Kolkata') - expected = Timestamp('2015-11-18 15:30:00+0530', tz='Asia/Kolkata') - assert result == expected - - # GH 15823 - result = Timestamp('2017-03-26 00:00', tz='Europe/Paris') - expected = Timestamp('2017-03-26 00:00:00+0100', tz='Europe/Paris') - assert result == expected - - result = Timestamp('2017-03-26 01:00', tz='Europe/Paris') - expected = Timestamp('2017-03-26 01:00:00+0100', tz='Europe/Paris') - assert result == expected - - with pytest.raises(pytz.NonExistentTimeError): - Timestamp('2017-03-26 02:00', tz='Europe/Paris') - result = Timestamp('2017-03-26 02:00:00+0100', tz='Europe/Paris') - expected = Timestamp(result.value).tz_localize( - 'UTC').tz_convert('Europe/Paris') - assert result == expected - - result = Timestamp('2017-03-26 03:00', tz='Europe/Paris') - expected = Timestamp('2017-03-26 03:00:00+0200', tz='Europe/Paris') - assert result == expected - - def test_timestamp_to_datetime_tzoffset(self): - tzinfo = tzoffset(None, 7200) - expected = Timestamp('3/11/2012 04:00', tz=tzinfo) - result = 
Timestamp(expected.to_pydatetime()) - assert expected == result - - def test_timedelta_push_over_dst_boundary(self): - # #1389 - - # 4 hours before DST transition - stamp = Timestamp('3/10/2012 22:00', tz=self.tzstr('US/Eastern')) - - result = stamp + timedelta(hours=6) - - # spring forward, + "7" hours - expected = Timestamp('3/11/2012 05:00', tz=self.tzstr('US/Eastern')) - - assert result == expected - - def test_timedelta_push_over_dst_boundary_explicit(self): - # #1389 - - # 4 hours before DST transition - stamp = Timestamp('3/10/2012 22:00', tz=self.tz('US/Eastern')) - - result = stamp + timedelta(hours=6) - - # spring forward, + "7" hours - expected = Timestamp('3/11/2012 05:00', tz=self.tz('US/Eastern')) - - assert result == expected - def test_tz_localize_dti(self): dti = DatetimeIndex(start='1/1/2005', end='1/1/2005 0:00:30.256', freq='L') @@ -267,13 +153,6 @@ def test_tz_localize_empty_series(self): ts2 = ts.tz_localize(self.tzstr('US/Eastern')) assert self.cmptz(ts2.index.tz, self.tz('US/Eastern')) - def test_astimezone(self): - utc = Timestamp('3/11/2012 22:00', tz='UTC') - expected = utc.tz_convert(self.tzstr('US/Eastern')) - result = utc.astimezone(self.tzstr('US/Eastern')) - assert expected == result - assert isinstance(result, Timestamp) - def test_create_with_tz(self): stamp = Timestamp('3/11/2012 05:00', tz=self.tzstr('US/Eastern')) assert stamp.hour == 5 @@ -283,13 +162,6 @@ def test_create_with_tz(self): assert stamp == rng[1] - utc_stamp = Timestamp('3/11/2012 05:00', tz='utc') - assert utc_stamp.tzinfo is pytz.utc - assert utc_stamp.hour == 5 - - utc_stamp = Timestamp('3/11/2012 05:00').tz_localize('utc') - assert utc_stamp.hour == 5 - def test_create_with_fixed_tz(self): off = FixedOffset(420, '+07:00') start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off) @@ -591,16 +463,6 @@ def test_ambiguous_bool(self): expected0 = Timestamp('2015-11-01 01:00:03-0500', tz='US/Central') expected1 = Timestamp('2015-11-01 01:00:03-0600', tz='US/Central') - def f(): - t.tz_localize('US/Central') - pytest.raises(pytz.AmbiguousTimeError, f) - - result = t.tz_localize('US/Central', ambiguous=True) - assert result == expected0 - - result = t.tz_localize('US/Central', ambiguous=False) - assert result == expected1 - s = Series([t]) expected0 = Series([expected0]) expected1 = Series([expected1]) @@ -948,20 +810,6 @@ def normalize(self, ts): # no-op for dateutil return ts - @td.skip_if_windows - def test_utc_with_system_utc(self): - from pandas._libs.tslibs.timezones import maybe_get_tz - - # from system utc to real utc - ts = Timestamp('2001-01-05 11:56', tz=maybe_get_tz('dateutil/UTC')) - # check that the time hasn't changed. - assert ts == ts.tz_convert(dateutil.tz.tzutc()) - - # from system utc to real utc - ts = Timestamp('2001-01-05 11:56', tz=maybe_get_tz('dateutil/UTC')) - # check that the time hasn't changed. 
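The localize/convert roundtrip these tests rely on, sketched with an arbitrary timestamp (the date is illustrative):

.. code-block:: python

    import pandas as pd

    naive = pd.Timestamp('2016-08-12 04:00')
    localized = naive.tz_localize('US/Eastern')

    localized.tz_localize(None) == naive           # True: dropping the tz restores the wall time
    localized.tz_convert('UTC').tz_localize(None)  # Timestamp('2016-08-12 08:00:00'), the UTC wall time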
- assert ts == ts.tz_convert(dateutil.tz.tzutc()) - def test_tz_convert_hour_overflow_dst(self): # Regression test for: # https://github.com/pandas-dev/pandas/issues/13306 @@ -1175,47 +1023,6 @@ def test_cache_keys_are_distinct_for_pytz_vs_dateutil(self, tz_name): class TestTimeZones(object): timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific'] - def test_ambiguous_compat(self): - # validate that pytz and dateutil are compat for dst - # when the transition happens - - pytz_zone = 'Europe/London' - dateutil_zone = 'dateutil/Europe/London' - result_pytz = (Timestamp('2013-10-27 01:00:00') - .tz_localize(pytz_zone, ambiguous=0)) - result_dateutil = (Timestamp('2013-10-27 01:00:00') - .tz_localize(dateutil_zone, ambiguous=0)) - assert result_pytz.value == result_dateutil.value - assert result_pytz.value == 1382835600000000000 - - if LooseVersion(dateutil.__version__) < LooseVersion('2.6.0'): - # dateutil 2.6 buggy w.r.t. ambiguous=0 - # see gh-14621 - # see https://github.com/dateutil/dateutil/issues/321 - assert (result_pytz.to_pydatetime().tzname() == - result_dateutil.to_pydatetime().tzname()) - assert str(result_pytz) == str(result_dateutil) - elif LooseVersion(dateutil.__version__) > LooseVersion('2.6.0'): - # fixed ambiguous behavior - assert result_pytz.to_pydatetime().tzname() == 'GMT' - assert result_dateutil.to_pydatetime().tzname() == 'BST' - assert str(result_pytz) != str(result_dateutil) - - # 1 hour difference - result_pytz = (Timestamp('2013-10-27 01:00:00') - .tz_localize(pytz_zone, ambiguous=1)) - result_dateutil = (Timestamp('2013-10-27 01:00:00') - .tz_localize(dateutil_zone, ambiguous=1)) - assert result_pytz.value == result_dateutil.value - assert result_pytz.value == 1382832000000000000 - - # dateutil < 2.6 is buggy w.r.t. ambiguous timezones - if LooseVersion(dateutil.__version__) > LooseVersion('2.5.3'): - # see gh-14621 - assert str(result_pytz) == str(result_dateutil) - assert (result_pytz.to_pydatetime().tzname() == - result_dateutil.to_pydatetime().tzname()) - def test_index_equals_with_tz(self): left = date_range('1/1/2011', periods=100, freq='H', tz='utc') right = date_range('1/1/2011', periods=100, freq='H', tz='US/Eastern') From 56dbaaef4fdd61974c447c124df2331acbbc7d27 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 4 Feb 2018 08:18:10 -0800 Subject: [PATCH 040/214] ops cleanup, named functions instead of lambdas (#19515) --- pandas/core/ops.py | 92 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 64 insertions(+), 28 deletions(-) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 6ea4a81cb52a1..6db84aedce7e7 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -42,6 +42,60 @@ ABCSparseSeries, ABCSparseArray) +# ----------------------------------------------------------------------------- +# Reversed Operations not available in the stdlib operator module. +# Defining these instead of using lambdas allows us to reference them by name. 
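A tiny illustration of why named functions are preferred over lambdas here; the comparison below is not part of the patch.

.. code-block:: python

    def rsub(left, right):
        return right - left

    rsub.__name__                  # 'rsub' -- usable in generated method names and error messages
    (lambda x, y: y - x).__name__  # '<lambda>'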
+ +def radd(left, right): + return right + left + + +def rsub(left, right): + return right - left + + +def rmul(left, right): + return right * left + + +def rdiv(left, right): + return right / left + + +def rtruediv(left, right): + return right / left + + +def rfloordiv(left, right): + return right // left + + +def rmod(left, right): + return right % left + + +def rdivmod(left, right): + return divmod(right, left) + + +def rpow(left, right): + return right ** left + + +def rand_(left, right): + return operator.and_(right, left) + + +def ror_(left, right): + return operator.or_(right, left) + + +def rxor(left, right): + return operator.xor(right, left) + + +# ----------------------------------------------------------------------------- + def _gen_eval_kwargs(name): """ Find the keyword arguments to pass to numexpr for the given operation. @@ -140,64 +194,51 @@ def _get_frame_op_default_axis(name): _op_descriptions = { 'add': {'op': '+', 'desc': 'Addition', - 'reversed': False, 'reverse': 'radd'}, 'sub': {'op': '-', 'desc': 'Subtraction', - 'reversed': False, 'reverse': 'rsub'}, 'mul': {'op': '*', 'desc': 'Multiplication', - 'reversed': False, 'reverse': 'rmul'}, 'mod': {'op': '%', 'desc': 'Modulo', - 'reversed': False, 'reverse': 'rmod'}, 'pow': {'op': '**', 'desc': 'Exponential power', - 'reversed': False, 'reverse': 'rpow'}, 'truediv': {'op': '/', 'desc': 'Floating division', - 'reversed': False, 'reverse': 'rtruediv'}, 'floordiv': {'op': '//', 'desc': 'Integer division', - 'reversed': False, 'reverse': 'rfloordiv'}, 'divmod': {'op': 'divmod', 'desc': 'Integer division and modulo', - 'reversed': False, 'reverse': None}, 'eq': {'op': '==', 'desc': 'Equal to', - 'reversed': False, 'reverse': None}, 'ne': {'op': '!=', 'desc': 'Not equal to', - 'reversed': False, 'reverse': None}, 'lt': {'op': '<', 'desc': 'Less than', - 'reversed': False, 'reverse': None}, 'le': {'op': '<=', 'desc': 'Less than or equal to', - 'reversed': False, 'reverse': None}, 'gt': {'op': '>', 'desc': 'Greater than', - 'reversed': False, 'reverse': None}, 'ge': {'op': '>=', 'desc': 'Greater than or equal to', - 'reversed': False, 'reverse': None}} _op_names = list(_op_descriptions.keys()) for key in _op_names: + _op_descriptions[key]['reversed'] = False reverse_op = _op_descriptions[key]['reverse'] if reverse_op is not None: _op_descriptions[reverse_op] = _op_descriptions[key].copy() @@ -392,7 +433,7 @@ def names(x): # yapf: disable new_methods = dict( add=arith_method(operator.add, names('add'), op('+')), - radd=arith_method(lambda x, y: y + x, names('radd'), op('+')), + radd=arith_method(radd, names('radd'), op('+')), sub=arith_method(operator.sub, names('sub'), op('-')), mul=arith_method(operator.mul, names('mul'), op('*')), truediv=arith_method(operator.truediv, names('truediv'), op('/')), @@ -404,13 +445,11 @@ def names(x): # not entirely sure why this is necessary, but previously was included # so it's here to maintain compatibility rmul=arith_method(operator.mul, names('rmul'), op('*')), - rsub=arith_method(lambda x, y: y - x, names('rsub'), op('-')), - rtruediv=arith_method(lambda x, y: operator.truediv(y, x), - names('rtruediv'), op('/')), - rfloordiv=arith_method(lambda x, y: operator.floordiv(y, x), - names('rfloordiv'), op('//')), - rpow=arith_method(lambda x, y: y**x, names('rpow'), op('**')), - rmod=arith_method(lambda x, y: y % x, names('rmod'), op('%'))) + rsub=arith_method(rsub, names('rsub'), op('-')), + rtruediv=arith_method(rtruediv, names('rtruediv'), op('/')), + rfloordiv=arith_method(rfloordiv, 
names('rfloordiv'), op('//')), + rpow=arith_method(rpow, names('rpow'), op('**')), + rmod=arith_method(rmod, names('rmod'), op('%'))) # yapf: enable new_methods['div'] = new_methods['truediv'] new_methods['rdiv'] = new_methods['rtruediv'] @@ -430,12 +469,9 @@ def names(x): or_=bool_method(operator.or_, names('or_'), op('|')), # For some reason ``^`` wasn't used in original. xor=bool_method(operator.xor, names('xor'), op('^')), - rand_=bool_method(lambda x, y: operator.and_(y, x), - names('rand_'), op('&')), - ror_=bool_method(lambda x, y: operator.or_(y, x), - names('ror_'), op('|')), - rxor=bool_method(lambda x, y: operator.xor(y, x), - names('rxor'), op('^')))) + rand_=bool_method(rand_, names('rand_'), op('&')), + ror_=bool_method(ror_, names('ror_'), op('|')), + rxor=bool_method(rxor, names('rxor'), op('^')))) if have_divmod: # divmod doesn't have an op that is supported by numexpr new_methods['divmod'] = arith_method(divmod, names('divmod'), None) From bc1d0273cd4b7ddf348feff5f46b6eb114ea04c9 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Sun, 4 Feb 2018 16:32:52 +0000 Subject: [PATCH 041/214] DOC: Improve replace docstring (#18100) --- pandas/core/frame.py | 8 ++ pandas/core/generic.py | 212 +++++++++++++++++++++++++++++++++-------- pandas/core/series.py | 8 ++ 3 files changed, 187 insertions(+), 41 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 96d28581cfdd9..201d8ba427c8a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3080,6 +3080,14 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, inplace=inplace, limit=limit, downcast=downcast, **kwargs) + @Appender(_shared_docs['replace'] % _shared_doc_kwargs) + def replace(self, to_replace=None, value=None, inplace=False, limit=None, + regex=False, method='pad', axis=None): + return super(DataFrame, self).replace(to_replace=to_replace, + value=value, inplace=inplace, + limit=limit, regex=regex, + method=method, axis=axis) + @Appender(_shared_docs['shift'] % _shared_doc_kwargs) def shift(self, periods=1, freq=None, axis=0): return super(DataFrame, self).shift(periods=periods, freq=freq, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d34a85b5b4388..0f038cd687dfd 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -69,6 +69,10 @@ def _single_replace(self, to_replace, method, inplace, limit): + """ + Replaces values in a Series using the fill method specified when no + replacement value is given in the replace method + """ if self.ndim != 1: raise TypeError('cannot replace {0} with method {1} on a {2}' .format(to_replace, method, type(self).__name__)) @@ -4787,94 +4791,111 @@ def bfill(self, axis=None, inplace=False, limit=None, downcast=None): return self.fillna(method='bfill', axis=axis, inplace=inplace, limit=limit, downcast=downcast) - def replace(self, to_replace=None, value=None, inplace=False, limit=None, - regex=False, method='pad', axis=None): - """ + _shared_docs['replace'] = (""" Replace values given in 'to_replace' with 'value'. 
Parameters ---------- to_replace : str, regex, list, dict, Series, numeric, or None - * str or regex: + * numeric, str or regex: - - str: string exactly matching `to_replace` will be replaced - with `value` - - regex: regexs matching `to_replace` will be replaced with - `value` + - numeric: numeric values equal to ``to_replace`` will be + replaced with ``value`` + - str: string exactly matching ``to_replace`` will be replaced + with ``value`` + - regex: regexs matching ``to_replace`` will be replaced with + ``value`` * list of str, regex, or numeric: - - First, if `to_replace` and `value` are both lists, they + - First, if ``to_replace`` and ``value`` are both lists, they **must** be the same length. - Second, if ``regex=True`` then all of the strings in **both** lists will be interpreted as regexs otherwise they will match - directly. This doesn't matter much for `value` since there + directly. This doesn't matter much for ``value`` since there are only a few possible substitution regexes you can use. - - str and regex rules apply as above. + - str, regex and numeric rules apply as above. * dict: - - Nested dictionaries, e.g., {'a': {'b': nan}}, are read as - follows: look in column 'a' for the value 'b' and replace it - with nan. You can nest regular expressions as well. Note that + - Dicts can be used to specify different replacement values + for different existing values. For example, + {'a': 'b', 'y': 'z'} replaces the value 'a' with 'b' and + 'y' with 'z'. To use a dict in this way the ``value`` + parameter should be ``None``. + - For a DataFrame a dict can specify that different values + should be replaced in different columns. For example, + {'a': 1, 'b': 'z'} looks for the value 1 in column 'a' and + the value 'z' in column 'b' and replaces these values with + whatever is specified in ``value``. The ``value`` parameter + should not be ``None`` in this case. You can treat this as a + special case of passing two lists except that you are + specifying the column to search in. + - For a DataFrame nested dictionaries, e.g., + {'a': {'b': np.nan}}, are read as follows: look in column 'a' + for the value 'b' and replace it with NaN. The ``value`` + parameter should be ``None`` to use a nested dict in this + way. You can nest regular expressions as well. Note that column names (the top-level dictionary keys in a nested dictionary) **cannot** be regular expressions. - - Keys map to column names and values map to substitution - values. You can treat this as a special case of passing two - lists except that you are specifying the column to search in. * None: - This means that the ``regex`` argument must be a string, compiled regular expression, or list, dict, ndarray or Series - of such elements. If `value` is also ``None`` then this + of such elements. If ``value`` is also ``None`` then this **must** be a nested dictionary or ``Series``. See the examples section for examples of each of these. value : scalar, dict, list, str, regex, default None - Value to use to fill holes (e.g. 0), alternately a dict of values - specifying which value to use for each column (columns not in the - dict will not be filled). Regular expressions, strings and lists or - dicts of such objects are also allowed. + Value to replace any values matching ``to_replace`` with. + For a DataFrame a dict of values can be used to specify which + value to use for each column (columns not in the dict will not be + filled). Regular expressions, strings and lists or dicts of such + objects are also allowed. 
inplace : boolean, default False If True, in place. Note: this will modify any other views on this object (e.g. a column from a DataFrame). Returns the caller if this is True. limit : int, default None Maximum size gap to forward or backward fill - regex : bool or same types as `to_replace`, default False - Whether to interpret `to_replace` and/or `value` as regular - expressions. If this is ``True`` then `to_replace` *must* be a - string. Otherwise, `to_replace` must be ``None`` because this - parameter will be interpreted as a regular expression or a list, - dict, or array of regular expressions. + regex : bool or same types as ``to_replace``, default False + Whether to interpret ``to_replace`` and/or ``value`` as regular + expressions. If this is ``True`` then ``to_replace`` *must* be a + string. Alternatively, this could be a regular expression or a + list, dict, or array of regular expressions in which case + ``to_replace`` must be ``None``. method : string, optional, {'pad', 'ffill', 'bfill'} The method to use when for replacement, when ``to_replace`` is a ``list``. See Also -------- - NDFrame.reindex - NDFrame.asfreq - NDFrame.fillna + %(klass)s.fillna : Fill NA/NaN values + %(klass)s.where : Replace values based on boolean condition Returns ------- - filled : NDFrame + filled : %(klass)s Raises ------ AssertionError - * If `regex` is not a ``bool`` and `to_replace` is not ``None``. + * If ``regex`` is not a ``bool`` and ``to_replace`` is not + ``None``. TypeError - * If `to_replace` is a ``dict`` and `value` is not a ``list``, + * If ``to_replace`` is a ``dict`` and ``value`` is not a ``list``, ``dict``, ``ndarray``, or ``Series`` - * If `to_replace` is ``None`` and `regex` is not compilable into a - regular expression or is a list, dict, ndarray, or Series. + * If ``to_replace`` is ``None`` and ``regex`` is not compilable + into a regular expression or is a list, dict, ndarray, or + Series. + * When replacing multiple ``bool`` or ``datetime64`` objects and + the arguments to ``to_replace`` does not match the type of the + value being replaced ValueError - * If `to_replace` and `value` are ``list`` s or ``ndarray`` s, but - they are not the same length. + * If a ``list`` or an ``ndarray`` is passed to ``to_replace`` and + `value` but they are not the same length. Notes ----- @@ -4883,12 +4904,121 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, * Regular expressions will only substitute on strings, meaning you cannot provide, for example, a regular expression matching floating point numbers and expect the columns in your frame that have a - numeric dtype to be matched. However, if those floating point numbers - *are* strings, then you can do this. + numeric dtype to be matched. However, if those floating point + numbers *are* strings, then you can do this. * This method has *a lot* of options. You are encouraged to experiment and play with this method to gain intuition about how it works. - """ + Examples + -------- + + >>> s = pd.Series([0, 1, 2, 3, 4]) + >>> s.replace(0, 5) + 0 5 + 1 1 + 2 2 + 3 3 + 4 4 + dtype: int64 + >>> df = pd.DataFrame({'A': [0, 1, 2, 3, 4], + ... 'B': [5, 6, 7, 8, 9], + ... 
'C': ['a', 'b', 'c', 'd', 'e']}) + >>> df.replace(0, 5) + A B C + 0 5 5 a + 1 1 6 b + 2 2 7 c + 3 3 8 d + 4 4 9 e + + >>> df.replace([0, 1, 2, 3], 4) + A B C + 0 4 5 a + 1 4 6 b + 2 4 7 c + 3 4 8 d + 4 4 9 e + >>> df.replace([0, 1, 2, 3], [4, 3, 2, 1]) + A B C + 0 4 5 a + 1 3 6 b + 2 2 7 c + 3 1 8 d + 4 4 9 e + >>> s.replace([1, 2], method='bfill') + 0 0 + 1 3 + 2 3 + 3 3 + 4 4 + dtype: int64 + + >>> df.replace({0: 10, 1: 100}) + A B C + 0 10 5 a + 1 100 6 b + 2 2 7 c + 3 3 8 d + 4 4 9 e + >>> df.replace({'A': 0, 'B': 5}, 100) + A B C + 0 100 100 a + 1 1 6 b + 2 2 7 c + 3 3 8 d + 4 4 9 e + >>> df.replace({'A': {0: 100, 4: 400}}) + A B C + 0 100 5 a + 1 1 6 b + 2 2 7 c + 3 3 8 d + 4 400 9 e + + >>> df = pd.DataFrame({'A': ['bat', 'foo', 'bait'], + ... 'B': ['abc', 'bar', 'xyz']}) + >>> df.replace(to_replace=r'^ba.$', value='new', regex=True) + A B + 0 new abc + 1 foo new + 2 bait xyz + >>> df.replace({'A': r'^ba.$'}, {'A': 'new'}, regex=True) + A B + 0 new abc + 1 foo bar + 2 bait xyz + >>> df.replace(regex=r'^ba.$', value='new') + A B + 0 new abc + 1 foo new + 2 bait xyz + >>> df.replace(regex={r'^ba.$':'new', 'foo':'xyz'}) + A B + 0 new abc + 1 xyz new + 2 bait xyz + >>> df.replace(regex=[r'^ba.$', 'foo'], value='new') + A B + 0 new abc + 1 new new + 2 bait xyz + + Note that when replacing multiple ``bool`` or ``datetime64`` objects, + the data types in the ``to_replace`` parameter must match the data + type of the value being replaced: + + >>> df = pd.DataFrame({'A': [True, False, True], + ... 'B': [False, True, False]}) + >>> df.replace({'a string': 'new value', True: False}) # raises + TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str' + + This raises a ``TypeError`` because one of the ``dict`` keys is not of + the correct type for replacement. + """) + + @Appender(_shared_docs['replace'] % _shared_doc_kwargs) + def replace(self, to_replace=None, value=None, inplace=False, limit=None, + regex=False, method='pad', axis=None): inplace = validate_bool_kwarg(inplace, 'inplace') if not is_bool(regex) and to_replace is not None: raise AssertionError("'to_replace' must be 'None' if 'regex' is " diff --git a/pandas/core/series.py b/pandas/core/series.py index 78b4c3a70a519..e4b8979d6393a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2671,6 +2671,14 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, limit=limit, downcast=downcast, **kwargs) + @Appender(generic._shared_docs['replace'] % _shared_doc_kwargs) + def replace(self, to_replace=None, value=None, inplace=False, limit=None, + regex=False, method='pad', axis=None): + return super(Series, self).replace(to_replace=to_replace, value=value, + inplace=inplace, limit=limit, + regex=regex, method=method, + axis=axis) + @Appender(generic._shared_docs['shift'] % _shared_doc_kwargs) def shift(self, periods=1, freq=None, axis=0): return super(Series, self).shift(periods=periods, freq=freq, axis=axis) From de39a1572fcf82071f3c0b5f22be1611222bdf41 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Sun, 4 Feb 2018 21:44:39 +0000 Subject: [PATCH 042/214] DOC: minor groupby and resampler improvements (#19514) --- doc/source/groupby.rst | 7 ++++--- pandas/core/generic.py | 27 ++++++++++++++++++++++++--- pandas/core/groupby.py | 2 +- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 413138b1e52fc..407fad39ba232 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -1219,8 +1219,8 @@ see :ref:`here `. 
Combining ``.groupby`` and ``.pipe`` is often useful when you need to reuse GroupBy objects. -For an example, imagine having a DataFrame with columns for stores, products, -revenue and sold quantity. We'd like to do a groupwise calculation of *prices* +As an example, imagine having a DataFrame with columns for stores, products, +revenue and quantity sold. We'd like to do a groupwise calculation of *prices* (i.e. revenue/quantity) per store and per product. We could do this in a multi-step operation, but expressing it in terms of piping can make the code more readable. First we set the data: @@ -1230,7 +1230,8 @@ code more readable. First we set the data: import numpy as np n = 1000 df = pd.DataFrame({'Store': np.random.choice(['Store_1', 'Store_2'], n), - 'Product': np.random.choice(['Product_1', 'Product_2', 'Product_3'], n), + 'Product': np.random.choice(['Product_1', + 'Product_2'], n), 'Revenue': (np.random.random(n)*50+10).round(2), 'Quantity': np.random.randint(1, 10, size=n)}) df.head(2) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0f038cd687dfd..cb4bbb7b27c42 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5691,6 +5691,10 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, reduce the dimensionality of the return type if possible, otherwise return a consistent type + Returns + ------- + GroupBy object + Examples -------- DataFrame results @@ -5702,10 +5706,15 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, >>> data.groupby(['col1', 'col2']).mean() - Returns - ------- - GroupBy object + Notes + ----- + See the `user guide + `_ for more. + See also + -------- + resample : Convenience method for frequency conversion and resampling + of time series. """ from pandas.core.groupby import groupby @@ -5904,8 +5913,16 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, .. versionadded:: 0.19.0 + Returns + ------- + Resampler object + Notes ----- + See the `user guide + `_ + for more. + To learn more about the offset strings, please see `this link `__. @@ -6071,6 +6088,10 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, a b c d 2000-01-01 00:00:00 0 6 12 18 2000-01-01 00:03:00 0 4 8 12 + + See also + -------- + groupby : Group by mapping, function, label, or list of labels. """ from pandas.core.resample import (resample, _maybe_process_deprecations) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 2c1deb9db7bba..88af80e295d74 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -230,7 +230,7 @@ Notes ----- See more `here -`_ +`_ Examples -------- From ce435dfefaec4582fbd435ceb6127f14ca8d6975 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Mon, 5 Feb 2018 08:39:43 +0000 Subject: [PATCH 043/214] DEPR: Changing default of str.extract(expand=False) to str.extract(expand=True) (#19118) --- doc/source/text.rst | 3 ++- doc/source/whatsnew/v0.23.0.txt | 47 +++++++++++++++++++++++++++++++++ pandas/core/strings.py | 15 +++-------- pandas/tests/test_strings.py | 9 ++++--- 4 files changed, 58 insertions(+), 16 deletions(-) diff --git a/doc/source/text.rst b/doc/source/text.rst index 2b6459b581c1e..1e620acb1f88a 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -218,7 +218,8 @@ Extract first match in each subject (extract) ``DataFrame``, depending on the subject and regular expression pattern (same behavior as pre-0.18.0). 
When ``expand=True`` it always returns a ``DataFrame``, which is more consistent and less - confusing from the perspective of a user. + confusing from the perspective of a user. ``expand=True`` is the + default since version 0.23.0. The ``extract`` method accepts a `regular expression `__ with at least one diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 69965f44d87a8..0ac27a2f23386 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -296,6 +296,53 @@ Build Changes - Building from source now explicitly requires ``setuptools`` in ``setup.py`` (:issue:`18113`) - Updated conda recipe to be in compliance with conda-build 3.0+ (:issue:`18002`) +Extraction of matching patterns from strings +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +By default, extracting matching patterns from strings with :func:`str.extract` used to return a +``Series`` if a single group was being extracted (a ``DataFrame`` if more than one group was +extracted``). As of Pandas 0.23.0 :func:`str.extract` always returns a ``DataFrame``, unless +``expand`` is set to ``False`` (:issue:`11386`). + +Also, ``None`` was an accepted value for the ``expand`` parameter (which was equivalent to +``False``), but now raises a ``ValueError``. + +Previous Behavior: + +.. code-block:: ipython + + In [1]: s = pd.Series(['number 10', '12 eggs']) + + In [2]: extracted = s.str.extract('.*(\d\d).*') + + In [3]: extracted + Out [3]: + 0 10 + 1 12 + dtype: object + + In [4]: type(extracted) + Out [4]: + pandas.core.series.Series + +New Behavior: + +.. ipython:: python + + s = pd.Series(['number 10', '12 eggs']) + extracted = s.str.extract('.*(\d\d).*') + extracted + type(extracted) + +To restore previous behavior, simply set ``expand`` to ``False``: + +.. ipython:: python + + s = pd.Series(['number 10', '12 eggs']) + extracted = s.str.extract('.*(\d\d).*', expand=False) + extracted + type(extracted) + .. _whatsnew_0230.api: Other API Changes diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 12c7feb5f2b15..b1c1ede66236c 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -598,7 +598,7 @@ def _str_extract_frame(arr, pat, flags=0): dtype=object) -def str_extract(arr, pat, flags=0, expand=None): +def str_extract(arr, pat, flags=0, expand=True): r""" For each subject string in the Series, extract groups from the first match of regular expression pat. @@ -610,7 +610,7 @@ def str_extract(arr, pat, flags=0, expand=None): flags : int, default 0 (no flags) re module flags, e.g. re.IGNORECASE - expand : bool, default False + expand : bool, default True * If True, return DataFrame. * If False, return Series/Index/DataFrame. 
@@ -676,15 +676,6 @@ def str_extract(arr, pat, flags=0, expand=None): dtype: object """ - if expand is None: - warnings.warn( - "currently extract(expand=None) " + - "means expand=False (return Index/Series/DataFrame) " + - "but in a future version of pandas this will be changed " + - "to expand=True (return DataFrame)", - FutureWarning, - stacklevel=3) - expand = False if not isinstance(expand, bool): raise ValueError("expand must be True or False") if expand: @@ -1739,7 +1730,7 @@ def translate(self, table, deletechars=None): findall = _pat_wrapper(str_findall, flags=True) @copy(str_extract) - def extract(self, pat, flags=0, expand=None): + def extract(self, pat, flags=0, expand=True): return str_extract(self, pat, flags=flags, expand=expand) @copy(str_extractall) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 973fe74429551..178c5ff655b04 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -612,13 +612,16 @@ def test_match(self): def test_extract_expand_None(self): values = Series(['fooBAD__barBAD', NA, 'foo']) - with tm.assert_produces_warning(FutureWarning): + with tm.assert_raises_regex(ValueError, + 'expand must be True or False'): values.str.extract('.*(BAD[_]+).*(BAD)', expand=None) def test_extract_expand_unspecified(self): values = Series(['fooBAD__barBAD', NA, 'foo']) - with tm.assert_produces_warning(FutureWarning): - values.str.extract('.*(BAD[_]+).*(BAD)') + result_unspecified = values.str.extract('.*(BAD[_]+).*') + assert isinstance(result_unspecified, DataFrame) + result_true = values.str.extract('.*(BAD[_]+).*', expand=True) + tm.assert_frame_equal(result_unspecified, result_true) def test_extract_expand_False(self): # Contains tests like those in test_match and some others. 
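A compact sketch of the extraction behaviour documented and tested in the patch above, with made-up data:

    import pandas as pd

    s = pd.Series(['a1', 'b2', 'c3'])
    s.str.extract(r'(\d)')                 # one group, default expand=True: returns a DataFrame
    s.str.extract(r'(\d)', expand=False)   # opts back in to the previous Series return
    # s.str.extract(r'(\d)', expand=None)  # now raises ValueError: expand must be True or False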
From 074d88159667c33319f1c5ab848870b4bd1e7e6e Mon Sep 17 00:00:00 2001 From: jschendel Date: Mon, 5 Feb 2018 04:05:20 -0700 Subject: [PATCH 044/214] TST: Remove legacy instances of _multiprocess_can_split_ (#19536) --- pandas/tests/frame/test_apply.py | 2 -- pandas/tests/indexes/period/test_period.py | 1 - pandas/tests/indexes/timedeltas/test_astype.py | 1 - pandas/tests/indexes/timedeltas/test_construction.py | 1 - pandas/tests/indexes/timedeltas/test_indexing.py | 1 - pandas/tests/indexes/timedeltas/test_ops.py | 1 - pandas/tests/indexes/timedeltas/test_setops.py | 1 - pandas/tests/indexes/timedeltas/test_timedelta.py | 2 -- pandas/tests/indexes/timedeltas/test_timedelta_range.py | 1 - pandas/tests/indexes/timedeltas/test_tools.py | 1 - pandas/tests/scalar/test_timedelta.py | 2 -- pandas/tests/series/test_apply.py | 2 -- 12 files changed, 16 deletions(-) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index e0fc6c470fe57..d69ddcd8f14d4 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -496,8 +496,6 @@ def zip_frames(*frames): class TestDataFrameAggregate(TestData): - _multiprocess_can_split_ = True - def test_agg_transform(self): with np.errstate(all='ignore'): diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index ab341b70dfe91..6fc7fa5486f82 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -14,7 +14,6 @@ class TestPeriodIndex(DatetimeLike): _holder = PeriodIndex - _multiprocess_can_split_ = True def setup_method(self, method): self.indices = dict(index=tm.makePeriodIndex(10), diff --git a/pandas/tests/indexes/timedeltas/test_astype.py b/pandas/tests/indexes/timedeltas/test_astype.py index c3bd857036efc..6c644d239069a 100644 --- a/pandas/tests/indexes/timedeltas/test_astype.py +++ b/pandas/tests/indexes/timedeltas/test_astype.py @@ -8,7 +8,6 @@ class TestTimedeltaIndex(object): - _multiprocess_can_split_ = True def test_astype(self): # GH 13149, GH 13209 diff --git a/pandas/tests/indexes/timedeltas/test_construction.py b/pandas/tests/indexes/timedeltas/test_construction.py index 70aadd9f57174..68dc0003e2312 100644 --- a/pandas/tests/indexes/timedeltas/test_construction.py +++ b/pandas/tests/indexes/timedeltas/test_construction.py @@ -9,7 +9,6 @@ class TestTimedeltaIndex(object): - _multiprocess_can_split_ = True def test_construction_base_constructor(self): arr = [pd.Timedelta('1 days'), pd.NaT, pd.Timedelta('3 days')] diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index e64c4e6ac54a5..59e38c2e738b0 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -9,7 +9,6 @@ class TestTimedeltaIndex(object): - _multiprocess_can_split_ = True def test_insert(self): diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index e944aad13f8d5..86d7dd4e1b117 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -420,7 +420,6 @@ def test_equals(self): class TestTimedeltas(object): - _multiprocess_can_split_ = True def test_timedelta_ops(self): # GH4984 diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index 22546d25273a7..020e9079b3436 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ 
b/pandas/tests/indexes/timedeltas/test_setops.py @@ -6,7 +6,6 @@ class TestTimedeltaIndex(object): - _multiprocess_can_split_ = True def test_union(self): diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 32157a9a44e04..ce0f3b89b753e 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -18,7 +18,6 @@ class TestTimedeltaIndex(DatetimeLike): _holder = TimedeltaIndex - _multiprocess_can_split_ = True def setup_method(self, method): self.indices = dict(index=tm.makeTimedeltaIndex(10)) @@ -300,7 +299,6 @@ def test_freq_conversion(self): class TestTimeSeries(object): - _multiprocess_can_split_ = True def test_series_box_timedelta(self): rng = timedelta_range('1 day 1 s', periods=5, freq='h') diff --git a/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/pandas/tests/indexes/timedeltas/test_timedelta_range.py index 7624e1f79af15..784ef845fea10 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta_range.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta_range.py @@ -7,7 +7,6 @@ class TestTimedeltas(object): - _multiprocess_can_split_ = True def test_timedelta_range(self): diff --git a/pandas/tests/indexes/timedeltas/test_tools.py b/pandas/tests/indexes/timedeltas/test_tools.py index b4ad28eeacb69..daa9739132d9e 100644 --- a/pandas/tests/indexes/timedeltas/test_tools.py +++ b/pandas/tests/indexes/timedeltas/test_tools.py @@ -11,7 +11,6 @@ class TestTimedeltas(object): - _multiprocess_can_split_ = True def test_to_timedelta(self): def conv(v): diff --git a/pandas/tests/scalar/test_timedelta.py b/pandas/tests/scalar/test_timedelta.py index 64d4940082978..667266be2a89b 100644 --- a/pandas/tests/scalar/test_timedelta.py +++ b/pandas/tests/scalar/test_timedelta.py @@ -13,7 +13,6 @@ class TestTimedeltaArithmetic(object): - _multiprocess_can_split_ = True def test_arithmetic_overflow(self): with pytest.raises(OverflowError): @@ -286,7 +285,6 @@ def test_compare_timedelta_ndarray(self): class TestTimedeltas(object): - _multiprocess_can_split_ = True def setup_method(self, method): pass diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 3822ecd0a1b0e..0780c846a6c19 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -164,8 +164,6 @@ def test_apply_dict_depr(self): class TestSeriesAggregate(TestData): - _multiprocess_can_split_ = True - def test_transform(self): # transforming functions From 98f3937c3bec176fe0fe8e08bfa9d689a7fc45ce Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 5 Feb 2018 03:06:42 -0800 Subject: [PATCH 045/214] remove unused calendar options from period_helper (#19534) --- pandas/_libs/src/period_helper.c | 119 +++++++++++-------------------- pandas/_libs/src/period_helper.h | 4 -- pandas/_libs/tslibs/period.pyx | 1 - 3 files changed, 43 insertions(+), 81 deletions(-) diff --git a/pandas/_libs/src/period_helper.c b/pandas/_libs/src/period_helper.c index f1367978bd6c9..8f1c527a68455 100644 --- a/pandas/_libs/src/period_helper.c +++ b/pandas/_libs/src/period_helper.c @@ -47,13 +47,10 @@ static int days_in_month[2][12] = { {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}}; -/* Return 1/0 iff year points to a leap year in calendar. 
*/ -static int dInfoCalc_Leapyear(npy_int64 year, int calendar) { - if (calendar == GREGORIAN_CALENDAR) { - return (year % 4 == 0) && ((year % 100 != 0) || (year % 400 == 0)); - } else { - return (year % 4 == 0); - } +/* Return 1/0 iff year points to a leap year. + * Assumes GREGORIAN_CALENDAR */ +static int dInfoCalc_Leapyear(npy_int64 year) { + return (year % 4 == 0) && ((year % 100 != 0) || (year % 400 == 0)); } /* Return the day of the week for the given absolute date. */ @@ -71,40 +68,33 @@ static int dInfoCalc_DayOfWeek(npy_int64 absdate) { static int monthToQuarter(int month) { return ((month - 1) / 3) + 1; } /* Return the year offset, that is the absolute date of the day - 31.12.(year-1) in the given calendar. + 31.12.(year-1) + + Assumes GREGORIAN_CALENDAR + + This is equivalent to: + + (datetime(year, 1, 1) - datetime(1970, 1, 1)).days Note: For the Julian calendar we shift the absdate (which is measured using the Gregorian Epoch) value by two days because the Epoch (0001-01-01) in the Julian calendar lies 2 days before the Epoch in the Gregorian calendar. */ -static int dInfoCalc_YearOffset(npy_int64 year, int calendar) { +static int dInfoCalc_YearOffset(npy_int64 year) { year--; - if (calendar == GREGORIAN_CALENDAR) { - if (year >= 0 || -1 / 4 == -1) - return year * 365 + year / 4 - year / 100 + year / 400; - else - return year * 365 + (year - 3) / 4 - (year - 99) / 100 + + if (year >= 0 || -1 / 4 == -1) + return year * 365 + year / 4 - year / 100 + year / 400; + else + return year * 365 + (year - 3) / 4 - (year - 99) / 100 + (year - 399) / 400; - } else if (calendar == JULIAN_CALENDAR) { - if (year >= 0 || -1 / 4 == -1) - return year * 365 + year / 4 - 2; - else - return year * 365 + (year - 3) / 4 - 2; - } - Py_Error(PyExc_ValueError, "unknown calendar"); -onError: - return INT_ERR_CODE; } -/* Set the instance's value using the given date and time. calendar may be set - * to the flags: GREGORIAN_CALENDAR, JULIAN_CALENDAR to indicate the calendar - * to be used. */ - +/* Set the instance's value using the given date and time. + * Assumes GREGORIAN_CALENDAR */ static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, int year, int month, int day, int hour, - int minute, double second, - int calendar) { + int minute, double second) { /* Calculate the absolute date */ { int leap; @@ -116,7 +106,7 @@ static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, int year, PyExc_ValueError, "year out of range: %i", year); /* Is it a leap year ? 
*/ - leap = dInfoCalc_Leapyear(year, calendar); + leap = dInfoCalc_Leapyear(year); /* Negative month values indicate months relative to the years end */ if (month < 0) month += 13; @@ -128,7 +118,7 @@ static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, int year, Py_AssertWithArg(day >= 1 && day <= days_in_month[leap][month - 1], PyExc_ValueError, "day out of range: %i", day); - yearoffset = dInfoCalc_YearOffset(year, calendar); + yearoffset = dInfoCalc_YearOffset(year); if (yearoffset == INT_ERR_CODE) goto onError; absdate = day + month_offset[leap][month - 1] + yearoffset; @@ -142,8 +132,6 @@ static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, int year, dinfo->day_of_week = dInfoCalc_DayOfWeek(absdate); dinfo->day_of_year = (short)(absdate - yearoffset); - - dinfo->calendar = calendar; } /* Calculate the absolute time */ @@ -171,33 +159,27 @@ static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, int year, return INT_ERR_CODE; } -/* Sets the date part of the date_info struct using the indicated - calendar. +/* Sets the date part of the date_info struct + Assumes GREGORIAN_CALENDAR XXX This could also be done using some integer arithmetics rather than with this iterative approach... */ static int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, - npy_int64 absdate, int calendar) { + npy_int64 absdate) { register npy_int64 year; npy_int64 yearoffset; int leap, dayoffset; int *monthoffset; /* Approximate year */ - if (calendar == GREGORIAN_CALENDAR) { - year = (npy_int64)(((double)absdate) / 365.2425); - } else if (calendar == JULIAN_CALENDAR) { - year = (npy_int64)(((double)absdate) / 365.25); - } else { - Py_Error(PyExc_ValueError, "unknown calendar"); - } + year = (npy_int64)(((double)absdate) / 365.2425); if (absdate > 0) year++; /* Apply corrections to reach the correct year */ while (1) { /* Calculate the year offset */ - yearoffset = dInfoCalc_YearOffset(year, calendar); + yearoffset = dInfoCalc_YearOffset(year); if (yearoffset == INT_ERR_CODE) goto onError; /* Backward correction: absdate must be greater than the @@ -208,7 +190,7 @@ static int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, } dayoffset = absdate - yearoffset; - leap = dInfoCalc_Leapyear(year, calendar); + leap = dInfoCalc_Leapyear(year); /* Forward correction: non leap years only have 365 days */ if (dayoffset > 365 && !leap) { @@ -219,7 +201,6 @@ static int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, } dinfo->year = year; - dinfo->calendar = calendar; /* Now iterate to find the month */ monthoffset = month_offset[leap]; @@ -410,8 +391,7 @@ static npy_int64 DtoB_WeekendToFriday(npy_int64 absdate, int day_of_week) { static npy_int64 absdate_from_ymd(int y, int m, int d) { struct date_info tempDate; - if (dInfoCalc_SetFromDateAndTime(&tempDate, y, m, d, 0, 0, 0, - GREGORIAN_CALENDAR)) { + if (dInfoCalc_SetFromDateAndTime(&tempDate, y, m, d, 0, 0, 0)) { return INT_ERR_CODE; } return tempDate.absdate; @@ -423,8 +403,7 @@ static npy_int64 asfreq_DTtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) { struct date_info dinfo; ordinal = downsample_daytime(ordinal, af_info, 0); - if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, - GREGORIAN_CALENDAR)) + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET)) return INT_ERR_CODE; if (dinfo.month > af_info->to_a_year_end) { return (npy_int64)(dinfo.year + 1 - BASE_YEAR); @@ -436,8 +415,7 @@ static npy_int64 asfreq_DTtoA(npy_int64 ordinal, char relation, static npy_int64 
DtoQ_yq(npy_int64 ordinal, asfreq_info *af_info, int *year, int *quarter) { struct date_info dinfo; - if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, - GREGORIAN_CALENDAR)) + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET)) return INT_ERR_CODE; if (af_info->to_q_year_end != 12) { dinfo.month -= af_info->to_q_year_end; @@ -474,8 +452,7 @@ static npy_int64 asfreq_DTtoM(npy_int64 ordinal, char relation, ordinal = downsample_daytime(ordinal, af_info, 0); - if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, - GREGORIAN_CALENDAR)) + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET)) return INT_ERR_CODE; return (npy_int64)((dinfo.year - BASE_YEAR) * 12 + dinfo.month - 1); } @@ -493,8 +470,7 @@ static npy_int64 asfreq_DTtoB(npy_int64 ordinal, char relation, ordinal = downsample_daytime(ordinal, af_info, 0); - if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, - GREGORIAN_CALENDAR)) + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET)) return INT_ERR_CODE; if (relation == 'S') { @@ -595,8 +571,7 @@ static npy_int64 asfreq_WtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { struct date_info dinfo; if (dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_WtoDT(ordinal, relation, af_info) + ORD_OFFSET, - GREGORIAN_CALENDAR)) + &dinfo, asfreq_WtoDT(ordinal, relation, af_info) + ORD_OFFSET)) return INT_ERR_CODE; if (relation == 'S') { @@ -655,8 +630,7 @@ static npy_int64 asfreq_MtoB(npy_int64 ordinal, char relation, struct date_info dinfo; if (dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_MtoDT(ordinal, relation, af_info) + ORD_OFFSET, - GREGORIAN_CALENDAR)) + &dinfo, asfreq_MtoDT(ordinal, relation, af_info) + ORD_OFFSET)) return INT_ERR_CODE; if (relation == 'S') { @@ -731,8 +705,7 @@ static npy_int64 asfreq_QtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { struct date_info dinfo; if (dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_QtoDT(ordinal, relation, af_info) + ORD_OFFSET, - GREGORIAN_CALENDAR)) + &dinfo, asfreq_QtoDT(ordinal, relation, af_info) + ORD_OFFSET)) return INT_ERR_CODE; if (relation == 'S') { @@ -803,8 +776,7 @@ static npy_int64 asfreq_AtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { struct date_info dinfo; if (dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_AtoDT(ordinal, relation, af_info) + ORD_OFFSET, - GREGORIAN_CALENDAR)) + &dinfo, asfreq_AtoDT(ordinal, relation, af_info) + ORD_OFFSET)) return INT_ERR_CODE; if (relation == 'S') { @@ -1096,19 +1068,17 @@ static int dInfoCalc_SetFromAbsTime(struct date_info *dinfo, double abstime) { return 0; } -/* Set the instance's value using the given date and time. calendar - may be set to the flags: GREGORIAN_CALENDAR, JULIAN_CALENDAR to - indicate the calendar to be used. */ +/* Set the instance's value using the given date and time. + Assumes GREGORIAN_CALENDAR. 
*/ static int dInfoCalc_SetFromAbsDateTime(struct date_info *dinfo, - npy_int64 absdate, double abstime, - int calendar) { + npy_int64 absdate, double abstime) { /* Bounds check */ Py_AssertWithArg(abstime >= 0.0 && abstime <= SECONDS_PER_DAY, PyExc_ValueError, "abstime out of range (0.0 - 86400.0): %f", abstime); /* Calculate the date */ - if (dInfoCalc_SetFromAbsDate(dinfo, absdate, calendar)) goto onError; + if (dInfoCalc_SetFromAbsDate(dinfo, absdate)) goto onError; /* Calculate the time */ if (dInfoCalc_SetFromAbsTime(dinfo, abstime)) goto onError; @@ -1356,8 +1326,7 @@ static int _ISOWeek(struct date_info *dinfo) { /* Verify */ if (week < 0) { /* The day lies in last week of the previous year */ - if ((week > -2) || (week == -2 && dInfoCalc_Leapyear(dinfo->year - 1, - dinfo->calendar))) + if ((week > -2) || (week == -2 && dInfoCalc_Leapyear(dinfo->year - 1))) week = 53; else week = 52; @@ -1384,8 +1353,7 @@ int get_date_info(npy_int64 ordinal, int freq, struct date_info *dinfo) { absdate += 1; } - if (dInfoCalc_SetFromAbsDateTime(dinfo, absdate, abstime, - GREGORIAN_CALENDAR)) + if (dInfoCalc_SetFromAbsDateTime(dinfo, absdate, abstime)) return INT_ERR_CODE; return 0; @@ -1480,7 +1448,6 @@ int pdays_in_month(npy_int64 ordinal, int freq) { if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) return INT_ERR_CODE; - days = days_in_month[dInfoCalc_Leapyear(dinfo.year, dinfo.calendar)] - [dinfo.month - 1]; + days = days_in_month[dInfoCalc_Leapyear(dinfo.year)][dinfo.month - 1]; return days; } diff --git a/pandas/_libs/src/period_helper.h b/pandas/_libs/src/period_helper.h index 35dd20848a2ec..d3d32f81d1f66 100644 --- a/pandas/_libs/src/period_helper.h +++ b/pandas/_libs/src/period_helper.h @@ -24,9 +24,6 @@ frequency conversion routines. * declarations from period here */ -#define GREGORIAN_CALENDAR 0 -#define JULIAN_CALENDAR 1 - #define SECONDS_PER_DAY ((double)86400.0) #define Py_AssertWithArg(x, errortype, errorstr, a1) \ @@ -138,7 +135,6 @@ typedef struct date_info { int year; int day_of_week; int day_of_year; - int calendar; } date_info; typedef npy_int64 (*freq_conv_func)(npy_int64, char, asfreq_info *); diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index e2caebe4c4afc..5098e5c9100ff 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -59,7 +59,6 @@ cdef extern from "period_helper.h": int year int day_of_week int day_of_year - int calendar ctypedef struct asfreq_info: int from_week_end From 5b58a20504aeb3efe8858164377edc0e4f02ae02 Mon Sep 17 00:00:00 2001 From: discort Date: Mon, 5 Feb 2018 06:12:02 -0500 Subject: [PATCH 046/214] BUG: groupby with resample using on parameter errors when selecting column to apply function closes #17813 Author: discort Closes #19433 from discort/fix_17813 and squashes the following commits: 2f25d40a0 [discort] Fixed bug in df.resample using 'on' parameter --- doc/source/whatsnew/v0.23.0.txt | 8 ++++++-- pandas/core/groupby.py | 18 +++++++++++++++--- pandas/tests/test_resample.py | 9 +++++++++ 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 0ac27a2f23386..b3905824f7e44 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -289,6 +289,8 @@ Convert to an xarray DataArray p.to_xarray() +.. 
_whatsnew_0230.api_breaking.build_changes: + Build Changes ^^^^^^^^^^^^^ @@ -296,6 +298,8 @@ Build Changes - Building from source now explicitly requires ``setuptools`` in ``setup.py`` (:issue:`18113`) - Updated conda recipe to be in compliance with conda-build 3.0+ (:issue:`18002`) +.. _whatsnew_0230.api_breaking.extract: + Extraction of matching patterns from strings ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -594,8 +598,8 @@ Groupby/Resample/Rolling - Fixed regression in :func:`DataFrame.groupby` which would not emit an error when called with a tuple key not in the index (:issue:`18798`) - Bug in :func:`DataFrame.resample` which silently ignored unsupported (or mistyped) options for ``label``, ``closed`` and ``convention`` (:issue:`19303`) - Bug in :func:`DataFrame.groupby` where tuples were interpreted as lists of keys rather than as keys (:issue:`17979`, :issue:`18249`) -- Bug in ``transform`` where particular aggregation functions were being incorrectly cast to match the dtype(s) of the grouped data (:issue:`19200`) -- +- Bug in :func:`DataFrame.transform` where particular aggregation functions were being incorrectly cast to match the dtype(s) of the grouped data (:issue:`19200`) +- Bug in :func:`DataFrame.groupby` passing the `on=` kwarg, and subsequently using ``.apply()`` (:issue:`17813`) Sparse ^^^^^^ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 88af80e295d74..ab0070777c190 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -37,6 +37,7 @@ _ensure_categorical, _ensure_float) from pandas.core.dtypes.cast import maybe_downcast_to_dtype +from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.missing import isna, notna, _maybe_fill from pandas.core.base import (PandasObject, SelectionMixin, GroupByError, @@ -423,6 +424,7 @@ def __init__(self, key=None, level=None, freq=None, axis=0, sort=False): self.obj = None self.indexer = None self.binner = None + self._grouper = None @property def ax(self): @@ -465,12 +467,22 @@ def _set_grouper(self, obj, sort=False): raise ValueError( "The Grouper cannot specify both a key and a level!") + # Keep self.grouper value before overriding + if self._grouper is None: + self._grouper = self.grouper + # the key must be a valid info item if self.key is not None: key = self.key - if key not in obj._info_axis: - raise KeyError("The grouper name {0} is not found".format(key)) - ax = Index(obj[key], name=key) + # The 'on' is already defined + if getattr(self.grouper, 'name', None) == key and \ + isinstance(obj, ABCSeries): + ax = self._grouper.take(obj.index) + else: + if key not in obj._info_axis: + raise KeyError( + "The grouper name {0} is not found".format(key)) + ax = Index(obj[key], name=key) else: ax = obj._get_axis(self.axis) diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index a5aaa328a8e06..2de890ea459f0 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -3077,6 +3077,15 @@ def test_getitem_multiple(self): result = r['buyer'].count() assert_series_equal(result, expected) + def test_groupby_resample_on_api_with_getitem(self): + # GH 17813 + df = pd.DataFrame({'id': list('aabbb'), + 'date': pd.date_range('1-1-2016', periods=5), + 'data': 1}) + exp = df.set_index('date').groupby('id').resample('2D')['data'].sum() + result = df.groupby('id').resample('2D', on='date')['data'].sum() + assert_series_equal(result, exp) + def test_nearest(self): # GH 17496 From d5a7e7c947325554d4ee3c4e3755c878610d354c Mon Sep 17 00:00:00 2001 From: Pietro 
Battiston Date: Mon, 5 Feb 2018 06:35:03 -0500 Subject: [PATCH 047/214] TST: Fix makeIntIndex, benchmark get loc Author: Pietro Battiston Closes #19483 from toobaz/test_get_loc and squashes the following commits: 51d691106 [Pietro Battiston] TST: benchmark get_loc in various cases d424f63df [Pietro Battiston] TST: produce unsorted integer index (consistently with other types) --- asv_bench/benchmarks/index_object.py | 17 +++++++++++++++++ pandas/tests/indexes/test_base.py | 16 +++++++++------- pandas/tests/indexing/test_floats.py | 15 +++++++-------- 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 970760373632a..f1703e163917a 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -147,6 +147,11 @@ def setup(self, dtype): self.idx = getattr(tm, 'make{}Index'.format(dtype))(N) self.array_mask = (np.arange(N) % 3) == 0 self.series_mask = Series(self.array_mask) + self.sorted = self.idx.sort_values() + half = N // 2 + self.non_unique = self.idx[:half].append(self.idx[:half]) + self.non_unique_sorted = self.sorted[:half].append(self.sorted[:half]) + self.key = self.sorted[N // 4] def time_boolean_array(self, dtype): self.idx[self.array_mask] @@ -163,6 +168,18 @@ def time_slice(self, dtype): def time_slice_step(self, dtype): self.idx[::2] + def time_get_loc(self, dtype): + self.idx.get_loc(self.key) + + def time_get_loc_sorted(self, dtype): + self.sorted.get_loc(self.key) + + def time_get_loc_non_unique(self, dtype): + self.non_unique.get_loc(self.key) + + def time_get_loc_non_unique_sorted(self, dtype): + self.non_unique_sorted.get_loc(self.key) + class Float64IndexMethod(object): # GH 13166 diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 974099f1fbbe9..90edcb526bb2e 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -830,15 +830,16 @@ def test_map_with_tuples(self): # Test that returning a single tuple from an Index # returns an Index. - boolean_index = tm.makeIntIndex(3).map(lambda x: (x,)) - expected = Index([(0,), (1,), (2,)]) - tm.assert_index_equal(boolean_index, expected) + idx = tm.makeIntIndex(3) + result = tm.makeIntIndex(3).map(lambda x: (x,)) + expected = Index([(i,) for i in idx]) + tm.assert_index_equal(result, expected) # Test that returning a tuple from a map of a single index # returns a MultiIndex object. - boolean_index = tm.makeIntIndex(3).map(lambda x: (x, x == 1)) - expected = MultiIndex.from_tuples([(0, False), (1, True), (2, False)]) - tm.assert_index_equal(boolean_index, expected) + result = idx.map(lambda x: (x, x == 1)) + expected = MultiIndex.from_tuples([(i, i == 1) for i in idx]) + tm.assert_index_equal(result, expected) # Test that returning a single object from a MultiIndex # returns an Index. 
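The comments in that hunk state the mapping rule the test exercises; the same rule in a tiny standalone form, with literal values standing in for the ``tm.makeIntIndex`` fixture:

    import pandas as pd

    idx = pd.Index([0, 1, 2])
    idx.map(lambda x: (x,))          # 1-tuples: result is a plain Index of tuples
    idx.map(lambda x: (x, x == 1))   # longer tuples: result is a MultiIndex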
@@ -870,7 +871,8 @@ def test_map_tseries_indices_return_index(self): def test_map_dictlike(self, mapper): # GH 12756 expected = Index(['foo', 'bar', 'baz']) - result = tm.makeIntIndex(3).map(mapper(expected.values, [0, 1, 2])) + idx = tm.makeIntIndex(3) + result = idx.map(mapper(expected.values, idx)) tm.assert_index_equal(result, expected) for name in self.indices.keys(): diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index d2692c7dc302e..e3f93924aca0d 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -4,7 +4,8 @@ from warnings import catch_warnings import numpy as np -from pandas import Series, DataFrame, Index, Float64Index +from pandas import (Series, DataFrame, Index, Float64Index, Int64Index, + RangeIndex) from pandas.util.testing import assert_series_equal, assert_almost_equal import pandas.util.testing as tm @@ -206,9 +207,8 @@ def test_scalar_integer(self): # test how scalar float indexers work on int indexes # integer index - for index in [tm.makeIntIndex, tm.makeRangeIndex]: + for i in [Int64Index(range(5)), RangeIndex(5)]: - i = index(5) for s in [Series(np.arange(len(i))), DataFrame(np.random.randn(len(i), len(i)), index=i, columns=i)]: @@ -362,9 +362,9 @@ def test_slice_integer(self): # these coerce to a like integer # oob indicates if we are out of bounds # of positional indexing - for index, oob in [(tm.makeIntIndex(5), False), - (tm.makeRangeIndex(5), False), - (tm.makeIntIndex(5) + 10, True)]: + for index, oob in [(Int64Index(range(5)), False), + (RangeIndex(5), False), + (Int64Index(range(5)) + 10, True)]: # s is an in-range index s = Series(range(5), index=index) @@ -486,9 +486,8 @@ def f(): def test_slice_integer_frame_getitem(self): # similar to above, but on the getitem dim (of a DataFrame) - for index in [tm.makeIntIndex, tm.makeRangeIndex]: + for index in [Int64Index(range(5)), RangeIndex(5)]: - index = index(5) s = DataFrame(np.random.randn(5, 2), index=index) def f(idxr): From f391cbfe57fb4e334e9f06d49073dc1ca25eb1e1 Mon Sep 17 00:00:00 2001 From: Pepe Flores Date: Mon, 5 Feb 2018 20:43:02 +0200 Subject: [PATCH 048/214] DOC: Fix typo in example (#19537) Fix typo in the example for pandas.io.formats.style.Styler.format --- pandas/io/formats/style.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 58796aa30f0bf..20e72dd6bde91 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -364,7 +364,7 @@ def format(self, formatter, subset=None): >>> df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b']) >>> df.style.format("{:.2%}") >>> df['c'] = ['a', 'b', 'c', 'd'] - >>> df.style.format({'C': str.upper}) + >>> df.style.format({'c': str.upper}) """ if subset is None: row_locs = range(len(self.data)) From a01f74cf27314817acff6289f36b6eba9c49fb6c Mon Sep 17 00:00:00 2001 From: Matthew Kirk Date: Mon, 5 Feb 2018 20:24:00 -0500 Subject: [PATCH 049/214] BUG: don't assume series is length > 0 closes #19368 Author: Matthew Kirk Closes #19438 from hexgnu/segfault_memory_usage and squashes the following commits: f9433d844 [Matthew Kirk] Use shared docstring and get rid of if condition 4ead141c0 [Matthew Kirk] Move whatsnew doc to Sparse ae9f74d58 [Matthew Kirk] Revert base.py cdd4141e4 [Matthew Kirk] Fix linting error 93a0c3daa [Matthew Kirk] Merge remote-tracking branch 'upstream/master' into segfault_memory_usage 207bc74d2 [Matthew Kirk] Define memory_usage on SparseArray 21ae14707 
[Matthew Kirk] FIX: revert change to lib.pyx 3f52a44f6 [Matthew Kirk] Ah ha I think I got it 5e59e9cbc [Matthew Kirk] Use range over 0 <= for loops e25158713 [Matthew Kirk] Fix failing test with indexing 27df317be [Matthew Kirk] Merge remote-tracking branch 'upstream/master' into segfault_memory_usage 7fdd03e94 [Matthew Kirk] Take out comment and use product 6bd6ddd02 [Matthew Kirk] BUG: don't assume series is length > 0 --- doc/source/whatsnew/v0.23.0.txt | 2 +- pandas/core/base.py | 2 +- pandas/core/sparse/array.py | 16 ++++++++++++++-- pandas/tests/sparse/series/test_series.py | 13 +++++++++++++ 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index b3905824f7e44..e4f00990d28c0 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -606,7 +606,7 @@ Sparse - Bug in which creating a ``SparseDataFrame`` from a dense ``Series`` or an unsupported type raised an uncontrolled exception (:issue:`19374`) - Bug in :class:`SparseDataFrame.to_csv` causing exception (:issue:`19384`) -- +- Bug in :class:`SparseSeries.memory_usage` which caused segfault by accessing non sparse elements (:issue:`19368`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/base.py b/pandas/core/base.py index 54d25a16a10a3..d5b204dba063e 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1048,7 +1048,7 @@ def is_monotonic_decreasing(self): def memory_usage(self, deep=False): """ - Memory usage of my values + Memory usage of the values Parameters ---------- diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index fa07400a0706e..65aefd9fb8c0a 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -8,10 +8,10 @@ import warnings import pandas as pd -from pandas.core.base import PandasObject +from pandas.core.base import PandasObject, IndexOpsMixin from pandas import compat -from pandas.compat import range +from pandas.compat import range, PYPY from pandas.compat.numpy import function as nv from pandas.core.dtypes.generic import ABCSparseSeries @@ -30,6 +30,7 @@ from pandas.core.dtypes.missing import isna, notna, na_value_for_dtype import pandas._libs.sparse as splib +import pandas._libs.lib as lib from pandas._libs.sparse import SparseIndex, BlockIndex, IntIndex from pandas._libs import index as libindex import pandas.core.algorithms as algos @@ -238,6 +239,17 @@ def kind(self): elif isinstance(self.sp_index, IntIndex): return 'integer' + @Appender(IndexOpsMixin.memory_usage.__doc__) + def memory_usage(self, deep=False): + values = self.sp_values + + v = values.nbytes + + if deep and is_object_dtype(self) and not PYPY: + v += lib.memory_usage_of_objects(values) + + return v + def __array_wrap__(self, out_arr, context=None): """ NumPy calls this method when ufunc is applied diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index 2ea1e63433520..3f5d5a59cc540 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -23,6 +23,8 @@ from pandas.core.sparse.api import SparseSeries from pandas.tests.series.test_api import SharedWithSparse +from itertools import product + def _test_data1(): # nan-based @@ -971,6 +973,17 @@ def test_combine_first(self): tm.assert_sp_series_equal(result, result2) tm.assert_sp_series_equal(result, expected) + @pytest.mark.parametrize('deep,fill_values', [([True, False], + [0, 1, np.nan, None])]) + def test_memory_usage_deep(self, deep, fill_values): + 
for deep, fill_value in product(deep, fill_values): + sparse_series = SparseSeries(fill_values, fill_value=fill_value) + dense_series = Series(fill_values) + sparse_usage = sparse_series.memory_usage(deep=deep) + dense_usage = dense_series.memory_usage(deep=deep) + + assert sparse_usage < dense_usage + class TestSparseHandlingMultiIndexes(object): From ed10bf618b93726c61ed9b3ebbc3031416bc1263 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Mon, 5 Feb 2018 20:29:15 -0500 Subject: [PATCH 050/214] TST: fix and test index division by zero Related: #19336 Author: Brock Mendel Closes #19347 from jbrockmendel/div_zero2 and squashes the following commits: be1e2e1b8 [Brock Mendel] move fixture to conftest 64b0c0853 [Brock Mendel] Merge branch 'master' of https://github.com/pandas-dev/pandas into div_zero2 aa969f8d2 [Brock Mendel] Merge branch 'master' of https://github.com/pandas-dev/pandas into div_zero2 000aefde0 [Brock Mendel] fix long again 9de356ab0 [Brock Mendel] revert fixture to fix test_range failures b8cf21d3e [Brock Mendel] flake8 remove unused import afedba98b [Brock Mendel] whatsnew clarification b51c2e14c [Brock Mendel] fixturize 37efd5108 [Brock Mendel] make zero a fixture 965f7214e [Brock Mendel] Merge branch 'master' of https://github.com/pandas-dev/pandas into div_zero2 d648ef698 [Brock Mendel] requested edits 1ef3a6c74 [Brock Mendel] Merge branch 'master' of https://github.com/pandas-dev/pandas into div_zero2 78de1a4df [Brock Mendel] Merge branch 'master' of https://github.com/pandas-dev/pandas into div_zero2 0277d9fca [Brock Mendel] add ipython output to whatsnew 5d7e3ea0c [Brock Mendel] Merge branch 'master' of https://github.com/pandas-dev/pandas into div_zero2 ea75c3ca0 [Brock Mendel] ipython block 6fc61bd99 [Brock Mendel] elaborate docstring ca3bf4241 [Brock Mendel] Whatsnew section cd543497c [Brock Mendel] move dispatch_missing to core.missing 06df02a89 [Brock Mendel] py3 fix 84c74c54a [Brock Mendel] remove operator.div for py3 6acc2f78a [Brock Mendel] fix missing import e0e89b978 [Brock Mendel] fix and and tests for divmod 969f342e1 [Brock Mendel] fix and test index division by zero --- doc/source/whatsnew/v0.23.0.txt | 44 +++++++++++++++ pandas/core/indexes/base.py | 2 + pandas/core/indexes/range.py | 31 +++++------ pandas/core/missing.py | 82 ++++++++++++++++++++++++++++ pandas/tests/indexes/conftest.py | 18 +++++- pandas/tests/indexes/test_numeric.py | 42 ++++++++++++++ 6 files changed, 200 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index e4f00990d28c0..ea56ebad7d782 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -204,6 +204,50 @@ Please note that the string `index` is not supported with the round trip format, new_df print(new_df.index.name) +.. _whatsnew_0230.enhancements.index_division_by_zero: + +Index Division By Zero Fills Correctly +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Division operations on ``Index`` and subclasses will now fill division of positive numbers by zero with ``np.inf``, division of negative numbers by zero with ``-np.inf`` and `0 / 0` with ``np.nan``. This matches existing ``Series`` behavior. (:issue:`19322`, :issue:`19347`) + +Previous Behavior: + +.. 
code-block:: ipython + + In [6]: index = pd.Int64Index([-1, 0, 1]) + + In [7]: index / 0 + Out[7]: Int64Index([0, 0, 0], dtype='int64') + + # Previous behavior yielded different results depending on the type of zero in the divisor + In [8]: index / 0.0 + Out[8]: Float64Index([-inf, nan, inf], dtype='float64') + + In [9]: index = pd.UInt64Index([0, 1]) + + In [10]: index / np.array([0, 0], dtype=np.uint64) + Out[10]: UInt64Index([0, 0], dtype='uint64') + + In [11]: pd.RangeIndex(1, 5) / 0 + ZeroDivisionError: integer division or modulo by zero + +Current Behavior: + +.. ipython:: python + + index = pd.Int64Index([-1, 0, 1]) + # division by zero gives -infinity where negative, +infinity where positive, and NaN for 0 / 0 + index / 0 + + # The result of division by zero should not depend on whether the zero is int or float + index / 0.0 + + index = pd.UInt64Index([0, 1]) + index / np.array([0, 0], dtype=np.uint64) + + pd.RangeIndex(1, 5) / 0 + .. _whatsnew_0230.enhancements.other: Other Enhancements diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 626f3dc86556a..1e1bb0d49b3df 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4040,6 +4040,8 @@ def _evaluate_numeric_binop(self, other): attrs = self._maybe_update_attributes(attrs) with np.errstate(all='ignore'): result = op(values, other) + + result = missing.dispatch_missing(op, values, other, result) return constructor(result, **attrs) return _evaluate_numeric_binop diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index a82ee6b2b44af..0ed92a67c7e14 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -550,7 +550,7 @@ def __getitem__(self, key): return super_getitem(key) def __floordiv__(self, other): - if is_integer(other): + if is_integer(other) and other != 0: if (len(self) == 0 or self._start % other == 0 and self._step % other == 0): @@ -592,14 +592,15 @@ def _evaluate_numeric_binop(self, other): attrs = self._get_attributes_dict() attrs = self._maybe_update_attributes(attrs) + left, right = self, other if reversed: - self, other = other, self + left, right = right, left try: # apply if we have an override if step: with np.errstate(all='ignore'): - rstep = step(self._step, other) + rstep = step(left._step, right) # we don't have a representable op # so return a base index @@ -607,11 +608,11 @@ def _evaluate_numeric_binop(self, other): raise ValueError else: - rstep = self._step + rstep = left._step with np.errstate(all='ignore'): - rstart = op(self._start, other) - rstop = op(self._stop, other) + rstart = op(left._start, right) + rstop = op(left._stop, right) result = RangeIndex(rstart, rstop, @@ -627,18 +628,12 @@ def _evaluate_numeric_binop(self, other): return result - except (ValueError, TypeError, AttributeError): - pass - - # convert to Int64Index ops - if isinstance(self, RangeIndex): - self = self.values - if isinstance(other, RangeIndex): - other = other.values - - with np.errstate(all='ignore'): - results = op(self, other) - return Index(results, **attrs) + except (ValueError, TypeError, AttributeError, + ZeroDivisionError): + # Defer to Int64Index implementation + if reversed: + return op(other, self._int64index) + return op(self._int64index, other) return _evaluate_numeric_binop diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 2eccc5777bca6..31c489e2f8941 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -1,6 +1,7 @@ """ Routines for filling missing data """ +import operator import numpy 
as np
 from distutils.version import LooseVersion
@@ -650,6 +651,87 @@ def fill_zeros(result, x, y, name, fill):
     return result
 
 
+def mask_zero_div_zero(x, y, result, copy=False):
+    """
+    Set results of 0 / 0 or 0 // 0 to np.nan, regardless of the dtypes
+    of the numerator or the denominator.
+
+    Parameters
+    ----------
+    x : ndarray
+    y : ndarray
+    result : ndarray
+    copy : bool (default False)
+        Whether to always create a new array or try to fill in the existing
+        array if possible.
+
+    Returns
+    -------
+    filled_result : ndarray
+
+    Examples
+    --------
+    >>> x = np.array([1, 0, -1], dtype=np.int64)
+    >>> y = 0       # int 0; numpy behavior is different with float
+    >>> result = x / y
+    >>> result      # raw numpy result does not fill division by zero
+    array([0, 0, 0])
+    >>> mask_zero_div_zero(x, y, result)
+    array([ inf,  nan, -inf])
+    """
+    if is_scalar(y):
+        y = np.array(y)
+
+    zmask = y == 0
+    if zmask.any():
+        shape = result.shape
+
+        nan_mask = (zmask & (x == 0)).ravel()
+        neginf_mask = (zmask & (x < 0)).ravel()
+        posinf_mask = (zmask & (x > 0)).ravel()
+
+        if nan_mask.any() or neginf_mask.any() or posinf_mask.any():
+            # Fill negative/0 with -inf, positive/0 with +inf, 0/0 with NaN
+            result = result.astype('float64', copy=copy).ravel()
+
+            np.putmask(result, nan_mask, np.nan)
+            np.putmask(result, posinf_mask, np.inf)
+            np.putmask(result, neginf_mask, -np.inf)
+
+            result = result.reshape(shape)
+
+    return result
+
+
+def dispatch_missing(op, left, right, result):
+    """
+    Fill nulls caused by division by zero, casting to a different dtype
+    if necessary.
+
+    Parameters
+    ----------
+    op : function (operator.add, operator.div, ...)
+    left : object (Index for non-reversed ops)
+    right : object (Index for reversed ops)
+    result : ndarray
+
+    Returns
+    -------
+    result : ndarray
+    """
+    opstr = '__{opname}__'.format(opname=op.__name__).replace('____', '__')
+    if op in [operator.truediv, operator.floordiv,
+              getattr(operator, 'div', None)]:
+        result = mask_zero_div_zero(left, right, result)
+    elif op is operator.mod:
+        result = fill_zeros(result, left, right, opstr, np.nan)
+    elif op is divmod:
+        res0 = mask_zero_div_zero(left, right, result[0])
+        res1 = fill_zeros(result[1], left, right, opstr, np.nan)
+        result = (res0, res1)
+    return result
+
+
 def _interp_limit(invalid, fw_limit, bw_limit):
     """
     Get indexers of values that won't be filled
diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py
index 217ee07affa84..6d88ef0cfa6c5 100644
--- a/pandas/tests/indexes/conftest.py
+++ b/pandas/tests/indexes/conftest.py
@@ -1,9 +1,10 @@
 import pytest
 import numpy as np
 
+import pandas as pd
 import pandas.util.testing as tm
 from pandas.core.indexes.api import Index, MultiIndex
-from pandas.compat import lzip
+from pandas.compat import lzip, long
 
 
 @pytest.fixture(params=[tm.makeUnicodeIndex(100),
@@ -29,3 +30,18 @@ def indices(request):
 def one(request):
     # zero-dim integer array behaves like an integer
     return request.param
+
+
+zeros = [box([0] * 5, dtype=dtype)
+         for box in [pd.Index, np.array]
+         for dtype in [np.int64, np.uint64, np.float64]]
+zeros.extend([np.array(0, dtype=dtype)
+              for dtype in [np.int64, np.uint64, np.float64]])
+zeros.extend([0, 0.0, long(0)])
+
+
+@pytest.fixture(params=zeros)
+def zero(request):
+    # For testing division by (or of) zero for Index with length 5, this
+    # gives several scalar-zeros and length-5 vector-zeros
+    return request.param
diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py
index 
0c1bec7a6f1a9..c6883df7ee91a 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -157,6 +157,48 @@ def test_divmod_series(self): for r, e in zip(result, expected): tm.assert_series_equal(r, e) + def test_div_zero(self, zero): + idx = self.create_index() + + expected = Index([np.nan, np.inf, np.inf, np.inf, np.inf], + dtype=np.float64) + result = idx / zero + tm.assert_index_equal(result, expected) + ser_compat = Series(idx).astype('i8') / np.array(zero).astype('i8') + tm.assert_series_equal(ser_compat, Series(result)) + + def test_floordiv_zero(self, zero): + idx = self.create_index() + expected = Index([np.nan, np.inf, np.inf, np.inf, np.inf], + dtype=np.float64) + + result = idx // zero + tm.assert_index_equal(result, expected) + ser_compat = Series(idx).astype('i8') // np.array(zero).astype('i8') + tm.assert_series_equal(ser_compat, Series(result)) + + def test_mod_zero(self, zero): + idx = self.create_index() + + expected = Index([np.nan, np.nan, np.nan, np.nan, np.nan], + dtype=np.float64) + result = idx % zero + tm.assert_index_equal(result, expected) + ser_compat = Series(idx).astype('i8') % np.array(zero).astype('i8') + tm.assert_series_equal(ser_compat, Series(result)) + + def test_divmod_zero(self, zero): + idx = self.create_index() + + exleft = Index([np.nan, np.inf, np.inf, np.inf, np.inf], + dtype=np.float64) + exright = Index([np.nan, np.nan, np.nan, np.nan, np.nan], + dtype=np.float64) + + result = divmod(idx, zero) + tm.assert_index_equal(result[0], exleft) + tm.assert_index_equal(result[1], exright) + def test_explicit_conversions(self): # GH 8608 From 672f5a151886a8bf457ac499d9a9a471689ee9ff Mon Sep 17 00:00:00 2001 From: Sam Foo Date: Tue, 6 Feb 2018 05:15:50 -0500 Subject: [PATCH 051/214] DOC: Remove repeated duplicated word (#19546) --- doc/source/advanced.rst | 2 +- doc/source/comparison_with_sas.rst | 4 ++-- doc/source/computation.rst | 2 +- doc/source/io.rst | 2 +- doc/source/release.rst | 10 +++++----- doc/source/tutorials.rst | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 25f7c5a3ad948..ca903dadc6eb1 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -672,7 +672,7 @@ The ``CategoricalIndex`` is **preserved** after indexing: df2.loc['a'].index Sorting the index will sort by the order of the categories (Recall that we -created the index with with ``CategoricalDtype(list('cab'))``, so the sorted +created the index with ``CategoricalDtype(list('cab'))``, so the sorted order is ``cab``.). .. ipython:: python diff --git a/doc/source/comparison_with_sas.rst b/doc/source/comparison_with_sas.rst index e9e0d7716af3a..214667119f7e0 100644 --- a/doc/source/comparison_with_sas.rst +++ b/doc/source/comparison_with_sas.rst @@ -279,7 +279,7 @@ date/datetime columns. The equivalent pandas operations are shown below. In addition to these functions pandas supports other Time Series features -not available in Base SAS (such as resampling and and custom offsets) - +not available in Base SAS (such as resampling and custom offsets) - see the :ref:`timeseries documentation` for more details. .. ipython:: python @@ -584,7 +584,7 @@ For example, in SAS you could do this to filter missing values. if value_x ^= .; run; -Which doesn't work in in pandas. Instead, the ``pd.isna`` or ``pd.notna`` functions +Which doesn't work in pandas. Instead, the ``pd.isna`` or ``pd.notna`` functions should be used for comparisons. .. 
ipython:: python diff --git a/doc/source/computation.rst b/doc/source/computation.rst index a64542fa71705..4285767654e25 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -512,7 +512,7 @@ a same sized result as the input. When using ``.resample()`` with an offset. Construct a new index that is the frequency of the offset. For each frequency bin, aggregate points from the input within a backwards-in-time looking window that fall in that bin. The result of this -aggregation is the output for that frequency point. The windows are fixed size size in the frequency space. Your result +aggregation is the output for that frequency point. The windows are fixed size in the frequency space. Your result will have the shape of a regular frequency between the min and the max of the original input object. To summarize, ``.rolling()`` is a time-based window operation, while ``.resample()`` is a frequency-based window operation. diff --git a/doc/source/io.rst b/doc/source/io.rst index 60dc89f8fd495..1785de54b7dd6 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4529,7 +4529,7 @@ Several caveats. on an attempt at serialization. You can specify an ``engine`` to direct the serialization. This can be one of ``pyarrow``, or ``fastparquet``, or ``auto``. -If the engine is NOT specified, then the ``pd.options.io.parquet.engine`` option is checked; if this is also ``auto``, then +If the engine is NOT specified, then the ``pd.options.io.parquet.engine`` option is checked; if this is also ``auto``, then ``pyarrow`` is tried, and falling back to ``fastparquet``. See the documentation for `pyarrow `__ and `fastparquet `__ diff --git a/doc/source/release.rst b/doc/source/release.rst index cd763de42d162..8e063116cbf07 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -406,7 +406,7 @@ of all enhancements and bugs that have been fixed in 0.20.1. .. note:: - This is a combined release for 0.20.0 and and 0.20.1. + This is a combined release for 0.20.0 and 0.20.1. Version 0.20.1 contains one additional change for backwards-compatibility with downstream projects using pandas' ``utils`` routines. (:issue:`16250`) Thanks @@ -2918,7 +2918,7 @@ Improvements to existing features - clipboard functions use pyperclip (no dependencies on Windows, alternative dependencies offered for Linux) (:issue:`3837`). - Plotting functions now raise a ``TypeError`` before trying to plot anything - if the associated objects have have a dtype of ``object`` (:issue:`1818`, + if the associated objects have a dtype of ``object`` (:issue:`1818`, :issue:`3572`, :issue:`3911`, :issue:`3912`), but they will try to convert object arrays to numeric arrays if possible so that you can still plot, for example, an object array with floats. This happens before any drawing takes place which @@ -4082,7 +4082,7 @@ Bug Fixes columns (:issue:`1943`) - Fix time zone localization bug causing improper fields (e.g. 
hours) in time zones that have not had a UTC transition in a long time (:issue:`1946`) -- Fix errors when parsing and working with with fixed offset timezones +- Fix errors when parsing and working with fixed offset timezones (:issue:`1922`, :issue:`1928`) - Fix text parser bug when handling UTC datetime objects generated by dateutil (:issue:`1693`) @@ -4383,7 +4383,7 @@ Bug Fixes error (:issue:`1090`) - Consistently set name on groupby pieces (:issue:`184`) - Treat dict return values as Series in GroupBy.apply (:issue:`823`) -- Respect column selection for DataFrame in in GroupBy.transform (:issue:`1365`) +- Respect column selection for DataFrame in GroupBy.transform (:issue:`1365`) - Fix MultiIndex partial indexing bug (:issue:`1352`) - Enable assignment of rows in mixed-type DataFrame via .ix (:issue:`1432`) - Reset index mapping when grouping Series in Cython (:issue:`1423`) @@ -5040,7 +5040,7 @@ New Features - Add `melt` function to `pandas.core.reshape` - Add `level` parameter to group by level in Series and DataFrame descriptive statistics (:issue:`313`) -- Add `head` and `tail` methods to Series, analogous to to DataFrame (PR +- Add `head` and `tail` methods to Series, analogous to DataFrame (PR :issue:`296`) - Add `Series.isin` function which checks if each value is contained in a passed sequence (:issue:`289`) diff --git a/doc/source/tutorials.rst b/doc/source/tutorials.rst index 43ccd372d9d5b..710212bc237cd 100644 --- a/doc/source/tutorials.rst +++ b/doc/source/tutorials.rst @@ -19,7 +19,7 @@ pandas Cookbook The goal of this cookbook (by `Julia Evans `_) is to give you some concrete examples for getting started with pandas. These are examples with real-world data, and all the bugs and weirdness that -that entails. +entails. Here are links to the v0.1 release. For an up-to-date table of contents, see the `pandas-cookbook GitHub repository `_. 
To run the examples in this tutorial, you'll need to From a22acc2961bc6719f11a2900e004982e55007401 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 6 Feb 2018 03:20:35 -0800 Subject: [PATCH 052/214] centralize and split frame division tests (#19527) --- pandas/tests/frame/test_arithmetic.py | 122 +++++++++++++++++++++++++- pandas/tests/frame/test_operators.py | 70 --------------- pandas/tests/frame/test_timeseries.py | 9 -- 3 files changed, 121 insertions(+), 80 deletions(-) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 9b99a7b73b82b..1bb8e8edffc6e 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1,8 +1,9 @@ # -*- coding: utf-8 -*- - import pytest import numpy as np +from pandas.compat import range + import pandas as pd import pandas.util.testing as tm @@ -58,10 +59,129 @@ def test_df_flex_cmp_constant_return_types_empty(self, opname): result = getattr(empty, opname)(const).get_dtype_counts() tm.assert_series_equal(result, pd.Series([2], ['bool'])) + @pytest.mark.parametrize('timestamps', [ + [pd.Timestamp('2012-01-01 13:00:00+00:00')] * 2, + [pd.Timestamp('2012-01-01 13:00:00')] * 2]) + def test_tz_aware_scalar_comparison(self, timestamps): + # Test for issue #15966 + df = pd.DataFrame({'test': timestamps}) + expected = pd.DataFrame({'test': [False, False]}) + tm.assert_frame_equal(df == -1, expected) + # ------------------------------------------------------------------- # Arithmetic +class TestFrameMulDiv(object): + """Tests for DataFrame multiplication and division""" + # ------------------------------------------------------------------ + # Mod By Zero + + def test_df_mod_zero_df(self): + # GH#3590, modulo as ints + df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + + # this is technically wrong, as the integer portion is coerced to float + # ### + first = pd.Series([0, 0, 0, 0], dtype='float64') + second = pd.Series([np.nan, np.nan, np.nan, 0]) + expected = pd.DataFrame({'first': first, 'second': second}) + result = df % df + tm.assert_frame_equal(result, expected) + + def test_df_mod_zero_array(self): + # GH#3590, modulo as ints + df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + + # this is technically wrong, as the integer portion is coerced to float + # ### + first = pd.Series([0, 0, 0, 0], dtype='float64') + second = pd.Series([np.nan, np.nan, np.nan, 0]) + expected = pd.DataFrame({'first': first, 'second': second}) + + # numpy has a slightly different (wrong) treatment + with np.errstate(all='ignore'): + arr = df.values % df.values + result2 = pd.DataFrame(arr, index=df.index, + columns=df.columns, dtype='float64') + result2.iloc[0:3, 1] = np.nan + tm.assert_frame_equal(result2, expected) + + def test_df_mod_zero_int(self): + # GH#3590, modulo as ints + df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + + result = df % 0 + expected = pd.DataFrame(np.nan, index=df.index, columns=df.columns) + tm.assert_frame_equal(result, expected) + + # numpy has a slightly different (wrong) treatment + with np.errstate(all='ignore'): + arr = df.values.astype('float64') % 0 + result2 = pd.DataFrame(arr, index=df.index, columns=df.columns) + tm.assert_frame_equal(result2, expected) + + def test_df_mod_zero_series_does_not_commute(self): + # GH#3590, modulo as ints + # not commutative with series + df = pd.DataFrame(np.random.randn(10, 5)) + ser = df[0] + res = ser % df + res2 = df % ser + assert not res.fillna(0).equals(res2.fillna(0)) 
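# A minimal, self-contained sketch of the zero-division semantics the
# surrounding tests (and the v0.23.0 whatsnew entry earlier in this series)
# describe: x % 0 and 0 / 0 become NaN, while nonzero / 0 becomes +/-inf by
# sign.  This assumes a pandas build that already includes the change; the
# frame below mirrors the one used in the tests, nothing else is new.
import numpy as np
import pandas as pd

df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]})

mod_result = df % 0   # modulo by zero -> NaN everywhere
div_result = df / 0   # division by zero -> inf, except 0 / 0 -> NaN

expected_div = pd.DataFrame(np.inf, index=df.index, columns=df.columns)
expected_div.iloc[0:3, 1] = np.nan   # rows where 'second' is 0 give 0 / 0
assert div_result.equals(expected_div)
assert mod_result.isna().all().all()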
+ + # ------------------------------------------------------------------ + # Division By Zero + + def test_df_div_zero_df(self): + # integer div, but deal with the 0's (GH#9144) + df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + result = df / df + + first = pd.Series([1.0, 1.0, 1.0, 1.0]) + second = pd.Series([np.nan, np.nan, np.nan, 1]) + expected = pd.DataFrame({'first': first, 'second': second}) + tm.assert_frame_equal(result, expected) + + def test_df_div_zero_array(self): + # integer div, but deal with the 0's (GH#9144) + df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + + first = pd.Series([1.0, 1.0, 1.0, 1.0]) + second = pd.Series([np.nan, np.nan, np.nan, 1]) + expected = pd.DataFrame({'first': first, 'second': second}) + + with np.errstate(all='ignore'): + arr = df.values.astype('float') / df.values + result = pd.DataFrame(arr, index=df.index, + columns=df.columns) + tm.assert_frame_equal(result, expected) + + def test_df_div_zero_int(self): + # integer div, but deal with the 0's (GH#9144) + df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + + result = df / 0 + expected = pd.DataFrame(np.inf, index=df.index, columns=df.columns) + expected.iloc[0:3, 1] = np.nan + tm.assert_frame_equal(result, expected) + + # numpy has a slightly different (wrong) treatment + with np.errstate(all='ignore'): + arr = df.values.astype('float64') / 0 + result2 = pd.DataFrame(arr, index=df.index, + columns=df.columns) + tm.assert_frame_equal(result2, expected) + + def test_df_div_zero_series_does_not_commute(self): + # integer div, but deal with the 0's (GH#9144) + df = pd.DataFrame(np.random.randn(10, 5)) + ser = df[0] + res = ser / df + res2 = df / ser + assert not res.fillna(0).equals(res2.fillna(0)) + + class TestFrameArithmetic(object): @pytest.mark.xfail(reason='GH#7996 datetime64 units not converted to nano') diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index bdccbec6111d3..bf895be8bc813 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -203,76 +203,6 @@ def test_timestamp_compare(self): result = right_f(Timestamp('nat'), df) assert_frame_equal(result, expected) - def test_modulo(self): - # GH3590, modulo as ints - p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) - - # this is technically wrong as the integer portion is coerced to float - # ### - expected = DataFrame({'first': Series([0, 0, 0, 0], dtype='float64'), - 'second': Series([np.nan, np.nan, np.nan, 0])}) - result = p % p - assert_frame_equal(result, expected) - - # numpy has a slightly different (wrong) treatement - with np.errstate(all='ignore'): - arr = p.values % p.values - result2 = DataFrame(arr, index=p.index, - columns=p.columns, dtype='float64') - result2.iloc[0:3, 1] = np.nan - assert_frame_equal(result2, expected) - - result = p % 0 - expected = DataFrame(np.nan, index=p.index, columns=p.columns) - assert_frame_equal(result, expected) - - # numpy has a slightly different (wrong) treatement - with np.errstate(all='ignore'): - arr = p.values.astype('float64') % 0 - result2 = DataFrame(arr, index=p.index, columns=p.columns) - assert_frame_equal(result2, expected) - - # not commutative with series - p = DataFrame(np.random.randn(10, 5)) - s = p[0] - res = s % p - res2 = p % s - assert not res.fillna(0).equals(res2.fillna(0)) - - def test_div(self): - - # integer div, but deal with the 0's (GH 9144) - p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) - result = p 
/ p - - expected = DataFrame({'first': Series([1.0, 1.0, 1.0, 1.0]), - 'second': Series([nan, nan, nan, 1])}) - assert_frame_equal(result, expected) - - with np.errstate(all='ignore'): - arr = p.values.astype('float') / p.values - result2 = DataFrame(arr, index=p.index, - columns=p.columns) - assert_frame_equal(result2, expected) - - result = p / 0 - expected = DataFrame(np.inf, index=p.index, columns=p.columns) - expected.iloc[0:3, 1] = nan - assert_frame_equal(result, expected) - - # numpy has a slightly different (wrong) treatement - with np.errstate(all='ignore'): - arr = p.values.astype('float64') / 0 - result2 = DataFrame(arr, index=p.index, - columns=p.columns) - assert_frame_equal(result2, expected) - - p = DataFrame(np.random.randn(10, 5)) - s = p[0] - res = s / p - res2 = p / s - assert not res.fillna(0).equals(res2.fillna(0)) - def test_logical_operators(self): def _check_bin_op(op): diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index e6b47fd69cb05..25dd285e883a0 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -738,12 +738,3 @@ def test_tz_convert_and_localize(self, fn): with assert_raises_regex(ValueError, 'not valid'): df = DataFrame(index=l0) df = getattr(df, fn)('US/Pacific', level=1) - - @pytest.mark.parametrize('timestamps', [ - [Timestamp('2012-01-01 13:00:00+00:00')] * 2, - [Timestamp('2012-01-01 13:00:00')] * 2]) - def test_tz_aware_scalar_comparison(self, timestamps): - # Test for issue #15966 - df = DataFrame({'test': timestamps}) - expected = DataFrame({'test': [False, False]}) - assert_frame_equal(df == -1, expected) From 84522a0f5e033ab631d83808d02cbb07ec8dfec3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 6 Feb 2018 03:27:16 -0800 Subject: [PATCH 053/214] Fix parsing corner case closes #19382 (#19529) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/_libs/tslib.pyx | 30 ++++++++++++++++---- pandas/_libs/tslibs/conversion.pyx | 8 ++++++ pandas/tests/indexes/datetimes/test_tools.py | 16 ++++++++++- pandas/tests/scalar/test_timestamp.py | 8 ++++++ 5 files changed, 56 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index ea56ebad7d782..ca625f492b61f 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -550,6 +550,7 @@ Datetimelike - Bug in :func:`Series.truncate` which raises ``TypeError`` with a monotonic ``PeriodIndex`` (:issue:`17717`) - Bug in :func:`~DataFrame.pct_change` using ``periods`` and ``freq`` returned different length outputs (:issue:`7292`) - Bug in comparison of :class:`DatetimeIndex` against ``None`` or ``datetime.date`` objects raising ``TypeError`` for ``==`` and ``!=`` comparisons instead of all-``False`` and all-``True``, respectively (:issue:`19301`) +- Bug in :class:`Timestamp` and :func:`to_datetime` where a string representing a barely out-of-bounds timestamp would be incorrectly rounded down instead of raising ``OutOfBoundsDatetime`` (:issue:`19382`) - Timezones diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 81df7981096ba..877d7deff6ff4 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -609,20 +609,38 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', value = tz_convert_single(value, tz, 'UTC') iresult[i] = value check_dts_bounds(&dts) + except OutOfBoundsDatetime: + # GH#19382 for just-barely-OutOfBounds falling back to + # dateutil parser will return incorrect result because + # it will 
ignore nanoseconds + if require_iso8601: + if _parse_today_now(val, &iresult[i]): + continue + elif is_coerce: + iresult[i] = NPY_NAT + continue + elif is_raise: + raise ValueError("time data {val} doesn't match " + "format specified" + .format(val=val)) + return values + elif is_coerce: + iresult[i] = NPY_NAT + continue + raise except ValueError: # if requiring iso8601 strings, skip trying other formats if require_iso8601: if _parse_today_now(val, &iresult[i]): continue - if is_coerce: + elif is_coerce: iresult[i] = NPY_NAT continue elif is_raise: - raise ValueError( - "time data %r doesn't match format " - "specified" % (val,)) - else: - return values + raise ValueError("time data {val} doesn't match " + "format specified" + .format(val=val)) + return values try: py_dt = parse_datetime_string(val, dayfirst=dayfirst, diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index a32bfc1f6836c..4f1a053da6f1d 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -26,6 +26,7 @@ from np_datetime cimport (check_dts_bounds, dt64_to_dtstruct, dtstruct_to_dt64, get_datetime64_unit, get_datetime64_value, pydatetime_to_dt64) +from np_datetime import OutOfBoundsDatetime from util cimport (is_string_object, is_datetime64_object, @@ -472,6 +473,13 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, ts = tz_localize_to_utc(np.array([ts], dtype='i8'), tz, ambiguous='raise', errors='raise')[0] + + except OutOfBoundsDatetime: + # GH#19382 for just-barely-OutOfBounds falling back to dateutil + # parser will return incorrect result because it will ignore + # nanoseconds + raise + except ValueError: try: ts = parse_datetime_string(ts, dayfirst=dayfirst, diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 44f3c21d23e62..f8b1f68ba33ce 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -17,6 +17,7 @@ from pandas._libs.tslibs import parsing from pandas.core.tools import datetimes as tools +from pandas.errors import OutOfBoundsDatetime from pandas.compat import lmap from pandas.compat.numpy import np_array_datetime64_compat from pandas.core.dtypes.common import is_datetime64_ns_dtype @@ -783,7 +784,6 @@ def test_dataframe_dtypes(self, cache): class TestToDatetimeMisc(object): - @pytest.mark.parametrize('cache', [True, False]) def test_to_datetime_iso8601(self, cache): result = to_datetime(["2012-01-01 00:00:00"], cache=cache) @@ -1596,6 +1596,20 @@ def test_coerce_of_invalid_datetimes(self): ) ) + def test_to_datetime_barely_out_of_bounds(self): + # GH#19529 + # GH#19382 close enough to bounds that dropping nanos would result + # in an in-bounds datetime + arr = np.array(['2262-04-11 23:47:16.854775808'], dtype=object) + + with pytest.raises(OutOfBoundsDatetime): + to_datetime(arr) + + with pytest.raises(OutOfBoundsDatetime): + # Essentially the same as above, but more directly calling + # the relevant function + tslib.array_to_datetime(arr) + def test_normalize_date(): value = date(2012, 9, 7) diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py index 301f6da140866..7695c94409232 100644 --- a/pandas/tests/scalar/test_timestamp.py +++ b/pandas/tests/scalar/test_timestamp.py @@ -18,6 +18,7 @@ from pandas._libs.tslibs import conversion from pandas._libs.tslibs.timezones import get_timezone, dateutil_gettz as gettz +from pandas.errors import OutOfBoundsDatetime from 
pandas.compat import long, PY3 from pandas.compat.numpy import np_datetime64_compat from pandas import Timestamp, Period, Timedelta @@ -410,6 +411,13 @@ def test_out_of_bounds_string(self): with pytest.raises(ValueError): Timestamp('2263-01-01') + def test_barely_out_of_bounds(self): + # GH#19529 + # GH#19382 close enough to bounds that dropping nanos would result + # in an in-bounds datetime + with pytest.raises(OutOfBoundsDatetime): + Timestamp('2262-04-11 23:47:16.854775808') + def test_bounds_with_different_units(self): out_of_bounds_dates = ('1677-09-21', '2262-04-12') From 54f1b3eca094e0b98d6d2b93854f9c937394109d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 6 Feb 2018 03:34:32 -0800 Subject: [PATCH 054/214] Collect Series timezone tests (#19541) --- pandas/tests/series/test_timezones.py | 293 +++++++++++++++++++++++++ pandas/tests/tseries/test_timezones.py | 258 +--------------------- 2 files changed, 296 insertions(+), 255 deletions(-) create mode 100644 pandas/tests/series/test_timezones.py diff --git a/pandas/tests/series/test_timezones.py b/pandas/tests/series/test_timezones.py new file mode 100644 index 0000000000000..2e15c964e4e93 --- /dev/null +++ b/pandas/tests/series/test_timezones.py @@ -0,0 +1,293 @@ +# -*- coding: utf-8 -*- +""" +Tests for Series timezone-related methods +""" +from datetime import datetime + +import pytest +import pytz +import numpy as np +from dateutil.tz import tzoffset + +import pandas.util.testing as tm +from pandas._libs import tslib +from pandas._libs.tslibs import timezones +from pandas.compat import lrange +from pandas.core.indexes.datetimes import date_range +from pandas import Series, Timestamp, DatetimeIndex, Index + + +class TestSeriesTimezones(object): + # ----------------------------------------------------------------- + # Series.tz_localize + def test_series_tz_localize(self): + + rng = date_range('1/1/2011', periods=100, freq='H') + ts = Series(1, index=rng) + + result = ts.tz_localize('utc') + assert result.index.tz.zone == 'UTC' + + # Can't localize if already tz-aware + rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + ts = Series(1, index=rng) + tm.assert_raises_regex(TypeError, 'Already tz-aware', + ts.tz_localize, 'US/Eastern') + + def test_series_tz_localize_ambiguous_bool(self): + # make sure that we are correctly accepting bool values as ambiguous + + # GH#14402 + ts = Timestamp('2015-11-01 01:00:03') + expected0 = Timestamp('2015-11-01 01:00:03-0500', tz='US/Central') + expected1 = Timestamp('2015-11-01 01:00:03-0600', tz='US/Central') + + ser = Series([ts]) + expected0 = Series([expected0]) + expected1 = Series([expected1]) + + with pytest.raises(pytz.AmbiguousTimeError): + ser.dt.tz_localize('US/Central') + + result = ser.dt.tz_localize('US/Central', ambiguous=True) + tm.assert_series_equal(result, expected0) + + result = ser.dt.tz_localize('US/Central', ambiguous=[True]) + tm.assert_series_equal(result, expected0) + + result = ser.dt.tz_localize('US/Central', ambiguous=False) + tm.assert_series_equal(result, expected1) + + result = ser.dt.tz_localize('US/Central', ambiguous=[False]) + tm.assert_series_equal(result, expected1) + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_series_tz_localize_empty(self, tzstr): + # GH#2248 + ser = Series() + + ser2 = ser.tz_localize('utc') + assert ser2.index.tz == pytz.utc + + ser2 = ser.tz_localize(tzstr) + timezones.tz_compare(ser2.index.tz, timezones.maybe_get_tz(tzstr)) + + # 
----------------------------------------------------------------- + # Series.tz_convert + + def test_series_tz_convert(self): + rng = date_range('1/1/2011', periods=200, freq='D', tz='US/Eastern') + ts = Series(1, index=rng) + + result = ts.tz_convert('Europe/Berlin') + assert result.index.tz.zone == 'Europe/Berlin' + + # can't convert tz-naive + rng = date_range('1/1/2011', periods=200, freq='D') + ts = Series(1, index=rng) + tm.assert_raises_regex(TypeError, "Cannot convert tz-naive", + ts.tz_convert, 'US/Eastern') + + # ----------------------------------------------------------------- + # Series.append + + def test_series_append_aware(self): + rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', + tz='US/Eastern') + rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', + tz='US/Eastern') + ser1 = Series([1], index=rng1) + ser2 = Series([2], index=rng2) + ts_result = ser1.append(ser2) + + exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], + tz='US/Eastern') + exp = Series([1, 2], index=exp_index) + tm.assert_series_equal(ts_result, exp) + assert ts_result.index.tz == rng1.tz + + rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='UTC') + rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='UTC') + ser1 = Series([1], index=rng1) + ser2 = Series([2], index=rng2) + ts_result = ser1.append(ser2) + + exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], + tz='UTC') + exp = Series([1, 2], index=exp_index) + tm.assert_series_equal(ts_result, exp) + utc = rng1.tz + assert utc == ts_result.index.tz + + # GH#7795 + # different tz coerces to object dtype, not UTC + rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', + tz='US/Eastern') + rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', + tz='US/Central') + ser1 = Series([1], index=rng1) + ser2 = Series([2], index=rng2) + ts_result = ser1.append(ser2) + exp_index = Index([Timestamp('1/1/2011 01:00', tz='US/Eastern'), + Timestamp('1/1/2011 02:00', tz='US/Central')]) + exp = Series([1, 2], index=exp_index) + tm.assert_series_equal(ts_result, exp) + + def test_series_append_aware_naive(self): + rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') + rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', + tz='US/Eastern') + ser1 = Series(np.random.randn(len(rng1)), index=rng1) + ser2 = Series(np.random.randn(len(rng2)), index=rng2) + ts_result = ser1.append(ser2) + + expected = ser1.index.astype(object).append(ser2.index.astype(object)) + assert ts_result.index.equals(expected) + + # mixed + rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') + rng2 = lrange(100) + ser1 = Series(np.random.randn(len(rng1)), index=rng1) + ser2 = Series(np.random.randn(len(rng2)), index=rng2) + ts_result = ser1.append(ser2) + + expected = ser1.index.astype(object).append(ser2.index) + assert ts_result.index.equals(expected) + + def test_series_append_dst(self): + rng1 = date_range('1/1/2016 01:00', periods=3, freq='H', + tz='US/Eastern') + rng2 = date_range('8/1/2016 01:00', periods=3, freq='H', + tz='US/Eastern') + ser1 = Series([1, 2, 3], index=rng1) + ser2 = Series([10, 11, 12], index=rng2) + ts_result = ser1.append(ser2) + + exp_index = DatetimeIndex(['2016-01-01 01:00', '2016-01-01 02:00', + '2016-01-01 03:00', '2016-08-01 01:00', + '2016-08-01 02:00', '2016-08-01 03:00'], + tz='US/Eastern') + exp = Series([1, 2, 3, 10, 11, 12], index=exp_index) + tm.assert_series_equal(ts_result, exp) + assert ts_result.index.tz == rng1.tz + + # 
----------------------------------------------------------------- + + def test_dateutil_tzoffset_support(self): + values = [188.5, 328.25] + tzinfo = tzoffset(None, 7200) + index = [datetime(2012, 5, 11, 11, tzinfo=tzinfo), + datetime(2012, 5, 11, 12, tzinfo=tzinfo)] + series = Series(data=values, index=index) + + assert series.index.tz == tzinfo + + # it works! #2443 + repr(series.index[0]) + + @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern']) + def test_tz_aware_asfreq(self, tz): + dr = date_range('2011-12-01', '2012-07-20', freq='D', tz=tz) + + ser = Series(np.random.randn(len(dr)), index=dr) + + # it works! + ser.asfreq('T') + + @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern']) + def test_string_index_alias_tz_aware(self, tz): + rng = date_range('1/1/2000', periods=10, tz=tz) + ser = Series(np.random.randn(len(rng)), index=rng) + + result = ser['1/3/2000'] + tm.assert_almost_equal(result, ser[2]) + + # TODO: De-duplicate with test below + def test_series_add_tz_mismatch_converts_to_utc_duplicate(self): + rng = date_range('1/1/2011', periods=10, freq='H', tz='US/Eastern') + ser = Series(np.random.randn(len(rng)), index=rng) + + ts_moscow = ser.tz_convert('Europe/Moscow') + + result = ser + ts_moscow + assert result.index.tz is pytz.utc + + result = ts_moscow + ser + assert result.index.tz is pytz.utc + + def test_series_add_tz_mismatch_converts_to_utc(self): + rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + + perm = np.random.permutation(100)[:90] + ser1 = Series(np.random.randn(90), + index=rng.take(perm).tz_convert('US/Eastern')) + + perm = np.random.permutation(100)[:90] + ser2 = Series(np.random.randn(90), + index=rng.take(perm).tz_convert('Europe/Berlin')) + + result = ser1 + ser2 + + uts1 = ser1.tz_convert('utc') + uts2 = ser2.tz_convert('utc') + expected = uts1 + uts2 + + assert result.index.tz == pytz.UTC + tm.assert_series_equal(result, expected) + + def test_series_add_aware_naive_raises(self): + rng = date_range('1/1/2011', periods=10, freq='H') + ser = Series(np.random.randn(len(rng)), index=rng) + + ser_utc = ser.tz_localize('utc') + + with pytest.raises(Exception): + ser + ser_utc + + with pytest.raises(Exception): + ser_utc + ser + + def test_series_align_aware(self): + idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern') + ser = Series(np.random.randn(len(idx1)), index=idx1) + ser_central = ser.tz_convert('US/Central') + # # different timezones convert to UTC + + new1, new2 = ser.align(ser_central) + assert new1.index.tz == pytz.UTC + assert new2.index.tz == pytz.UTC + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_localized_at_time_between_time(self, tzstr): + from datetime import time + tz = timezones.maybe_get_tz(tzstr) + + rng = date_range('4/16/2012', '5/1/2012', freq='H') + ts = Series(np.random.randn(len(rng)), index=rng) + + ts_local = ts.tz_localize(tzstr) + + result = ts_local.at_time(time(10, 0)) + expected = ts.at_time(time(10, 0)).tz_localize(tzstr) + tm.assert_series_equal(result, expected) + assert timezones.tz_compare(result.index.tz, tz) + + t1, t2 = time(10, 0), time(11, 0) + result = ts_local.between_time(t1, t2) + expected = ts.between_time(t1, t2).tz_localize(tzstr) + tm.assert_series_equal(result, expected) + assert timezones.tz_compare(result.index.tz, tz) + + @pytest.mark.parametrize('tzstr', ['Europe/Berlin', + 'dateutil/Europe/Berlin']) + def test_getitem_pydatetime_tz(self, tzstr): + tz = timezones.maybe_get_tz(tzstr) + + index = 
date_range(start='2012-12-24 16:00', end='2012-12-24 18:00', + freq='H', tz=tzstr) + ts = Series(index=index, data=index.hour) + time_pandas = Timestamp('2012-12-24 17:00', tz=tzstr) + + dt = datetime(2012, 12, 24, 17, 0) + time_datetime = tslib._localize_pydatetime(dt, tz) + assert ts[time_pandas] == ts[time_datetime] diff --git a/pandas/tests/tseries/test_timezones.py b/pandas/tests/tseries/test_timezones.py index 2630984a70807..8f46e0a58580e 100644 --- a/pandas/tests/tseries/test_timezones.py +++ b/pandas/tests/tseries/test_timezones.py @@ -8,7 +8,7 @@ from dateutil.parser import parse from pytz import NonExistentTimeError from distutils.version import LooseVersion -from dateutil.tz import tzlocal, tzoffset +from dateutil.tz import tzlocal from datetime import datetime, timedelta, tzinfo import pandas.util.testing as tm @@ -18,9 +18,9 @@ from pandas.core.indexes.datetimes import bdate_range, date_range from pandas._libs import tslib from pandas._libs.tslibs import timezones, conversion -from pandas import (Index, Series, isna, Timestamp, NaT, +from pandas import (Index, isna, Timestamp, NaT, DatetimeIndex, to_datetime) -from pandas.util.testing import assert_series_equal, set_timezone +from pandas.util.testing import set_timezone class FixedOffset(tzinfo): @@ -142,17 +142,6 @@ def test_tz_localize_dti(self): pytest.raises(pytz.NonExistentTimeError, dti.tz_localize, self.tzstr('US/Eastern')) - def test_tz_localize_empty_series(self): - # #2248 - - ts = Series() - - ts2 = ts.tz_localize('utc') - assert ts2.index.tz == pytz.utc - - ts2 = ts.tz_localize(self.tzstr('US/Eastern')) - assert self.cmptz(ts2.index.tz, self.tz('US/Eastern')) - def test_create_with_tz(self): stamp = Timestamp('3/11/2012 05:00', tz=self.tzstr('US/Eastern')) assert stamp.hour == 5 @@ -455,34 +444,6 @@ def test_ambiguous_nat(self): # right is datetime64[ns, tzfile('/usr/share/zoneinfo/US/Eastern')] tm.assert_numpy_array_equal(di_test.values, localized.values) - def test_ambiguous_bool(self): - # make sure that we are correctly accepting bool values as ambiguous - - # gh-14402 - t = Timestamp('2015-11-01 01:00:03') - expected0 = Timestamp('2015-11-01 01:00:03-0500', tz='US/Central') - expected1 = Timestamp('2015-11-01 01:00:03-0600', tz='US/Central') - - s = Series([t]) - expected0 = Series([expected0]) - expected1 = Series([expected1]) - - def f(): - s.dt.tz_localize('US/Central') - pytest.raises(pytz.AmbiguousTimeError, f) - - result = s.dt.tz_localize('US/Central', ambiguous=True) - assert_series_equal(result, expected0) - - result = s.dt.tz_localize('US/Central', ambiguous=[True]) - assert_series_equal(result, expected0) - - result = s.dt.tz_localize('US/Central', ambiguous=False) - assert_series_equal(result, expected1) - - result = s.dt.tz_localize('US/Central', ambiguous=[False]) - assert_series_equal(result, expected1) - def test_nonexistent_raise_coerce(self): # See issue 13057 from pytz.exceptions import NonExistentTimeError @@ -565,34 +526,6 @@ def test_index_astype_asobject_tzinfos(self): assert x == exval assert x.tzinfo == exval.tzinfo - def test_localized_at_time_between_time(self): - from datetime import time - - rng = date_range('4/16/2012', '5/1/2012', freq='H') - ts = Series(np.random.randn(len(rng)), index=rng) - - ts_local = ts.tz_localize(self.tzstr('US/Eastern')) - - result = ts_local.at_time(time(10, 0)) - expected = ts.at_time(time(10, 0)).tz_localize(self.tzstr( - 'US/Eastern')) - assert_series_equal(result, expected) - assert self.cmptz(result.index.tz, self.tz('US/Eastern')) - - t1, t2 = 
time(10, 0), time(11, 0) - result = ts_local.between_time(t1, t2) - expected = ts.between_time(t1, - t2).tz_localize(self.tzstr('US/Eastern')) - assert_series_equal(result, expected) - assert self.cmptz(result.index.tz, self.tz('US/Eastern')) - - def test_string_index_alias_tz_aware(self): - rng = date_range('1/1/2000', periods=10, tz=self.tzstr('US/Eastern')) - ts = Series(np.random.randn(len(rng)), index=rng) - - result = ts['1/3/2000'] - tm.assert_almost_equal(result, ts[2]) - def test_fixed_offset(self): dates = [datetime(2000, 1, 1, tzinfo=fixed_off), datetime(2000, 1, 2, tzinfo=fixed_off), @@ -668,15 +601,6 @@ def test_shift_localized(self): result = dr_tz.shift(1, '10T') assert result.tz == dr_tz.tz - def test_tz_aware_asfreq(self): - dr = date_range('2011-12-01', '2012-07-20', freq='D', - tz=self.tzstr('US/Eastern')) - - s = Series(np.random.randn(len(dr)), index=dr) - - # it works! - s.asfreq('T') - def test_static_tzinfo(self): # it works! index = DatetimeIndex([datetime(2012, 1, 1)], tz=self.tzstr('EST')) @@ -709,28 +633,6 @@ def test_convert_datetime_list(self): assert dr.tz == dr2.tz assert dr2.name == 'foo' - def test_dateutil_tzoffset_support(self): - values = [188.5, 328.25] - tzinfo = tzoffset(None, 7200) - index = [datetime(2012, 5, 11, 11, tzinfo=tzinfo), - datetime(2012, 5, 11, 12, tzinfo=tzinfo)] - series = Series(data=values, index=index) - - assert series.index.tz == tzinfo - - # it works! #2443 - repr(series.index[0]) - - def test_getitem_pydatetime_tz(self): - index = date_range(start='2012-12-24 16:00', end='2012-12-24 18:00', - freq='H', tz=self.tzstr('Europe/Berlin')) - ts = Series(index=index, data=index.hour) - time_pandas = Timestamp('2012-12-24 17:00', - tz=self.tzstr('Europe/Berlin')) - time_datetime = self.localize( - self.tz('Europe/Berlin'), datetime(2012, 12, 24, 17, 0)) - assert ts[time_pandas] == ts[time_datetime] - def test_index_drop_dont_lose_tz(self): # #2621 ind = date_range("2012-12-01", periods=10, tz="utc") @@ -1056,33 +958,6 @@ def test_tz_localize_roundtrip(self): tm.assert_index_equal(reset, idx) assert reset.tzinfo is None - def test_series_tz_localize(self): - - rng = date_range('1/1/2011', periods=100, freq='H') - ts = Series(1, index=rng) - - result = ts.tz_localize('utc') - assert result.index.tz.zone == 'UTC' - - # Can't localize if already tz-aware - rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') - ts = Series(1, index=rng) - tm.assert_raises_regex(TypeError, 'Already tz-aware', - ts.tz_localize, 'US/Eastern') - - def test_series_tz_convert(self): - rng = date_range('1/1/2011', periods=200, freq='D', tz='US/Eastern') - ts = Series(1, index=rng) - - result = ts.tz_convert('Europe/Berlin') - assert result.index.tz.zone == 'Europe/Berlin' - - # can't convert tz-naive - rng = date_range('1/1/2011', periods=200, freq='D') - ts = Series(1, index=rng) - tm.assert_raises_regex(TypeError, "Cannot convert tz-naive", - ts.tz_convert, 'US/Eastern') - def test_tz_convert_roundtrip(self): for tz in self.timezones: idx1 = date_range(start='2014-01-01', end='2014-12-31', freq='M', @@ -1127,12 +1002,6 @@ def test_join_utc_convert(self): def test_join_aware(self): rng = date_range('1/1/2011', periods=10, freq='H') - ts = Series(np.random.randn(len(rng)), index=rng) - - ts_utc = ts.tz_localize('utc') - - pytest.raises(Exception, ts.__add__, ts_utc) - pytest.raises(Exception, ts_utc.__add__, ts) # non-overlapping rng = date_range("2012-11-15 00:00:00", periods=6, freq="H", @@ -1144,127 +1013,6 @@ def test_join_aware(self): result = 
rng.union(rng2) assert result.tz.zone == 'UTC' - def test_series_align_aware(self): - idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern') - ser = Series(np.random.randn(len(idx1)), index=idx1) - ser_central = ser.tz_convert('US/Central') - # # different timezones convert to UTC - - new1, new2 = ser.align(ser_central) - assert new1.index.tz == pytz.UTC - assert new2.index.tz == pytz.UTC - - def test_append_aware(self): - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', - tz='US/Eastern') - rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', - tz='US/Eastern') - ts1 = Series([1], index=rng1) - ts2 = Series([2], index=rng2) - ts_result = ts1.append(ts2) - - exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], - tz='US/Eastern') - exp = Series([1, 2], index=exp_index) - assert_series_equal(ts_result, exp) - assert ts_result.index.tz == rng1.tz - - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='UTC') - rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='UTC') - ts1 = Series([1], index=rng1) - ts2 = Series([2], index=rng2) - ts_result = ts1.append(ts2) - - exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], - tz='UTC') - exp = Series([1, 2], index=exp_index) - assert_series_equal(ts_result, exp) - utc = rng1.tz - assert utc == ts_result.index.tz - - # GH 7795 - # different tz coerces to object dtype, not UTC - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', - tz='US/Eastern') - rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', - tz='US/Central') - ts1 = Series([1], index=rng1) - ts2 = Series([2], index=rng2) - ts_result = ts1.append(ts2) - exp_index = Index([Timestamp('1/1/2011 01:00', tz='US/Eastern'), - Timestamp('1/1/2011 02:00', tz='US/Central')]) - exp = Series([1, 2], index=exp_index) - assert_series_equal(ts_result, exp) - - def test_append_dst(self): - rng1 = date_range('1/1/2016 01:00', periods=3, freq='H', - tz='US/Eastern') - rng2 = date_range('8/1/2016 01:00', periods=3, freq='H', - tz='US/Eastern') - ts1 = Series([1, 2, 3], index=rng1) - ts2 = Series([10, 11, 12], index=rng2) - ts_result = ts1.append(ts2) - - exp_index = DatetimeIndex(['2016-01-01 01:00', '2016-01-01 02:00', - '2016-01-01 03:00', '2016-08-01 01:00', - '2016-08-01 02:00', '2016-08-01 03:00'], - tz='US/Eastern') - exp = Series([1, 2, 3, 10, 11, 12], index=exp_index) - assert_series_equal(ts_result, exp) - assert ts_result.index.tz == rng1.tz - - def test_append_aware_naive(self): - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') - rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', - tz='US/Eastern') - ts1 = Series(np.random.randn(len(rng1)), index=rng1) - ts2 = Series(np.random.randn(len(rng2)), index=rng2) - ts_result = ts1.append(ts2) - - assert ts_result.index.equals(ts1.index.astype(object).append( - ts2.index.astype(object))) - - # mixed - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') - rng2 = lrange(100) - ts1 = Series(np.random.randn(len(rng1)), index=rng1) - ts2 = Series(np.random.randn(len(rng2)), index=rng2) - ts_result = ts1.append(ts2) - assert ts_result.index.equals(ts1.index.astype(object).append( - ts2.index)) - - def test_series_add_tz_mismatch_converts_to_utc(self): - rng = date_range('1/1/2011', periods=10, freq='H', tz='US/Eastern') - ts = Series(np.random.randn(len(rng)), index=rng) - - ts_moscow = ts.tz_convert('Europe/Moscow') - - result = ts + ts_moscow - assert result.index.tz is pytz.utc - - result = ts_moscow + ts - assert result.index.tz is pytz.utc - - def 
test_arith_utc_convert(self): - rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') - - perm = np.random.permutation(100)[:90] - ts1 = Series(np.random.randn(90), - index=rng.take(perm).tz_convert('US/Eastern')) - - perm = np.random.permutation(100)[:90] - ts2 = Series(np.random.randn(90), - index=rng.take(perm).tz_convert('Europe/Berlin')) - - result = ts1 + ts2 - - uts1 = ts1.tz_convert('utc') - uts2 = ts2.tz_convert('utc') - expected = uts1 + uts2 - - assert result.index.tz == pytz.UTC - assert_series_equal(result, expected) - def test_intersection(self): rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') From d5eead6bdd745a98d8828e93a9e2718008af4d0a Mon Sep 17 00:00:00 2001 From: Sangwoong Yoon Date: Tue, 6 Feb 2018 23:16:13 +0900 Subject: [PATCH 055/214] DOC/ERR: better error message on no common merge keys (#19427) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/frame.py | 2 +- pandas/core/reshape/merge.py | 7 ++++++- pandas/tests/reshape/merge/test_merge.py | 8 ++++++++ 4 files changed, 16 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index ca625f492b61f..54dba831f7216 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -667,6 +667,7 @@ Reshaping - Bug in :func:`DataFrame.stack`, :func:`DataFrame.unstack`, :func:`Series.unstack` which were not returning subclasses (:issue:`15563`) - Bug in timezone comparisons, manifesting as a conversion of the index to UTC in ``.concat()`` (:issue:`18523`) - Bug in :func:`concat` when concatting sparse and dense series it returns only a ``SparseDataFrame``. Should be a ``DataFrame``. (:issue:`18914`, :issue:`18686`, and :issue:`16874`) +- Improved error message for :func:`DataFrame.merge` when there is no common merge key (:issue:`19427`) - diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 201d8ba427c8a..3d1983f65d70d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -233,7 +233,7 @@ -------- merge_ordered merge_asof - +DataFrame.join """ # ----------------------------------------------------------------------- diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 3ec78ce52c6e5..9dbb327e3d956 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1021,7 +1021,12 @@ def _validate_specification(self): common_cols = self.left.columns.intersection( self.right.columns) if len(common_cols) == 0: - raise MergeError('No common columns to perform merge on') + raise MergeError( + 'No common columns to perform merge on. ' + 'Merge options: left_on={lon}, right_on={ron}, ' + 'left_index={lidx}, right_index={ridx}' + .format(lon=self.left_on, ron=self.right_on, + lidx=self.left_index, ridx=self.right_index)) if not common_cols.is_unique: raise MergeError("Data columns not unique: {common!r}" .format(common=common_cols)) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f63c206c0c407..32f83ab972be5 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -270,6 +270,14 @@ def test_no_overlap_more_informative_error(self): df2 = DataFrame({'y': ['b', 'c']}, index=[dt, dt]) pytest.raises(MergeError, merge, df1, df2) + msg = ('No common columns to perform merge on. 
' + 'Merge options: left_on={lon}, right_on={ron}, ' + 'left_index={lidx}, right_index={ridx}' + .format(lon=None, ron=None, lidx=False, ridx=False)) + + with tm.assert_raises_regex(MergeError, msg): + merge(df1, df2) + def test_merge_non_unique_indexes(self): dt = datetime(2012, 5, 1) From 93c86aa13e1b7816c762b4ff372aef80a7830af8 Mon Sep 17 00:00:00 2001 From: miker985 Date: Tue, 6 Feb 2018 06:17:14 -0800 Subject: [PATCH 056/214] BUGFIX - AttributeError raised in StataReader.value_labels() (#19510) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/io/stata.py | 8 +++++--- pandas/tests/io/test_stata.py | 10 ++++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 54dba831f7216..b5bf7ccbda0b6 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -627,6 +627,7 @@ I/O - Bug in :func:`DataFrame.to_parquet` where an exception was raised if the write destination is S3 (:issue:`19134`) - :class:`Interval` now supported in :func:`DataFrame.to_excel` for all Excel file types (:issue:`19242`) - :class:`Timedelta` now supported in :func:`DataFrame.to_excel` for xls file type (:issue:`19242`, :issue:`9155`) +- Bug in :meth:`pandas.io.stata.StataReader.value_labels` raising an ``AttributeError`` when called on very old files. Now returns an empty dict (:issue:`19417`) Plotting ^^^^^^^^ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index adbff06364dbe..ee6975ea1d938 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1341,12 +1341,14 @@ def _null_terminate(self, s): return s def _read_value_labels(self): - if self.format_version <= 108: - # Value labels are not supported in version 108 and earlier. - return if self._value_labels_read: # Don't read twice return + if self.format_version <= 108: + # Value labels are not supported in version 108 and earlier. + self._value_labels_read = True + self.value_label_dict = dict() + return if self.format_version >= 117: self.path_or_buf.seek(self.seek_value_labels) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 89d76061329a3..4e259d0994bdb 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -589,6 +589,16 @@ def test_105(self): df0['psch_dis'] = df0["psch_dis"].astype(np.float32) tm.assert_frame_equal(df.head(3), df0) + def test_value_labels_old_format(self): + # GH 19417 + # + # Test that value_labels() returns an empty dict if the file format + # predates supporting value labels. 
+ dpath = os.path.join(self.dirpath, 'S4_EDUC1.dta') + reader = StataReader(dpath) + assert reader.value_labels() == {} + reader.close() + def test_date_export_formats(self): columns = ['tc', 'td', 'tw', 'tm', 'tq', 'th', 'ty'] conversions = {c: c for c in columns} From 04b1f039bd63497d8e4fc483baa79dcbe6219abd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 6 Feb 2018 15:41:47 -0800 Subject: [PATCH 057/214] separate DatetimeIndex timezone tests (#19545) --- .../indexes/datetimes/test_arithmetic.py | 26 + .../tests/indexes/datetimes/test_timezones.py | 1018 +++++++++++++++++ pandas/tests/tseries/test_timezones.py | 1007 +--------------- 3 files changed, 1047 insertions(+), 1004 deletions(-) create mode 100644 pandas/tests/indexes/datetimes/test_timezones.py diff --git a/pandas/tests/indexes/datetimes/test_arithmetic.py b/pandas/tests/indexes/datetimes/test_arithmetic.py index 09a6b35a0ff0e..f6f8eccf4e30c 100644 --- a/pandas/tests/indexes/datetimes/test_arithmetic.py +++ b/pandas/tests/indexes/datetimes/test_arithmetic.py @@ -412,6 +412,14 @@ def test_dti_shift_no_freq(self): with pytest.raises(NullFrequencyError): dti.shift(2) + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_dti_shift_localized(self, tzstr): + dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI') + dr_tz = dr.tz_localize(tzstr) + + result = dr_tz.shift(1, '10T') + assert result.tz == dr_tz.tz + # ------------------------------------------------------------- # Binary operations DatetimeIndex and timedelta-like @@ -767,6 +775,24 @@ def test_dti_with_offset_series(self, tz, names): res3 = dti - other tm.assert_series_equal(res3, expected_sub) + def test_dti_add_offset_tzaware(self): + dates = date_range('2012-11-01', periods=3, tz='US/Pacific') + offset = dates + pd.offsets.Hour(5) + assert dates[0] + pd.offsets.Hour(5) == offset[0] + + # GH#6818 + for tz in ['UTC', 'US/Pacific', 'Asia/Tokyo']: + dates = date_range('2010-11-01 00:00', periods=3, tz=tz, freq='H') + expected = DatetimeIndex(['2010-11-01 05:00', '2010-11-01 06:00', + '2010-11-01 07:00'], freq='H', tz=tz) + + offset = dates + pd.offsets.Hour(5) + tm.assert_index_equal(offset, expected) + offset = dates + np.timedelta64(5, 'h') + tm.assert_index_equal(offset, expected) + offset = dates + timedelta(hours=5) + tm.assert_index_equal(offset, expected) + @pytest.mark.parametrize('klass,assert_func', [ (Series, tm.assert_series_equal), diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py new file mode 100644 index 0000000000000..075d239df5f7a --- /dev/null +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -0,0 +1,1018 @@ +# -*- coding: utf-8 -*- +""" +Tests for DatetimeIndex timezone-related methods +""" +from datetime import datetime, timedelta, tzinfo +from distutils.version import LooseVersion + +import pytest +import pytz +import dateutil +from dateutil.tz import gettz, tzlocal +import numpy as np + +import pandas.util.testing as tm +import pandas.util._test_decorators as td + +import pandas as pd +from pandas._libs import tslib +from pandas._libs.tslibs import timezones +from pandas.compat import lrange, zip +from pandas import (DatetimeIndex, date_range, bdate_range, + Timestamp, isna, to_datetime, Index) + + +class FixedOffset(tzinfo): + """Fixed offset in minutes east from UTC.""" + + def __init__(self, offset, name): + self.__offset = timedelta(minutes=offset) + self.__name = name + + def utcoffset(self, dt): + return self.__offset + + def tzname(self, 
dt): + return self.__name + + def dst(self, dt): + return timedelta(0) + + +fixed_off = FixedOffset(-420, '-07:00') +fixed_off_no_name = FixedOffset(-330, None) + + +class TestDatetimeIndexTimezones(object): + # ------------------------------------------------------------- + # DatetimeIndex.tz_convert + def test_tz_convert_nat(self): + # GH#5546 + dates = [pd.NaT] + idx = DatetimeIndex(dates) + idx = idx.tz_localize('US/Pacific') + tm.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Pacific')) + idx = idx.tz_convert('US/Eastern') + tm.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Eastern')) + idx = idx.tz_convert('UTC') + tm.assert_index_equal(idx, DatetimeIndex(dates, tz='UTC')) + + dates = ['2010-12-01 00:00', '2010-12-02 00:00', pd.NaT] + idx = DatetimeIndex(dates) + idx = idx.tz_localize('US/Pacific') + tm.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Pacific')) + idx = idx.tz_convert('US/Eastern') + expected = ['2010-12-01 03:00', '2010-12-02 03:00', pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) + + idx = idx + pd.offsets.Hour(5) + expected = ['2010-12-01 08:00', '2010-12-02 08:00', pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) + idx = idx.tz_convert('US/Pacific') + expected = ['2010-12-01 05:00', '2010-12-02 05:00', pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Pacific')) + + idx = idx + np.timedelta64(3, 'h') + expected = ['2010-12-01 08:00', '2010-12-02 08:00', pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Pacific')) + + idx = idx.tz_convert('US/Eastern') + expected = ['2010-12-01 11:00', '2010-12-02 11:00', pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) + + @pytest.mark.parametrize('prefix', ['', 'dateutil/']) + def test_dti_tz_convert_compat_timestamp(self, prefix): + strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] + idx = DatetimeIndex(strdates, tz=prefix + 'US/Eastern') + + conv = idx[0].tz_convert(prefix + 'US/Pacific') + expected = idx.tz_convert(prefix + 'US/Pacific')[0] + + assert conv == expected + + def test_dti_tz_convert_hour_overflow_dst(self): + # Regression test for: + # https://github.com/pandas-dev/pandas/issues/13306 + + # sorted case US/Eastern -> UTC + ts = ['2008-05-12 09:50:00', + '2008-12-12 09:50:35', + '2009-05-12 09:50:32'] + tt = DatetimeIndex(ts).tz_localize('US/Eastern') + ut = tt.tz_convert('UTC') + expected = Index([13, 14, 13]) + tm.assert_index_equal(ut.hour, expected) + + # sorted case UTC -> US/Eastern + ts = ['2008-05-12 13:50:00', + '2008-12-12 14:50:35', + '2009-05-12 13:50:32'] + tt = DatetimeIndex(ts).tz_localize('UTC') + ut = tt.tz_convert('US/Eastern') + expected = Index([9, 9, 9]) + tm.assert_index_equal(ut.hour, expected) + + # unsorted case US/Eastern -> UTC + ts = ['2008-05-12 09:50:00', + '2008-12-12 09:50:35', + '2008-05-12 09:50:32'] + tt = DatetimeIndex(ts).tz_localize('US/Eastern') + ut = tt.tz_convert('UTC') + expected = Index([13, 14, 13]) + tm.assert_index_equal(ut.hour, expected) + + # unsorted case UTC -> US/Eastern + ts = ['2008-05-12 13:50:00', + '2008-12-12 14:50:35', + '2008-05-12 13:50:32'] + tt = DatetimeIndex(ts).tz_localize('UTC') + ut = tt.tz_convert('US/Eastern') + expected = Index([9, 9, 9]) + tm.assert_index_equal(ut.hour, expected) + + @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern']) + def test_dti_tz_convert_hour_overflow_dst_timestamps(self, tz): + # Regression test for GH#13306 + + # sorted case US/Eastern -> UTC + ts = 
[Timestamp('2008-05-12 09:50:00', tz=tz), + Timestamp('2008-12-12 09:50:35', tz=tz), + Timestamp('2009-05-12 09:50:32', tz=tz)] + tt = DatetimeIndex(ts) + ut = tt.tz_convert('UTC') + expected = Index([13, 14, 13]) + tm.assert_index_equal(ut.hour, expected) + + # sorted case UTC -> US/Eastern + ts = [Timestamp('2008-05-12 13:50:00', tz='UTC'), + Timestamp('2008-12-12 14:50:35', tz='UTC'), + Timestamp('2009-05-12 13:50:32', tz='UTC')] + tt = DatetimeIndex(ts) + ut = tt.tz_convert('US/Eastern') + expected = Index([9, 9, 9]) + tm.assert_index_equal(ut.hour, expected) + + # unsorted case US/Eastern -> UTC + ts = [Timestamp('2008-05-12 09:50:00', tz=tz), + Timestamp('2008-12-12 09:50:35', tz=tz), + Timestamp('2008-05-12 09:50:32', tz=tz)] + tt = DatetimeIndex(ts) + ut = tt.tz_convert('UTC') + expected = Index([13, 14, 13]) + tm.assert_index_equal(ut.hour, expected) + + # unsorted case UTC -> US/Eastern + ts = [Timestamp('2008-05-12 13:50:00', tz='UTC'), + Timestamp('2008-12-12 14:50:35', tz='UTC'), + Timestamp('2008-05-12 13:50:32', tz='UTC')] + tt = DatetimeIndex(ts) + ut = tt.tz_convert('US/Eastern') + expected = Index([9, 9, 9]) + tm.assert_index_equal(ut.hour, expected) + + def test_dti_tz_convert_trans_pos_plus_1__bug(self): + # Regression test for tslib.tz_convert(vals, tz1, tz2). + # See https://github.com/pandas-dev/pandas/issues/4496 for details. + for freq, n in [('H', 1), ('T', 60), ('S', 3600)]: + idx = date_range(datetime(2011, 3, 26, 23), + datetime(2011, 3, 27, 1), freq=freq) + idx = idx.tz_localize('UTC') + idx = idx.tz_convert('Europe/Moscow') + + expected = np.repeat(np.array([3, 4, 5]), np.array([n, n, 1])) + tm.assert_index_equal(idx.hour, Index(expected)) + + def test_dti_tz_convert_dst(self): + for freq, n in [('H', 1), ('T', 60), ('S', 3600)]: + # Start DST + idx = date_range('2014-03-08 23:00', '2014-03-09 09:00', freq=freq, + tz='UTC') + idx = idx.tz_convert('US/Eastern') + expected = np.repeat(np.array([18, 19, 20, 21, 22, 23, + 0, 1, 3, 4, 5]), + np.array([n, n, n, n, n, n, n, n, n, n, 1])) + tm.assert_index_equal(idx.hour, Index(expected)) + + idx = date_range('2014-03-08 18:00', '2014-03-09 05:00', freq=freq, + tz='US/Eastern') + idx = idx.tz_convert('UTC') + expected = np.repeat(np.array([23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + np.array([n, n, n, n, n, n, n, n, n, n, 1])) + tm.assert_index_equal(idx.hour, Index(expected)) + + # End DST + idx = date_range('2014-11-01 23:00', '2014-11-02 09:00', freq=freq, + tz='UTC') + idx = idx.tz_convert('US/Eastern') + expected = np.repeat(np.array([19, 20, 21, 22, 23, + 0, 1, 1, 2, 3, 4]), + np.array([n, n, n, n, n, n, n, n, n, n, 1])) + tm.assert_index_equal(idx.hour, Index(expected)) + + idx = date_range('2014-11-01 18:00', '2014-11-02 05:00', freq=freq, + tz='US/Eastern') + idx = idx.tz_convert('UTC') + expected = np.repeat(np.array([22, 23, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10]), + np.array([n, n, n, n, n, n, n, n, n, + n, n, n, 1])) + tm.assert_index_equal(idx.hour, Index(expected)) + + # daily + # Start DST + idx = date_range('2014-03-08 00:00', '2014-03-09 00:00', freq='D', + tz='UTC') + idx = idx.tz_convert('US/Eastern') + tm.assert_index_equal(idx.hour, Index([19, 19])) + + idx = date_range('2014-03-08 00:00', '2014-03-09 00:00', freq='D', + tz='US/Eastern') + idx = idx.tz_convert('UTC') + tm.assert_index_equal(idx.hour, Index([5, 5])) + + # End DST + idx = date_range('2014-11-01 00:00', '2014-11-02 00:00', freq='D', + tz='UTC') + idx = idx.tz_convert('US/Eastern') + tm.assert_index_equal(idx.hour, Index([20, 20])) + 
+ idx = date_range('2014-11-01 00:00', '2014-11-02 000:00', freq='D', + tz='US/Eastern') + idx = idx.tz_convert('UTC') + tm.assert_index_equal(idx.hour, Index([4, 4])) + + @pytest.mark.parametrize('tz', ['UTC', 'Asia/Tokyo', 'US/Eastern', + 'dateutil/US/Pacific']) + def test_tz_convert_roundtrip(self, tz): + idx1 = date_range(start='2014-01-01', end='2014-12-31', freq='M', + tz='UTC') + exp1 = date_range(start='2014-01-01', end='2014-12-31', freq='M') + + idx2 = date_range(start='2014-01-01', end='2014-12-31', freq='D', + tz='UTC') + exp2 = date_range(start='2014-01-01', end='2014-12-31', freq='D') + + idx3 = date_range(start='2014-01-01', end='2014-03-01', freq='H', + tz='UTC') + exp3 = date_range(start='2014-01-01', end='2014-03-01', freq='H') + + idx4 = date_range(start='2014-08-01', end='2014-10-31', freq='T', + tz='UTC') + exp4 = date_range(start='2014-08-01', end='2014-10-31', freq='T') + + for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3), + (idx4, exp4)]: + converted = idx.tz_convert(tz) + reset = converted.tz_convert(None) + tm.assert_index_equal(reset, expected) + assert reset.tzinfo is None + expected = converted.tz_convert('UTC').tz_localize(None) + tm.assert_index_equal(reset, expected) + + def test_dti_tz_convert_tzlocal(self): + # GH#13583 + # tz_convert doesn't affect to internal + dti = date_range(start='2001-01-01', end='2001-03-01', tz='UTC') + dti2 = dti.tz_convert(dateutil.tz.tzlocal()) + tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) + + dti = date_range(start='2001-01-01', end='2001-03-01', + tz=dateutil.tz.tzlocal()) + dti2 = dti.tz_convert(None) + tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) + + @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern', + pytz.timezone('US/Eastern'), + gettz('US/Eastern')]) + def test_dti_tz_convert_utc_to_local_no_modify(self, tz): + rng = date_range('3/11/2012', '3/12/2012', freq='H', tz='utc') + rng_eastern = rng.tz_convert(tz) + + # Values are unmodified + tm.assert_numpy_array_equal(rng.asi8, rng_eastern.asi8) + + assert timezones.tz_compare(rng_eastern.tz, timezones.maybe_get_tz(tz)) + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_tz_convert_unsorted(self, tzstr): + dr = date_range('2012-03-09', freq='H', periods=100, tz='utc') + dr = dr.tz_convert(tzstr) + + result = dr[::-1].hour + exp = dr.hour[::-1] + tm.assert_almost_equal(result, exp) + + # ------------------------------------------------------------- + # DatetimeIndex.tz_localize + + def test_dti_tz_localize_nonexistent_raise_coerce(self): + # GH#13057 + times = ['2015-03-08 01:00', '2015-03-08 02:00', '2015-03-08 03:00'] + index = DatetimeIndex(times) + tz = 'US/Eastern' + with pytest.raises(pytz.NonExistentTimeError): + index.tz_localize(tz=tz) + + with pytest.raises(pytz.NonExistentTimeError): + index.tz_localize(tz=tz, errors='raise') + + result = index.tz_localize(tz=tz, errors='coerce') + test_times = ['2015-03-08 01:00-05:00', 'NaT', + '2015-03-08 03:00-04:00'] + dti = DatetimeIndex(test_times) + expected = dti.tz_localize('UTC').tz_convert('US/Eastern') + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), + gettz('US/Eastern')]) + def test_dti_tz_localize_ambiguous_infer(self, tz): + # November 6, 2011, fall back, repeat 2 AM hour + # With no repeated hours, we cannot infer the transition + dr = date_range(datetime(2011, 11, 6, 0), periods=5, + freq=pd.offsets.Hour()) + with pytest.raises(pytz.AmbiguousTimeError): + dr.tz_localize(tz) + + # 
With repeated hours, we can infer the transition + dr = date_range(datetime(2011, 11, 6, 0), periods=5, + freq=pd.offsets.Hour(), tz=tz) + times = ['11/06/2011 00:00', '11/06/2011 01:00', '11/06/2011 01:00', + '11/06/2011 02:00', '11/06/2011 03:00'] + di = DatetimeIndex(times) + localized = di.tz_localize(tz, ambiguous='infer') + tm.assert_index_equal(dr, localized) + with tm.assert_produces_warning(FutureWarning): + localized_old = di.tz_localize(tz, infer_dst=True) + tm.assert_index_equal(dr, localized_old) + tm.assert_index_equal(dr, DatetimeIndex(times, tz=tz, + ambiguous='infer')) + + # When there is no dst transition, nothing special happens + dr = date_range(datetime(2011, 6, 1, 0), periods=10, + freq=pd.offsets.Hour()) + localized = dr.tz_localize(tz) + localized_infer = dr.tz_localize(tz, ambiguous='infer') + tm.assert_index_equal(localized, localized_infer) + with tm.assert_produces_warning(FutureWarning): + localized_infer_old = dr.tz_localize(tz, infer_dst=True) + tm.assert_index_equal(localized, localized_infer_old) + + @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), + gettz('US/Eastern')]) + def test_dti_tz_localize_ambiguous_times(self, tz): + # March 13, 2011, spring forward, skip from 2 AM to 3 AM + dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, + freq=pd.offsets.Hour()) + with pytest.raises(pytz.NonExistentTimeError): + dr.tz_localize(tz) + + # after dst transition, it works + dr = date_range(datetime(2011, 3, 13, 3, 30), periods=3, + freq=pd.offsets.Hour(), tz=tz) + + # November 6, 2011, fall back, repeat 2 AM hour + dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3, + freq=pd.offsets.Hour()) + with pytest.raises(pytz.AmbiguousTimeError): + dr.tz_localize(tz) + + # UTC is OK + dr = date_range(datetime(2011, 3, 13), periods=48, + freq=pd.offsets.Minute(30), tz=pytz.utc) + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_dti_tz_localize_pass_dates_to_utc(self, tzstr): + strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] + + idx = DatetimeIndex(strdates) + conv = idx.tz_localize(tzstr) + + fromdates = DatetimeIndex(strdates, tz=tzstr) + + assert conv.tz == fromdates.tz + tm.assert_numpy_array_equal(conv.values, fromdates.values) + + @pytest.mark.parametrize('prefix', ['', 'dateutil/']) + def test_dti_tz_localize(self, prefix): + tzstr = prefix + 'US/Eastern' + dti = DatetimeIndex(start='1/1/2005', end='1/1/2005 0:00:30.256', + freq='L') + dti2 = dti.tz_localize(tzstr) + + dti_utc = DatetimeIndex(start='1/1/2005 05:00', + end='1/1/2005 5:00:30.256', freq='L', tz='utc') + + tm.assert_numpy_array_equal(dti2.values, dti_utc.values) + + dti3 = dti2.tz_convert(prefix + 'US/Pacific') + tm.assert_numpy_array_equal(dti3.values, dti_utc.values) + + dti = DatetimeIndex(start='11/6/2011 1:59', end='11/6/2011 2:00', + freq='L') + with pytest.raises(pytz.AmbiguousTimeError): + dti.tz_localize(tzstr) + + dti = DatetimeIndex(start='3/13/2011 1:59', end='3/13/2011 2:00', + freq='L') + with pytest.raises(pytz.NonExistentTimeError): + dti.tz_localize(tzstr) + + @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern', + pytz.timezone('US/Eastern'), + gettz('US/Eastern')]) + def test_dti_tz_localize_utc_conversion(self, tz): + # Localizing to time zone should: + # 1) check for DST ambiguities + # 2) convert to UTC + + rng = date_range('3/10/2012', '3/11/2012', freq='30T') + + converted = rng.tz_localize(tz) + expected_naive = rng + pd.offsets.Hour(5) + tm.assert_numpy_array_equal(converted.asi8, expected_naive.asi8) + + 
# DST ambiguity, this should fail + rng = date_range('3/11/2012', '3/12/2012', freq='30T') + # Is this really how it should fail?? + with pytest.raises(pytz.NonExistentTimeError): + rng.tz_localize(tz) + + @pytest.mark.parametrize('tz', ['UTC', 'Asia/Tokyo', 'US/Eastern', + 'dateutil/US/Pacific']) + def test_dti_tz_localize_roundtrip(self, tz): + idx1 = date_range(start='2014-01-01', end='2014-12-31', freq='M') + idx2 = date_range(start='2014-01-01', end='2014-12-31', freq='D') + idx3 = date_range(start='2014-01-01', end='2014-03-01', freq='H') + idx4 = date_range(start='2014-08-01', end='2014-10-31', freq='T') + for idx in [idx1, idx2, idx3, idx4]: + localized = idx.tz_localize(tz) + expected = date_range(start=idx[0], end=idx[-1], freq=idx.freq, + tz=tz) + tm.assert_index_equal(localized, expected) + + with pytest.raises(TypeError): + localized.tz_localize(tz) + + reset = localized.tz_localize(None) + tm.assert_index_equal(reset, idx) + assert reset.tzinfo is None + + def test_dti_tz_localize_naive(self): + rng = date_range('1/1/2011', periods=100, freq='H') + + conv = rng.tz_localize('US/Pacific') + exp = date_range('1/1/2011', periods=100, freq='H', tz='US/Pacific') + + tm.assert_index_equal(conv, exp) + + def test_dti_tz_localize_tzlocal(self): + # GH#13583 + offset = dateutil.tz.tzlocal().utcoffset(datetime(2011, 1, 1)) + offset = int(offset.total_seconds() * 1000000000) + + dti = date_range(start='2001-01-01', end='2001-03-01') + dti2 = dti.tz_localize(dateutil.tz.tzlocal()) + tm.assert_numpy_array_equal(dti2.asi8 + offset, dti.asi8) + + dti = date_range(start='2001-01-01', end='2001-03-01', + tz=dateutil.tz.tzlocal()) + dti2 = dti.tz_localize(None) + tm.assert_numpy_array_equal(dti2.asi8 - offset, dti.asi8) + + @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), + gettz('US/Eastern')]) + def test_dti_tz_localize_ambiguous_nat(self, tz): + times = ['11/06/2011 00:00', '11/06/2011 01:00', '11/06/2011 01:00', + '11/06/2011 02:00', '11/06/2011 03:00'] + di = DatetimeIndex(times) + localized = di.tz_localize(tz, ambiguous='NaT') + + times = ['11/06/2011 00:00', np.NaN, np.NaN, '11/06/2011 02:00', + '11/06/2011 03:00'] + di_test = DatetimeIndex(times, tz='US/Eastern') + + # left dtype is datetime64[ns, US/Eastern] + # right is datetime64[ns, tzfile('/usr/share/zoneinfo/US/Eastern')] + tm.assert_numpy_array_equal(di_test.values, localized.values) + + @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), + gettz('US/Eastern')]) + def test_dti_tz_localize_ambiguous_flags(self, tz): + # November 6, 2011, fall back, repeat 2 AM hour + + # Pass in flags to determine right dst transition + dr = date_range(datetime(2011, 11, 6, 0), periods=5, + freq=pd.offsets.Hour(), tz=tz) + times = ['11/06/2011 00:00', '11/06/2011 01:00', '11/06/2011 01:00', + '11/06/2011 02:00', '11/06/2011 03:00'] + + # Test tz_localize + di = DatetimeIndex(times) + is_dst = [1, 1, 0, 0, 0] + localized = di.tz_localize(tz, ambiguous=is_dst) + tm.assert_index_equal(dr, localized) + tm.assert_index_equal(dr, DatetimeIndex(times, tz=tz, + ambiguous=is_dst)) + + localized = di.tz_localize(tz, ambiguous=np.array(is_dst)) + tm.assert_index_equal(dr, localized) + + localized = di.tz_localize(tz, + ambiguous=np.array(is_dst).astype('bool')) + tm.assert_index_equal(dr, localized) + + # Test constructor + localized = DatetimeIndex(times, tz=tz, ambiguous=is_dst) + tm.assert_index_equal(dr, localized) + + # Test duplicate times where infer_dst fails + times += times + di = DatetimeIndex(times) + + # When the sizes are 
incompatible, make sure error is raised + with pytest.raises(Exception): + di.tz_localize(tz, ambiguous=is_dst) + + # When sizes are compatible and there are repeats ('infer' won't work) + is_dst = np.hstack((is_dst, is_dst)) + localized = di.tz_localize(tz, ambiguous=is_dst) + dr = dr.append(dr) + tm.assert_index_equal(dr, localized) + + # When there is no dst transition, nothing special happens + dr = date_range(datetime(2011, 6, 1, 0), periods=10, + freq=pd.offsets.Hour()) + is_dst = np.array([1] * 10) + localized = dr.tz_localize(tz) + localized_is_dst = dr.tz_localize(tz, ambiguous=is_dst) + tm.assert_index_equal(localized, localized_is_dst) + + # TODO: belongs outside tz_localize tests? + @pytest.mark.parametrize('tz', ['Europe/London', 'dateutil/Europe/London']) + def test_dti_construction_ambiguous_endpoint(self, tz): + # construction with an ambiguous end-point + # GH#11626 + + # FIXME: This next block fails to raise; it was taken from an older + # version of this test that had an indention mistake that caused it + # to not get executed. + # with pytest.raises(pytz.AmbiguousTimeError): + # date_range("2013-10-26 23:00", "2013-10-27 01:00", + # tz="Europe/London", freq="H") + + times = date_range("2013-10-26 23:00", "2013-10-27 01:00", freq="H", + tz=tz, ambiguous='infer') + assert times[0] == Timestamp('2013-10-26 23:00', tz=tz, freq="H") + + if str(tz).startswith('dateutil'): + if LooseVersion(dateutil.__version__) < LooseVersion('2.6.0'): + # see GH#14621 + assert times[-1] == Timestamp('2013-10-27 01:00:00+0000', + tz=tz, freq="H") + elif LooseVersion(dateutil.__version__) > LooseVersion('2.6.0'): + # fixed ambiguous behavior + assert times[-1] == Timestamp('2013-10-27 01:00:00+0100', + tz=tz, freq="H") + else: + assert times[-1] == Timestamp('2013-10-27 01:00:00+0000', + tz=tz, freq="H") + + def test_dti_tz_localize_bdate_range(self): + dr = pd.bdate_range('1/1/2009', '1/1/2010') + dr_utc = pd.bdate_range('1/1/2009', '1/1/2010', tz=pytz.utc) + localized = dr.tz_localize(pytz.utc) + tm.assert_index_equal(dr_utc, localized) + + # ------------------------------------------------------------- + # DatetimeIndex.normalize + + def test_normalize_tz(self): + rng = date_range('1/1/2000 9:30', periods=10, freq='D', + tz='US/Eastern') + + result = rng.normalize() + expected = date_range('1/1/2000', periods=10, freq='D', + tz='US/Eastern') + tm.assert_index_equal(result, expected) + + assert result.is_normalized + assert not rng.is_normalized + + rng = date_range('1/1/2000 9:30', periods=10, freq='D', tz='UTC') + + result = rng.normalize() + expected = date_range('1/1/2000', periods=10, freq='D', tz='UTC') + tm.assert_index_equal(result, expected) + + assert result.is_normalized + assert not rng.is_normalized + + rng = date_range('1/1/2000 9:30', periods=10, freq='D', tz=tzlocal()) + result = rng.normalize() + expected = date_range('1/1/2000', periods=10, freq='D', tz=tzlocal()) + tm.assert_index_equal(result, expected) + + assert result.is_normalized + assert not rng.is_normalized + + @td.skip_if_windows + @pytest.mark.parametrize('timezone', ['US/Pacific', 'US/Eastern', 'UTC', + 'Asia/Kolkata', 'Asia/Shanghai', + 'Australia/Canberra']) + def test_normalize_tz_local(self, timezone): + # GH#13459 + with tm.set_timezone(timezone): + rng = date_range('1/1/2000 9:30', periods=10, freq='D', + tz=tzlocal()) + + result = rng.normalize() + expected = date_range('1/1/2000', periods=10, freq='D', + tz=tzlocal()) + tm.assert_index_equal(result, expected) + + assert result.is_normalized + assert not 
rng.is_normalized + + # ------------------------------------------------------------ + # DatetimeIndex.__new__ + + @pytest.mark.parametrize('prefix', ['', 'dateutil/']) + def test_dti_constructor_static_tzinfo(self, prefix): + # it works! + index = DatetimeIndex([datetime(2012, 1, 1)], tz=prefix + 'EST') + index.hour + index[0] + + def test_dti_constructor_with_fixed_tz(self): + off = FixedOffset(420, '+07:00') + start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off) + end = datetime(2012, 6, 11, 5, 0, 0, tzinfo=off) + rng = date_range(start=start, end=end) + assert off == rng.tz + + rng2 = date_range(start, periods=len(rng), tz=off) + tm.assert_index_equal(rng, rng2) + + rng3 = date_range('3/11/2012 05:00:00+07:00', + '6/11/2012 05:00:00+07:00') + assert (rng.values == rng3.values).all() + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_dti_convert_datetime_list(self, tzstr): + dr = date_range('2012-06-02', periods=10, + tz=tzstr, name='foo') + dr2 = DatetimeIndex(list(dr), name='foo') + tm.assert_index_equal(dr, dr2) + assert dr.tz == dr2.tz + assert dr2.name == 'foo' + + def test_dti_construction_univalent(self): + rng = date_range('03/12/2012 00:00', periods=10, freq='W-FRI', + tz='US/Eastern') + rng2 = DatetimeIndex(data=rng, tz='US/Eastern') + tm.assert_index_equal(rng, rng2) + + @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), + gettz('US/Eastern')]) + def test_dti_from_tzaware_datetime(self, tz): + d = [datetime(2012, 8, 19, tzinfo=tz)] + + index = DatetimeIndex(d) + assert timezones.tz_compare(index.tz, tz) + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_dti_tz_constructors(self, tzstr): + """ Test different DatetimeIndex constructions with timezone + Follow-up of GH#4229 + """ + + arr = ['11/10/2005 08:00:00', '11/10/2005 09:00:00'] + + idx1 = to_datetime(arr).tz_localize(tzstr) + idx2 = DatetimeIndex(start="2005-11-10 08:00:00", freq='H', periods=2, + tz=tzstr) + idx3 = DatetimeIndex(arr, tz=tzstr) + idx4 = DatetimeIndex(np.array(arr), tz=tzstr) + + for other in [idx2, idx3, idx4]: + tm.assert_index_equal(idx1, other) + + # ------------------------------------------------------------- + # Unsorted + + def test_join_utc_convert(self): + rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + + left = rng.tz_convert('US/Eastern') + right = rng.tz_convert('Europe/Berlin') + + for how in ['inner', 'outer', 'left', 'right']: + result = left.join(left[:-5], how=how) + assert isinstance(result, DatetimeIndex) + assert result.tz == left.tz + + result = left.join(right[:-5], how=how) + assert isinstance(result, DatetimeIndex) + assert result.tz.zone == 'UTC' + + def test_dti_drop_dont_lose_tz(self): + # GH#2621 + ind = date_range("2012-12-01", periods=10, tz="utc") + ind = ind.drop(ind[-1]) + + assert ind.tz is not None + + def test_date_range_localize(self): + rng = date_range('3/11/2012 03:00', periods=15, freq='H', + tz='US/Eastern') + rng2 = DatetimeIndex(['3/11/2012 03:00', '3/11/2012 04:00'], + tz='US/Eastern') + rng3 = date_range('3/11/2012 03:00', periods=15, freq='H') + rng3 = rng3.tz_localize('US/Eastern') + + tm.assert_index_equal(rng, rng3) + + # DST transition time + val = rng[0] + exp = Timestamp('3/11/2012 03:00', tz='US/Eastern') + + assert val.hour == 3 + assert exp.hour == 3 + assert val == exp # same UTC value + tm.assert_index_equal(rng[:2], rng2) + + # Right before the DST transition + rng = date_range('3/11/2012 00:00', periods=2, freq='H', + tz='US/Eastern') + rng2 = 
DatetimeIndex(['3/11/2012 00:00', '3/11/2012 01:00'], + tz='US/Eastern') + tm.assert_index_equal(rng, rng2) + exp = Timestamp('3/11/2012 00:00', tz='US/Eastern') + assert exp.hour == 0 + assert rng[0] == exp + exp = Timestamp('3/11/2012 01:00', tz='US/Eastern') + assert exp.hour == 1 + assert rng[1] == exp + + rng = date_range('3/11/2012 00:00', periods=10, freq='H', + tz='US/Eastern') + assert rng[2].hour == 3 + + def test_timestamp_equality_different_timezones(self): + utc_range = date_range('1/1/2000', periods=20, tz='UTC') + eastern_range = utc_range.tz_convert('US/Eastern') + berlin_range = utc_range.tz_convert('Europe/Berlin') + + for a, b, c in zip(utc_range, eastern_range, berlin_range): + assert a == b + assert b == c + assert a == c + + assert (utc_range == eastern_range).all() + assert (utc_range == berlin_range).all() + assert (berlin_range == eastern_range).all() + + def test_dti_intersection(self): + rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + + left = rng[10:90][::-1] + right = rng[20:80][::-1] + + assert left.tz == rng.tz + result = left.intersection(right) + assert result.tz == left.tz + + def test_dti_equals_with_tz(self): + left = date_range('1/1/2011', periods=100, freq='H', tz='utc') + right = date_range('1/1/2011', periods=100, freq='H', tz='US/Eastern') + + assert not left.equals(right) + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_dti_tz_nat(self, tzstr): + idx = DatetimeIndex([Timestamp("2013-1-1", tz=tzstr), pd.NaT]) + + assert isna(idx[1]) + assert idx[0].tzinfo is not None + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_dti_astype_asobject_tzinfos(self, tzstr): + # GH#1345 + + # dates around a dst transition + rng = date_range('2/13/2010', '5/6/2010', tz=tzstr) + + objs = rng.astype(object) + for i, x in enumerate(objs): + exval = rng[i] + assert x == exval + assert x.tzinfo == exval.tzinfo + + objs = rng.astype(object) + for i, x in enumerate(objs): + exval = rng[i] + assert x == exval + assert x.tzinfo == exval.tzinfo + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_dti_with_timezone_repr(self, tzstr): + rng = date_range('4/13/2010', '5/6/2010') + + rng_eastern = rng.tz_localize(tzstr) + + rng_repr = repr(rng_eastern) + assert '2010-04-13 00:00:00' in rng_repr + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_dti_take_dont_lose_meta(self, tzstr): + rng = date_range('1/1/2000', periods=20, tz=tzstr) + + result = rng.take(lrange(5)) + assert result.tz == rng.tz + assert result.freq == rng.freq + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_utc_box_timestamp_and_localize(self, tzstr): + tz = timezones.maybe_get_tz(tzstr) + + rng = date_range('3/11/2012', '3/12/2012', freq='H', tz='utc') + rng_eastern = rng.tz_convert(tzstr) + + expected = rng[-1].astimezone(tz) + + stamp = rng_eastern[-1] + assert stamp == expected + assert stamp.tzinfo == expected.tzinfo + + # right tzinfo + rng = date_range('3/13/2012', '3/14/2012', freq='H', tz='utc') + rng_eastern = rng.tz_convert(tzstr) + # test not valid for dateutil timezones. 
+ # assert 'EDT' in repr(rng_eastern[0].tzinfo) + assert ('EDT' in repr(rng_eastern[0].tzinfo) or + 'tzfile' in repr(rng_eastern[0].tzinfo)) + + def test_dti_to_pydatetime(self): + dt = dateutil.parser.parse('2012-06-13T01:39:00Z') + dt = dt.replace(tzinfo=tzlocal()) + + arr = np.array([dt], dtype=object) + + result = to_datetime(arr, utc=True) + assert result.tz is pytz.utc + + rng = date_range('2012-11-03 03:00', '2012-11-05 03:00', tz=tzlocal()) + arr = rng.to_pydatetime() + result = to_datetime(arr, utc=True) + assert result.tz is pytz.utc + + def test_dti_to_pydatetime_fizedtz(self): + dates = np.array([datetime(2000, 1, 1, tzinfo=fixed_off), + datetime(2000, 1, 2, tzinfo=fixed_off), + datetime(2000, 1, 3, tzinfo=fixed_off)]) + dti = DatetimeIndex(dates) + + result = dti.to_pydatetime() + tm.assert_numpy_array_equal(dates, result) + + result = dti._mpl_repr() + tm.assert_numpy_array_equal(dates, result) + + @pytest.mark.parametrize('tz', [pytz.timezone('US/Central'), + gettz('US/Central')]) + def test_with_tz(self, tz): + # just want it to work + start = datetime(2011, 3, 12, tzinfo=pytz.utc) + dr = bdate_range(start, periods=50, freq=pd.offsets.Hour()) + assert dr.tz is pytz.utc + + # DateRange with naive datetimes + dr = bdate_range('1/1/2005', '1/1/2009', tz=pytz.utc) + dr = bdate_range('1/1/2005', '1/1/2009', tz=tz) + + # normalized + central = dr.tz_convert(tz) + assert central.tz is tz + naive = central[0].to_pydatetime().replace(tzinfo=None) + comp = tslib._localize_pydatetime(naive, tz).tzinfo + assert central[0].tz is comp + + # compare vs a localized tz + naive = dr[0].to_pydatetime().replace(tzinfo=None) + comp = tslib._localize_pydatetime(naive, tz).tzinfo + assert central[0].tz is comp + + # datetimes with tzinfo set + dr = bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), + datetime(2009, 1, 1, tzinfo=pytz.utc)) + with pytest.raises(Exception): + bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), '1/1/2009', + tz=tz) + + @pytest.mark.parametrize('prefix', ['', 'dateutil/']) + def test_field_access_localize(self, prefix): + strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] + rng = DatetimeIndex(strdates, tz=prefix + 'US/Eastern') + assert (rng.hour == 0).all() + + # a more unusual time zone, #1946 + dr = date_range('2011-10-02 00:00', freq='h', periods=10, + tz=prefix + 'America/Atikokan') + + expected = Index(np.arange(10, dtype=np.int64)) + tm.assert_index_equal(dr.hour, expected) + + @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), + gettz('US/Eastern')]) + def test_dti_convert_tz_aware_datetime_datetime(self, tz): + # GH#1581 + dates = [datetime(2000, 1, 1), datetime(2000, 1, 2), + datetime(2000, 1, 3)] + + dates_aware = [tslib._localize_pydatetime(x, tz) for x in dates] + result = DatetimeIndex(dates_aware) + assert timezones.tz_compare(result.tz, tz) + + converted = to_datetime(dates_aware, utc=True) + ex_vals = np.array([Timestamp(x).value for x in dates_aware]) + tm.assert_numpy_array_equal(converted.asi8, ex_vals) + assert converted.tz is pytz.utc + + def test_dti_union_aware(self): + # non-overlapping + rng = date_range("2012-11-15 00:00:00", periods=6, freq="H", + tz="US/Central") + + rng2 = date_range("2012-11-15 12:00:00", periods=6, freq="H", + tz="US/Eastern") + + result = rng.union(rng2) + assert result.tz.zone == 'UTC' + + +class TestDateRange(object): + """Tests for date_range with timezones""" + def test_hongkong_tz_convert(self): + # GH#1673 smoke test + dr = date_range('2012-01-01', '2012-01-10', freq='D', tz='Hongkong') + + # it works! 
+ dr.hour + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_date_range_span_dst_transition(self, tzstr): + # GH#1778 + + # Standard -> Daylight Savings Time + dr = date_range('03/06/2012 00:00', periods=200, freq='W-FRI', + tz='US/Eastern') + + assert (dr.hour == 0).all() + + dr = date_range('2012-11-02', periods=10, tz=tzstr) + assert (dr.hour == 0).all() + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_date_range_timezone_str_argument(self, tzstr): + tz = timezones.maybe_get_tz(tzstr) + result = date_range('1/1/2000', periods=10, tz=tzstr) + expected = date_range('1/1/2000', periods=10, tz=tz) + + tm.assert_index_equal(result, expected) + + def test_date_range_with_fixedoffset_noname(self): + off = fixed_off_no_name + start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off) + end = datetime(2012, 6, 11, 5, 0, 0, tzinfo=off) + rng = date_range(start=start, end=end) + assert off == rng.tz + + idx = Index([start, end]) + assert off == idx.tz + + @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + def test_date_range_with_tz(self, tzstr): + stamp = Timestamp('3/11/2012 05:00', tz=tzstr) + assert stamp.hour == 5 + + rng = date_range('3/11/2012 04:00', periods=10, freq='H', + tz=tzstr) + + assert stamp == rng[1] + + +class TestToDatetime(object): + """Tests for the to_datetime constructor with timezones""" + def test_to_datetime_utc(self): + arr = np.array([dateutil.parser.parse('2012-06-13T01:39:00Z')], + dtype=object) + + result = to_datetime(arr, utc=True) + assert result.tz is pytz.utc + + def test_to_datetime_fixed_offset(self): + dates = [datetime(2000, 1, 1, tzinfo=fixed_off), + datetime(2000, 1, 2, tzinfo=fixed_off), + datetime(2000, 1, 3, tzinfo=fixed_off)] + result = to_datetime(dates) + assert result.tz == fixed_off diff --git a/pandas/tests/tseries/test_timezones.py b/pandas/tests/tseries/test_timezones.py index 8f46e0a58580e..565e735c14c80 100644 --- a/pandas/tests/tseries/test_timezones.py +++ b/pandas/tests/tseries/test_timezones.py @@ -5,43 +5,13 @@ import dateutil import numpy as np -from dateutil.parser import parse -from pytz import NonExistentTimeError -from distutils.version import LooseVersion -from dateutil.tz import tzlocal -from datetime import datetime, timedelta, tzinfo +from datetime import datetime import pandas.util.testing as tm -import pandas.util._test_decorators as td -import pandas.tseries.offsets as offsets -from pandas.compat import lrange, zip -from pandas.core.indexes.datetimes import bdate_range, date_range +from pandas.core.indexes.datetimes import date_range from pandas._libs import tslib from pandas._libs.tslibs import timezones, conversion -from pandas import (Index, isna, Timestamp, NaT, - DatetimeIndex, to_datetime) -from pandas.util.testing import set_timezone - - -class FixedOffset(tzinfo): - """Fixed offset in minutes east from UTC.""" - - def __init__(self, offset, name): - self.__offset = timedelta(minutes=offset) - self.__name = name - - def utcoffset(self, dt): - return self.__offset - - def tzname(self, dt): - return self.__name - - def dst(self, dt): - return timedelta(0) - - -fixed_off = FixedOffset(-420, '-07:00') -fixed_off_no_name = FixedOffset(-330, None) +from pandas import Timestamp class TestTimeZoneSupportPytz(object): @@ -68,399 +38,6 @@ def cmptz(self, tz1, tz2): # tests. 
return tz1.zone == tz2.zone - def test_utc_to_local_no_modify(self): - rng = date_range('3/11/2012', '3/12/2012', freq='H', tz='utc') - rng_eastern = rng.tz_convert(self.tzstr('US/Eastern')) - - # Values are unmodified - tm.assert_numpy_array_equal(rng.asi8, rng_eastern.asi8) - - assert self.cmptz(rng_eastern.tz, self.tz('US/Eastern')) - - def test_utc_to_local_no_modify_explicit(self): - rng = date_range('3/11/2012', '3/12/2012', freq='H', tz='utc') - rng_eastern = rng.tz_convert(self.tz('US/Eastern')) - - # Values are unmodified - tm.assert_numpy_array_equal(rng.asi8, rng_eastern.asi8) - - assert rng_eastern.tz == self.tz('US/Eastern') - - def test_localize_utc_conversion(self): - # Localizing to time zone should: - # 1) check for DST ambiguities - # 2) convert to UTC - - rng = date_range('3/10/2012', '3/11/2012', freq='30T') - - converted = rng.tz_localize(self.tzstr('US/Eastern')) - expected_naive = rng + offsets.Hour(5) - tm.assert_numpy_array_equal(converted.asi8, expected_naive.asi8) - - # DST ambiguity, this should fail - rng = date_range('3/11/2012', '3/12/2012', freq='30T') - # Is this really how it should fail?? - pytest.raises(NonExistentTimeError, rng.tz_localize, - self.tzstr('US/Eastern')) - - def test_localize_utc_conversion_explicit(self): - # Localizing to time zone should: - # 1) check for DST ambiguities - # 2) convert to UTC - - rng = date_range('3/10/2012', '3/11/2012', freq='30T') - converted = rng.tz_localize(self.tz('US/Eastern')) - expected_naive = rng + offsets.Hour(5) - tm.assert_numpy_array_equal(converted.asi8, expected_naive.asi8) - - # DST ambiguity, this should fail - rng = date_range('3/11/2012', '3/12/2012', freq='30T') - # Is this really how it should fail?? - pytest.raises(NonExistentTimeError, rng.tz_localize, - self.tz('US/Eastern')) - - def test_tz_localize_dti(self): - dti = DatetimeIndex(start='1/1/2005', end='1/1/2005 0:00:30.256', - freq='L') - dti2 = dti.tz_localize(self.tzstr('US/Eastern')) - - dti_utc = DatetimeIndex(start='1/1/2005 05:00', - end='1/1/2005 5:00:30.256', freq='L', tz='utc') - - tm.assert_numpy_array_equal(dti2.values, dti_utc.values) - - dti3 = dti2.tz_convert(self.tzstr('US/Pacific')) - tm.assert_numpy_array_equal(dti3.values, dti_utc.values) - - dti = DatetimeIndex(start='11/6/2011 1:59', end='11/6/2011 2:00', - freq='L') - pytest.raises(pytz.AmbiguousTimeError, dti.tz_localize, - self.tzstr('US/Eastern')) - - dti = DatetimeIndex(start='3/13/2011 1:59', end='3/13/2011 2:00', - freq='L') - pytest.raises(pytz.NonExistentTimeError, dti.tz_localize, - self.tzstr('US/Eastern')) - - def test_create_with_tz(self): - stamp = Timestamp('3/11/2012 05:00', tz=self.tzstr('US/Eastern')) - assert stamp.hour == 5 - - rng = date_range('3/11/2012 04:00', periods=10, freq='H', - tz=self.tzstr('US/Eastern')) - - assert stamp == rng[1] - - def test_create_with_fixed_tz(self): - off = FixedOffset(420, '+07:00') - start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off) - end = datetime(2012, 6, 11, 5, 0, 0, tzinfo=off) - rng = date_range(start=start, end=end) - assert off == rng.tz - - rng2 = date_range(start, periods=len(rng), tz=off) - tm.assert_index_equal(rng, rng2) - - rng3 = date_range('3/11/2012 05:00:00+07:00', - '6/11/2012 05:00:00+07:00') - assert (rng.values == rng3.values).all() - - def test_create_with_fixedoffset_noname(self): - off = fixed_off_no_name - start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off) - end = datetime(2012, 6, 11, 5, 0, 0, tzinfo=off) - rng = date_range(start=start, end=end) - assert off == rng.tz - - idx = 
Index([start, end]) - assert off == idx.tz - - def test_date_range_localize(self): - rng = date_range('3/11/2012 03:00', periods=15, freq='H', - tz='US/Eastern') - rng2 = DatetimeIndex(['3/11/2012 03:00', '3/11/2012 04:00'], - tz='US/Eastern') - rng3 = date_range('3/11/2012 03:00', periods=15, freq='H') - rng3 = rng3.tz_localize('US/Eastern') - - tm.assert_index_equal(rng, rng3) - - # DST transition time - val = rng[0] - exp = Timestamp('3/11/2012 03:00', tz='US/Eastern') - - assert val.hour == 3 - assert exp.hour == 3 - assert val == exp # same UTC value - tm.assert_index_equal(rng[:2], rng2) - - # Right before the DST transition - rng = date_range('3/11/2012 00:00', periods=2, freq='H', - tz='US/Eastern') - rng2 = DatetimeIndex(['3/11/2012 00:00', '3/11/2012 01:00'], - tz='US/Eastern') - tm.assert_index_equal(rng, rng2) - exp = Timestamp('3/11/2012 00:00', tz='US/Eastern') - assert exp.hour == 0 - assert rng[0] == exp - exp = Timestamp('3/11/2012 01:00', tz='US/Eastern') - assert exp.hour == 1 - assert rng[1] == exp - - rng = date_range('3/11/2012 00:00', periods=10, freq='H', - tz='US/Eastern') - assert rng[2].hour == 3 - - def test_utc_box_timestamp_and_localize(self): - rng = date_range('3/11/2012', '3/12/2012', freq='H', tz='utc') - rng_eastern = rng.tz_convert(self.tzstr('US/Eastern')) - - tz = self.tz('US/Eastern') - expected = rng[-1].astimezone(tz) - - stamp = rng_eastern[-1] - assert stamp == expected - assert stamp.tzinfo == expected.tzinfo - - # right tzinfo - rng = date_range('3/13/2012', '3/14/2012', freq='H', tz='utc') - rng_eastern = rng.tz_convert(self.tzstr('US/Eastern')) - # test not valid for dateutil timezones. - # assert 'EDT' in repr(rng_eastern[0].tzinfo) - assert ('EDT' in repr(rng_eastern[0].tzinfo) or - 'tzfile' in repr(rng_eastern[0].tzinfo)) - - def test_timestamp_tz_convert(self): - strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] - idx = DatetimeIndex(strdates, tz=self.tzstr('US/Eastern')) - - conv = idx[0].tz_convert(self.tzstr('US/Pacific')) - expected = idx.tz_convert(self.tzstr('US/Pacific'))[0] - - assert conv == expected - - def test_pass_dates_localize_to_utc(self): - strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] - - idx = DatetimeIndex(strdates) - conv = idx.tz_localize(self.tzstr('US/Eastern')) - - fromdates = DatetimeIndex(strdates, tz=self.tzstr('US/Eastern')) - - assert conv.tz == fromdates.tz - tm.assert_numpy_array_equal(conv.values, fromdates.values) - - def test_field_access_localize(self): - strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] - rng = DatetimeIndex(strdates, tz=self.tzstr('US/Eastern')) - assert (rng.hour == 0).all() - - # a more unusual time zone, #1946 - dr = date_range('2011-10-02 00:00', freq='h', periods=10, - tz=self.tzstr('America/Atikokan')) - - expected = Index(np.arange(10, dtype=np.int64)) - tm.assert_index_equal(dr.hour, expected) - - def test_with_tz(self): - tz = self.tz('US/Central') - - # just want it to work - start = datetime(2011, 3, 12, tzinfo=pytz.utc) - dr = bdate_range(start, periods=50, freq=offsets.Hour()) - assert dr.tz is pytz.utc - - # DateRange with naive datetimes - dr = bdate_range('1/1/2005', '1/1/2009', tz=pytz.utc) - dr = bdate_range('1/1/2005', '1/1/2009', tz=tz) - - # normalized - central = dr.tz_convert(tz) - assert central.tz is tz - comp = self.localize(tz, central[0].to_pydatetime().replace( - tzinfo=None)).tzinfo - assert central[0].tz is comp - - # compare vs a localized tz - comp = self.localize(tz, - dr[0].to_pydatetime().replace(tzinfo=None)).tzinfo - assert central[0].tz is comp - - 
# datetimes with tzinfo set - dr = bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), - datetime(2009, 1, 1, tzinfo=pytz.utc)) - - pytest.raises(Exception, bdate_range, - datetime(2005, 1, 1, tzinfo=pytz.utc), '1/1/2009', - tz=tz) - - def test_tz_localize(self): - dr = bdate_range('1/1/2009', '1/1/2010') - dr_utc = bdate_range('1/1/2009', '1/1/2010', tz=pytz.utc) - localized = dr.tz_localize(pytz.utc) - tm.assert_index_equal(dr_utc, localized) - - def test_with_tz_ambiguous_times(self): - tz = self.tz('US/Eastern') - - # March 13, 2011, spring forward, skip from 2 AM to 3 AM - dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, - freq=offsets.Hour()) - pytest.raises(pytz.NonExistentTimeError, dr.tz_localize, tz) - - # after dst transition, it works - dr = date_range(datetime(2011, 3, 13, 3, 30), periods=3, - freq=offsets.Hour(), tz=tz) - - # November 6, 2011, fall back, repeat 2 AM hour - dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3, - freq=offsets.Hour()) - pytest.raises(pytz.AmbiguousTimeError, dr.tz_localize, tz) - - # UTC is OK - dr = date_range(datetime(2011, 3, 13), periods=48, - freq=offsets.Minute(30), tz=pytz.utc) - - def test_ambiguous_infer(self): - # November 6, 2011, fall back, repeat 2 AM hour - # With no repeated hours, we cannot infer the transition - tz = self.tz('US/Eastern') - dr = date_range(datetime(2011, 11, 6, 0), periods=5, - freq=offsets.Hour()) - pytest.raises(pytz.AmbiguousTimeError, dr.tz_localize, tz) - - # With repeated hours, we can infer the transition - dr = date_range(datetime(2011, 11, 6, 0), periods=5, - freq=offsets.Hour(), tz=tz) - times = ['11/06/2011 00:00', '11/06/2011 01:00', '11/06/2011 01:00', - '11/06/2011 02:00', '11/06/2011 03:00'] - di = DatetimeIndex(times) - localized = di.tz_localize(tz, ambiguous='infer') - tm.assert_index_equal(dr, localized) - with tm.assert_produces_warning(FutureWarning): - localized_old = di.tz_localize(tz, infer_dst=True) - tm.assert_index_equal(dr, localized_old) - tm.assert_index_equal(dr, DatetimeIndex(times, tz=tz, - ambiguous='infer')) - - # When there is no dst transition, nothing special happens - dr = date_range(datetime(2011, 6, 1, 0), periods=10, - freq=offsets.Hour()) - localized = dr.tz_localize(tz) - localized_infer = dr.tz_localize(tz, ambiguous='infer') - tm.assert_index_equal(localized, localized_infer) - with tm.assert_produces_warning(FutureWarning): - localized_infer_old = dr.tz_localize(tz, infer_dst=True) - tm.assert_index_equal(localized, localized_infer_old) - - def test_ambiguous_flags(self): - # November 6, 2011, fall back, repeat 2 AM hour - tz = self.tz('US/Eastern') - - # Pass in flags to determine right dst transition - dr = date_range(datetime(2011, 11, 6, 0), periods=5, - freq=offsets.Hour(), tz=tz) - times = ['11/06/2011 00:00', '11/06/2011 01:00', '11/06/2011 01:00', - '11/06/2011 02:00', '11/06/2011 03:00'] - - # Test tz_localize - di = DatetimeIndex(times) - is_dst = [1, 1, 0, 0, 0] - localized = di.tz_localize(tz, ambiguous=is_dst) - tm.assert_index_equal(dr, localized) - tm.assert_index_equal(dr, DatetimeIndex(times, tz=tz, - ambiguous=is_dst)) - - localized = di.tz_localize(tz, ambiguous=np.array(is_dst)) - tm.assert_index_equal(dr, localized) - - localized = di.tz_localize(tz, - ambiguous=np.array(is_dst).astype('bool')) - tm.assert_index_equal(dr, localized) - - # Test constructor - localized = DatetimeIndex(times, tz=tz, ambiguous=is_dst) - tm.assert_index_equal(dr, localized) - - # Test duplicate times where infer_dst fails - times += times - di = 
DatetimeIndex(times) - - # When the sizes are incompatible, make sure error is raised - pytest.raises(Exception, di.tz_localize, tz, ambiguous=is_dst) - - # When sizes are compatible and there are repeats ('infer' won't work) - is_dst = np.hstack((is_dst, is_dst)) - localized = di.tz_localize(tz, ambiguous=is_dst) - dr = dr.append(dr) - tm.assert_index_equal(dr, localized) - - # When there is no dst transition, nothing special happens - dr = date_range(datetime(2011, 6, 1, 0), periods=10, - freq=offsets.Hour()) - is_dst = np.array([1] * 10) - localized = dr.tz_localize(tz) - localized_is_dst = dr.tz_localize(tz, ambiguous=is_dst) - tm.assert_index_equal(localized, localized_is_dst) - - # construction with an ambiguous end-point - # GH 11626 - tz = self.tzstr("Europe/London") - - def f(): - date_range("2013-10-26 23:00", "2013-10-27 01:00", - tz="Europe/London", freq="H") - pytest.raises(pytz.AmbiguousTimeError, f) - - times = date_range("2013-10-26 23:00", "2013-10-27 01:00", freq="H", - tz=tz, ambiguous='infer') - assert times[0] == Timestamp('2013-10-26 23:00', tz=tz, freq="H") - - if str(tz).startswith('dateutil'): - if LooseVersion(dateutil.__version__) < LooseVersion('2.6.0'): - # see gh-14621 - assert times[-1] == Timestamp('2013-10-27 01:00:00+0000', - tz=tz, freq="H") - elif LooseVersion(dateutil.__version__) > LooseVersion('2.6.0'): - # fixed ambiguous behavior - assert times[-1] == Timestamp('2013-10-27 01:00:00+0100', - tz=tz, freq="H") - else: - assert times[-1] == Timestamp('2013-10-27 01:00:00+0000', - tz=tz, freq="H") - - def test_ambiguous_nat(self): - tz = self.tz('US/Eastern') - times = ['11/06/2011 00:00', '11/06/2011 01:00', '11/06/2011 01:00', - '11/06/2011 02:00', '11/06/2011 03:00'] - di = DatetimeIndex(times) - localized = di.tz_localize(tz, ambiguous='NaT') - - times = ['11/06/2011 00:00', np.NaN, np.NaN, '11/06/2011 02:00', - '11/06/2011 03:00'] - di_test = DatetimeIndex(times, tz='US/Eastern') - - # left dtype is datetime64[ns, US/Eastern] - # right is datetime64[ns, tzfile('/usr/share/zoneinfo/US/Eastern')] - tm.assert_numpy_array_equal(di_test.values, localized.values) - - def test_nonexistent_raise_coerce(self): - # See issue 13057 - from pytz.exceptions import NonExistentTimeError - times = ['2015-03-08 01:00', '2015-03-08 02:00', '2015-03-08 03:00'] - index = DatetimeIndex(times) - tz = 'US/Eastern' - pytest.raises(NonExistentTimeError, - index.tz_localize, tz=tz) - pytest.raises(NonExistentTimeError, - index.tz_localize, tz=tz, errors='raise') - result = index.tz_localize(tz=tz, errors='coerce') - test_times = ['2015-03-08 01:00-05:00', 'NaT', - '2015-03-08 03:00-04:00'] - expected = DatetimeIndex(test_times)\ - .tz_localize('UTC').tz_convert('US/Eastern') - tm.assert_index_equal(result, expected) - # test utility methods def test_infer_tz(self): eastern = self.tz('US/Eastern') @@ -486,183 +63,6 @@ def test_infer_tz(self): pytest.raises(Exception, timezones.infer_tzinfo, start, end) pytest.raises(Exception, timezones.infer_tzinfo, end, start) - def test_tz_string(self): - result = date_range('1/1/2000', periods=10, - tz=self.tzstr('US/Eastern')) - expected = date_range('1/1/2000', periods=10, tz=self.tz('US/Eastern')) - - tm.assert_index_equal(result, expected) - - def test_take_dont_lose_meta(self): - rng = date_range('1/1/2000', periods=20, tz=self.tzstr('US/Eastern')) - - result = rng.take(lrange(5)) - assert result.tz == rng.tz - assert result.freq == rng.freq - - def test_index_with_timezone_repr(self): - rng = date_range('4/13/2010', '5/6/2010') - - 
rng_eastern = rng.tz_localize(self.tzstr('US/Eastern')) - - rng_repr = repr(rng_eastern) - assert '2010-04-13 00:00:00' in rng_repr - - def test_index_astype_asobject_tzinfos(self): - # #1345 - - # dates around a dst transition - rng = date_range('2/13/2010', '5/6/2010', tz=self.tzstr('US/Eastern')) - - objs = rng.astype(object) - for i, x in enumerate(objs): - exval = rng[i] - assert x == exval - assert x.tzinfo == exval.tzinfo - - objs = rng.astype(object) - for i, x in enumerate(objs): - exval = rng[i] - assert x == exval - assert x.tzinfo == exval.tzinfo - - def test_fixed_offset(self): - dates = [datetime(2000, 1, 1, tzinfo=fixed_off), - datetime(2000, 1, 2, tzinfo=fixed_off), - datetime(2000, 1, 3, tzinfo=fixed_off)] - result = to_datetime(dates) - assert result.tz == fixed_off - - def test_fixedtz_topydatetime(self): - dates = np.array([datetime(2000, 1, 1, tzinfo=fixed_off), - datetime(2000, 1, 2, tzinfo=fixed_off), - datetime(2000, 1, 3, tzinfo=fixed_off)]) - result = to_datetime(dates).to_pydatetime() - tm.assert_numpy_array_equal(dates, result) - result = to_datetime(dates)._mpl_repr() - tm.assert_numpy_array_equal(dates, result) - - def test_convert_tz_aware_datetime_datetime(self): - # #1581 - - tz = self.tz('US/Eastern') - - dates = [datetime(2000, 1, 1), datetime(2000, 1, 2), - datetime(2000, 1, 3)] - - dates_aware = [self.localize(tz, x) for x in dates] - result = to_datetime(dates_aware) - assert self.cmptz(result.tz, self.tz('US/Eastern')) - - converted = to_datetime(dates_aware, utc=True) - ex_vals = np.array([Timestamp(x).value for x in dates_aware]) - tm.assert_numpy_array_equal(converted.asi8, ex_vals) - assert converted.tz is pytz.utc - - def test_to_datetime_utc(self): - arr = np.array([parse('2012-06-13T01:39:00Z')], dtype=object) - - result = to_datetime(arr, utc=True) - assert result.tz is pytz.utc - - def test_to_datetime_tzlocal(self): - dt = parse('2012-06-13T01:39:00Z') - dt = dt.replace(tzinfo=tzlocal()) - - arr = np.array([dt], dtype=object) - - result = to_datetime(arr, utc=True) - assert result.tz is pytz.utc - - rng = date_range('2012-11-03 03:00', '2012-11-05 03:00', tz=tzlocal()) - arr = rng.to_pydatetime() - result = to_datetime(arr, utc=True) - assert result.tz is pytz.utc - - def test_hongkong_tz_convert(self): - # #1673 - dr = date_range('2012-01-01', '2012-01-10', freq='D', tz='Hongkong') - - # it works! - dr.hour - - def test_tz_convert_unsorted(self): - dr = date_range('2012-03-09', freq='H', periods=100, tz='utc') - dr = dr.tz_convert(self.tzstr('US/Eastern')) - - result = dr[::-1].hour - exp = dr.hour[::-1] - tm.assert_almost_equal(result, exp) - - def test_shift_localized(self): - dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI') - dr_tz = dr.tz_localize(self.tzstr('US/Eastern')) - - result = dr_tz.shift(1, '10T') - assert result.tz == dr_tz.tz - - def test_static_tzinfo(self): - # it works! 
- index = DatetimeIndex([datetime(2012, 1, 1)], tz=self.tzstr('EST')) - index.hour - index[0] - - def test_tzaware_datetime_to_index(self): - d = [datetime(2012, 8, 19, tzinfo=self.tz('US/Eastern'))] - - index = DatetimeIndex(d) - assert self.cmptz(index.tz, self.tz('US/Eastern')) - - def test_date_range_span_dst_transition(self): - # #1778 - - # Standard -> Daylight Savings Time - dr = date_range('03/06/2012 00:00', periods=200, freq='W-FRI', - tz='US/Eastern') - - assert (dr.hour == 0).all() - - dr = date_range('2012-11-02', periods=10, tz=self.tzstr('US/Eastern')) - assert (dr.hour == 0).all() - - def test_convert_datetime_list(self): - dr = date_range('2012-06-02', periods=10, - tz=self.tzstr('US/Eastern'), name='foo') - dr2 = DatetimeIndex(list(dr), name='foo') - tm.assert_index_equal(dr, dr2) - assert dr.tz == dr2.tz - assert dr2.name == 'foo' - - def test_index_drop_dont_lose_tz(self): - # #2621 - ind = date_range("2012-12-01", periods=10, tz="utc") - ind = ind.drop(ind[-1]) - - assert ind.tz is not None - - def test_datetimeindex_tz(self): - """ Test different DatetimeIndex constructions with timezone - Follow-up of #4229 - """ - - arr = ['11/10/2005 08:00:00', '11/10/2005 09:00:00'] - - idx1 = to_datetime(arr).tz_localize(self.tzstr('US/Eastern')) - idx2 = DatetimeIndex(start="2005-11-10 08:00:00", freq='H', periods=2, - tz=self.tzstr('US/Eastern')) - idx3 = DatetimeIndex(arr, tz=self.tzstr('US/Eastern')) - idx4 = DatetimeIndex(np.array(arr), tz=self.tzstr('US/Eastern')) - - for other in [idx2, idx3, idx4]: - tm.assert_index_equal(idx1, other) - - def test_datetimeindex_tz_nat(self): - idx = to_datetime([Timestamp("2013-1-1", tz=self.tzstr('US/Eastern')), - NaT]) - - assert isna(idx[1]) - assert idx[0].tzinfo is not None - def test_replace_across_dst(self): # GH#18319 check that 1) timezone is correctly normalized and # 2) that hour is not incorrectly changed by this normalization @@ -712,159 +112,6 @@ def normalize(self, ts): # no-op for dateutil return ts - def test_tz_convert_hour_overflow_dst(self): - # Regression test for: - # https://github.com/pandas-dev/pandas/issues/13306 - - # sorted case US/Eastern -> UTC - ts = ['2008-05-12 09:50:00', - '2008-12-12 09:50:35', - '2009-05-12 09:50:32'] - tt = to_datetime(ts).tz_localize('US/Eastern') - ut = tt.tz_convert('UTC') - expected = Index([13, 14, 13]) - tm.assert_index_equal(ut.hour, expected) - - # sorted case UTC -> US/Eastern - ts = ['2008-05-12 13:50:00', - '2008-12-12 14:50:35', - '2009-05-12 13:50:32'] - tt = to_datetime(ts).tz_localize('UTC') - ut = tt.tz_convert('US/Eastern') - expected = Index([9, 9, 9]) - tm.assert_index_equal(ut.hour, expected) - - # unsorted case US/Eastern -> UTC - ts = ['2008-05-12 09:50:00', - '2008-12-12 09:50:35', - '2008-05-12 09:50:32'] - tt = to_datetime(ts).tz_localize('US/Eastern') - ut = tt.tz_convert('UTC') - expected = Index([13, 14, 13]) - tm.assert_index_equal(ut.hour, expected) - - # unsorted case UTC -> US/Eastern - ts = ['2008-05-12 13:50:00', - '2008-12-12 14:50:35', - '2008-05-12 13:50:32'] - tt = to_datetime(ts).tz_localize('UTC') - ut = tt.tz_convert('US/Eastern') - expected = Index([9, 9, 9]) - tm.assert_index_equal(ut.hour, expected) - - def test_tz_convert_hour_overflow_dst_timestamps(self): - # Regression test for: - # https://github.com/pandas-dev/pandas/issues/13306 - - tz = self.tzstr('US/Eastern') - - # sorted case US/Eastern -> UTC - ts = [Timestamp('2008-05-12 09:50:00', tz=tz), - Timestamp('2008-12-12 09:50:35', tz=tz), - Timestamp('2009-05-12 
09:50:32', tz=tz)] - tt = to_datetime(ts) - ut = tt.tz_convert('UTC') - expected = Index([13, 14, 13]) - tm.assert_index_equal(ut.hour, expected) - - # sorted case UTC -> US/Eastern - ts = [Timestamp('2008-05-12 13:50:00', tz='UTC'), - Timestamp('2008-12-12 14:50:35', tz='UTC'), - Timestamp('2009-05-12 13:50:32', tz='UTC')] - tt = to_datetime(ts) - ut = tt.tz_convert('US/Eastern') - expected = Index([9, 9, 9]) - tm.assert_index_equal(ut.hour, expected) - - # unsorted case US/Eastern -> UTC - ts = [Timestamp('2008-05-12 09:50:00', tz=tz), - Timestamp('2008-12-12 09:50:35', tz=tz), - Timestamp('2008-05-12 09:50:32', tz=tz)] - tt = to_datetime(ts) - ut = tt.tz_convert('UTC') - expected = Index([13, 14, 13]) - tm.assert_index_equal(ut.hour, expected) - - # unsorted case UTC -> US/Eastern - ts = [Timestamp('2008-05-12 13:50:00', tz='UTC'), - Timestamp('2008-12-12 14:50:35', tz='UTC'), - Timestamp('2008-05-12 13:50:32', tz='UTC')] - tt = to_datetime(ts) - ut = tt.tz_convert('US/Eastern') - expected = Index([9, 9, 9]) - tm.assert_index_equal(ut.hour, expected) - - def test_tslib_tz_convert_trans_pos_plus_1__bug(self): - # Regression test for tslib.tz_convert(vals, tz1, tz2). - # See https://github.com/pandas-dev/pandas/issues/4496 for details. - for freq, n in [('H', 1), ('T', 60), ('S', 3600)]: - idx = date_range(datetime(2011, 3, 26, 23), - datetime(2011, 3, 27, 1), freq=freq) - idx = idx.tz_localize('UTC') - idx = idx.tz_convert('Europe/Moscow') - - expected = np.repeat(np.array([3, 4, 5]), np.array([n, n, 1])) - tm.assert_index_equal(idx.hour, Index(expected)) - - def test_tslib_tz_convert_dst(self): - for freq, n in [('H', 1), ('T', 60), ('S', 3600)]: - # Start DST - idx = date_range('2014-03-08 23:00', '2014-03-09 09:00', freq=freq, - tz='UTC') - idx = idx.tz_convert('US/Eastern') - expected = np.repeat(np.array([18, 19, 20, 21, 22, 23, - 0, 1, 3, 4, 5]), - np.array([n, n, n, n, n, n, n, n, n, n, 1])) - tm.assert_index_equal(idx.hour, Index(expected)) - - idx = date_range('2014-03-08 18:00', '2014-03-09 05:00', freq=freq, - tz='US/Eastern') - idx = idx.tz_convert('UTC') - expected = np.repeat(np.array([23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), - np.array([n, n, n, n, n, n, n, n, n, n, 1])) - tm.assert_index_equal(idx.hour, Index(expected)) - - # End DST - idx = date_range('2014-11-01 23:00', '2014-11-02 09:00', freq=freq, - tz='UTC') - idx = idx.tz_convert('US/Eastern') - expected = np.repeat(np.array([19, 20, 21, 22, 23, - 0, 1, 1, 2, 3, 4]), - np.array([n, n, n, n, n, n, n, n, n, n, 1])) - tm.assert_index_equal(idx.hour, Index(expected)) - - idx = date_range('2014-11-01 18:00', '2014-11-02 05:00', freq=freq, - tz='US/Eastern') - idx = idx.tz_convert('UTC') - expected = np.repeat(np.array([22, 23, 0, 1, 2, 3, 4, 5, 6, - 7, 8, 9, 10]), - np.array([n, n, n, n, n, n, n, n, n, - n, n, n, 1])) - tm.assert_index_equal(idx.hour, Index(expected)) - - # daily - # Start DST - idx = date_range('2014-03-08 00:00', '2014-03-09 00:00', freq='D', - tz='UTC') - idx = idx.tz_convert('US/Eastern') - tm.assert_index_equal(idx.hour, Index([19, 19])) - - idx = date_range('2014-03-08 00:00', '2014-03-09 00:00', freq='D', - tz='US/Eastern') - idx = idx.tz_convert('UTC') - tm.assert_index_equal(idx.hour, Index([5, 5])) - - # End DST - idx = date_range('2014-11-01 00:00', '2014-11-02 00:00', freq='D', - tz='UTC') - idx = idx.tz_convert('US/Eastern') - tm.assert_index_equal(idx.hour, Index([20, 20])) - - idx = date_range('2014-11-01 00:00', '2014-11-02 000:00', freq='D', - tz='US/Eastern') - idx = 
idx.tz_convert('UTC') - tm.assert_index_equal(idx.hour, Index([4, 4])) - def test_tzlocal(self): # GH 13583 ts = Timestamp('2011-01-01', tz=dateutil.tz.tzlocal()) @@ -879,32 +126,6 @@ def test_tzlocal(self): offset = offset.total_seconds() * 1000000000 assert ts.value + offset == Timestamp('2011-01-01').value - def test_tz_localize_tzlocal(self): - # GH 13583 - offset = dateutil.tz.tzlocal().utcoffset(datetime(2011, 1, 1)) - offset = int(offset.total_seconds() * 1000000000) - - dti = date_range(start='2001-01-01', end='2001-03-01') - dti2 = dti.tz_localize(dateutil.tz.tzlocal()) - tm.assert_numpy_array_equal(dti2.asi8 + offset, dti.asi8) - - dti = date_range(start='2001-01-01', end='2001-03-01', - tz=dateutil.tz.tzlocal()) - dti2 = dti.tz_localize(None) - tm.assert_numpy_array_equal(dti2.asi8 - offset, dti.asi8) - - def test_tz_convert_tzlocal(self): - # GH 13583 - # tz_convert doesn't affect to internal - dti = date_range(start='2001-01-01', end='2001-03-01', tz='UTC') - dti2 = dti.tz_convert(dateutil.tz.tzlocal()) - tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) - - dti = date_range(start='2001-01-01', end='2001-03-01', - tz=dateutil.tz.tzlocal()) - dti2 = dti.tz_convert(None) - tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) - class TestTimeZoneCacheKey(object): @@ -922,228 +143,6 @@ def test_cache_keys_are_distinct_for_pytz_vs_dateutil(self, tz_name): timezones._p_tz_cache_key(tz_d)) -class TestTimeZones(object): - timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific'] - - def test_index_equals_with_tz(self): - left = date_range('1/1/2011', periods=100, freq='H', tz='utc') - right = date_range('1/1/2011', periods=100, freq='H', tz='US/Eastern') - - assert not left.equals(right) - - def test_tz_localize_naive(self): - rng = date_range('1/1/2011', periods=100, freq='H') - - conv = rng.tz_localize('US/Pacific') - exp = date_range('1/1/2011', periods=100, freq='H', tz='US/Pacific') - - tm.assert_index_equal(conv, exp) - - def test_tz_localize_roundtrip(self): - for tz in self.timezones: - idx1 = date_range(start='2014-01-01', end='2014-12-31', freq='M') - idx2 = date_range(start='2014-01-01', end='2014-12-31', freq='D') - idx3 = date_range(start='2014-01-01', end='2014-03-01', freq='H') - idx4 = date_range(start='2014-08-01', end='2014-10-31', freq='T') - for idx in [idx1, idx2, idx3, idx4]: - localized = idx.tz_localize(tz) - expected = date_range(start=idx[0], end=idx[-1], freq=idx.freq, - tz=tz) - tm.assert_index_equal(localized, expected) - - with pytest.raises(TypeError): - localized.tz_localize(tz) - - reset = localized.tz_localize(None) - tm.assert_index_equal(reset, idx) - assert reset.tzinfo is None - - def test_tz_convert_roundtrip(self): - for tz in self.timezones: - idx1 = date_range(start='2014-01-01', end='2014-12-31', freq='M', - tz='UTC') - exp1 = date_range(start='2014-01-01', end='2014-12-31', freq='M') - - idx2 = date_range(start='2014-01-01', end='2014-12-31', freq='D', - tz='UTC') - exp2 = date_range(start='2014-01-01', end='2014-12-31', freq='D') - - idx3 = date_range(start='2014-01-01', end='2014-03-01', freq='H', - tz='UTC') - exp3 = date_range(start='2014-01-01', end='2014-03-01', freq='H') - - idx4 = date_range(start='2014-08-01', end='2014-10-31', freq='T', - tz='UTC') - exp4 = date_range(start='2014-08-01', end='2014-10-31', freq='T') - - for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3), - (idx4, exp4)]: - converted = idx.tz_convert(tz) - reset = converted.tz_convert(None) - tm.assert_index_equal(reset, expected) - assert 
reset.tzinfo is None - tm.assert_index_equal(reset, converted.tz_convert( - 'UTC').tz_localize(None)) - - def test_join_utc_convert(self): - rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') - - left = rng.tz_convert('US/Eastern') - right = rng.tz_convert('Europe/Berlin') - - for how in ['inner', 'outer', 'left', 'right']: - result = left.join(left[:-5], how=how) - assert isinstance(result, DatetimeIndex) - assert result.tz == left.tz - - result = left.join(right[:-5], how=how) - assert isinstance(result, DatetimeIndex) - assert result.tz.zone == 'UTC' - - def test_join_aware(self): - rng = date_range('1/1/2011', periods=10, freq='H') - - # non-overlapping - rng = date_range("2012-11-15 00:00:00", periods=6, freq="H", - tz="US/Central") - - rng2 = date_range("2012-11-15 12:00:00", periods=6, freq="H", - tz="US/Eastern") - - result = rng.union(rng2) - assert result.tz.zone == 'UTC' - - def test_intersection(self): - rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') - - left = rng[10:90][::-1] - right = rng[20:80][::-1] - - assert left.tz == rng.tz - result = left.intersection(right) - assert result.tz == left.tz - - def test_timestamp_equality_different_timezones(self): - utc_range = date_range('1/1/2000', periods=20, tz='UTC') - eastern_range = utc_range.tz_convert('US/Eastern') - berlin_range = utc_range.tz_convert('Europe/Berlin') - - for a, b, c in zip(utc_range, eastern_range, berlin_range): - assert a == b - assert b == c - assert a == c - - assert (utc_range == eastern_range).all() - assert (utc_range == berlin_range).all() - assert (berlin_range == eastern_range).all() - - def test_datetimeindex_tz(self): - rng = date_range('03/12/2012 00:00', periods=10, freq='W-FRI', - tz='US/Eastern') - rng2 = DatetimeIndex(data=rng, tz='US/Eastern') - tm.assert_index_equal(rng, rng2) - - def test_normalize_tz(self): - rng = date_range('1/1/2000 9:30', periods=10, freq='D', - tz='US/Eastern') - - result = rng.normalize() - expected = date_range('1/1/2000', periods=10, freq='D', - tz='US/Eastern') - tm.assert_index_equal(result, expected) - - assert result.is_normalized - assert not rng.is_normalized - - rng = date_range('1/1/2000 9:30', periods=10, freq='D', tz='UTC') - - result = rng.normalize() - expected = date_range('1/1/2000', periods=10, freq='D', tz='UTC') - tm.assert_index_equal(result, expected) - - assert result.is_normalized - assert not rng.is_normalized - - rng = date_range('1/1/2000 9:30', periods=10, freq='D', tz=tzlocal()) - result = rng.normalize() - expected = date_range('1/1/2000', periods=10, freq='D', tz=tzlocal()) - tm.assert_index_equal(result, expected) - - assert result.is_normalized - assert not rng.is_normalized - - @td.skip_if_windows - def test_normalize_tz_local(self): - # see gh-13459 - timezones = ['US/Pacific', 'US/Eastern', 'UTC', 'Asia/Kolkata', - 'Asia/Shanghai', 'Australia/Canberra'] - - for timezone in timezones: - with set_timezone(timezone): - rng = date_range('1/1/2000 9:30', periods=10, freq='D', - tz=tzlocal()) - - result = rng.normalize() - expected = date_range('1/1/2000', periods=10, freq='D', - tz=tzlocal()) - tm.assert_index_equal(result, expected) - - assert result.is_normalized - assert not rng.is_normalized - - def test_tzaware_offset(self): - dates = date_range('2012-11-01', periods=3, tz='US/Pacific') - offset = dates + offsets.Hour(5) - assert dates[0] + offsets.Hour(5) == offset[0] - - # GH 6818 - for tz in ['UTC', 'US/Pacific', 'Asia/Tokyo']: - dates = date_range('2010-11-01 00:00', periods=3, tz=tz, freq='H') - 
expected = DatetimeIndex(['2010-11-01 05:00', '2010-11-01 06:00', - '2010-11-01 07:00'], freq='H', tz=tz) - - offset = dates + offsets.Hour(5) - tm.assert_index_equal(offset, expected) - offset = dates + np.timedelta64(5, 'h') - tm.assert_index_equal(offset, expected) - offset = dates + timedelta(hours=5) - tm.assert_index_equal(offset, expected) - - def test_nat(self): - # GH 5546 - dates = [NaT] - idx = DatetimeIndex(dates) - idx = idx.tz_localize('US/Pacific') - tm.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Pacific')) - idx = idx.tz_convert('US/Eastern') - tm.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Eastern')) - idx = idx.tz_convert('UTC') - tm.assert_index_equal(idx, DatetimeIndex(dates, tz='UTC')) - - dates = ['2010-12-01 00:00', '2010-12-02 00:00', NaT] - idx = DatetimeIndex(dates) - idx = idx.tz_localize('US/Pacific') - tm.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Pacific')) - idx = idx.tz_convert('US/Eastern') - expected = ['2010-12-01 03:00', '2010-12-02 03:00', NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) - - idx = idx + offsets.Hour(5) - expected = ['2010-12-01 08:00', '2010-12-02 08:00', NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) - idx = idx.tz_convert('US/Pacific') - expected = ['2010-12-01 05:00', '2010-12-02 05:00', NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Pacific')) - - idx = idx + np.timedelta64(3, 'h') - expected = ['2010-12-01 08:00', '2010-12-02 08:00', NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Pacific')) - - idx = idx.tz_convert('US/Eastern') - expected = ['2010-12-01 11:00', '2010-12-02 11:00', NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) - - class TestTslib(object): def test_tslib_tz_convert(self): From 983d71fa8477439aaa227367d7f2f14952e4e235 Mon Sep 17 00:00:00 2001 From: Jason Bandlow Date: Tue, 6 Feb 2018 18:48:55 -0500 Subject: [PATCH 058/214] BUG: Fix ts precision issue with groupby and NaT (#19526) closes #19526 Author: Jason Bandlow Closes #19530 from jbandlow/timestamp_float_conversion and squashes the following commits: 2fb23d673 [Jason Bandlow] merge af37225d4 [Jason Bandlow] BUG: Fix ts precision issue with groupby and NaT (#19526) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/groupby.py | 2 +- pandas/tests/groupby/aggregate/test_cython.py | 19 ++++++++++++++++++- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index b5bf7ccbda0b6..7322bd9fe3327 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -644,6 +644,7 @@ Groupby/Resample/Rolling - Fixed regression in :func:`DataFrame.groupby` which would not emit an error when called with a tuple key not in the index (:issue:`18798`) - Bug in :func:`DataFrame.resample` which silently ignored unsupported (or mistyped) options for ``label``, ``closed`` and ``convention`` (:issue:`19303`) - Bug in :func:`DataFrame.groupby` where tuples were interpreted as lists of keys rather than as keys (:issue:`17979`, :issue:`18249`) +- Bug in :func:`DataFrame.groupby` where aggregation by ``first``/``last``/``min``/``max`` was causing timestamps to lose precision (:issue:`19526`) - Bug in :func:`DataFrame.transform` where particular aggregation functions were being incorrectly cast to match the dtype(s) of the grouped data (:issue:`19200`) - Bug in :func:`DataFrame.groupby` passing the `on=` kwarg, and subsequently using ``.apply()`` 
(:issue:`17813`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index ab0070777c190..f352b80ba3069 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2336,7 +2336,7 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1): result = self._transform( result, values, labels, func, is_numeric, is_datetimelike) - if is_integer_dtype(result): + if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype('float64') diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index c8ee05ddbb74f..cef3a699ed24b 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -12,7 +12,8 @@ from numpy import nan import pandas as pd -from pandas import bdate_range, DataFrame, Index, Series +from pandas import (bdate_range, DataFrame, Index, Series, Timestamp, + Timedelta, NaT) from pandas.core.groupby import DataError import pandas.util.testing as tm @@ -187,3 +188,19 @@ def test_cython_agg_empty_buckets_nanops(): {"a": [1, 1, 1716, 1]}, index=pd.CategoricalIndex(intervals, name='a', ordered=True)) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize('op', ['first', 'last', 'max', 'min']) +@pytest.mark.parametrize('data', [ + Timestamp('2016-10-14 21:00:44.557'), + Timedelta('17088 days 21:00:44.557'), ]) +def test_cython_with_timestamp_and_nat(op, data): + # https://github.com/pandas-dev/pandas/issues/19526 + df = DataFrame({'a': [0, 1], 'b': [data, NaT]}) + index = Index([0, 1], name='a') + + # We will group by a and test the cython aggregations + expected = DataFrame({'b': [data, NaT]}, index=index) + + result = df.groupby('a').aggregate(op) + tm.assert_frame_equal(expected, result) From 7e6e7e48595e7e2319c42d7e30867fd7b1cc9a0d Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 7 Feb 2018 03:01:53 -0800 Subject: [PATCH 059/214] Cleaned up return of _get_cython_function (#19561) --- pandas/core/groupby.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index f352b80ba3069..01241db7c0c42 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2240,7 +2240,7 @@ def wrapper(*args, **kwargs): raise NotImplementedError("function is not implemented for this" "dtype: [how->%s,dtype->%s]" % (how, dtype_str)) - return func, dtype_str + return func def _cython_operation(self, kind, values, how, axis, min_count=-1): assert kind in ['transform', 'aggregate'] @@ -2304,12 +2304,12 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1): values = values.astype(object) try: - func, dtype_str = self._get_cython_function( + func = self._get_cython_function( kind, how, values, is_numeric) except NotImplementedError: if is_numeric: values = _ensure_float64(values) - func, dtype_str = self._get_cython_function( + func = self._get_cython_function( kind, how, values, is_numeric) else: raise From b210bd3ebe8e32b32980e08755f5f0134a6aa1df Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Wed, 7 Feb 2018 11:04:28 +0000 Subject: [PATCH 060/214] DEPR/CLN: fix from_items deprecation warnings (#19559) --- pandas/tests/groupby/aggregate/test_other.py | 3 ++- pandas/tests/reshape/test_reshape.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 575eae1916f4c..4c407ad8a0d93 100644 --- 
a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -7,6 +7,7 @@ from __future__ import print_function import pytest +from collections import OrderedDict import datetime as dt from functools import partial @@ -81,7 +82,7 @@ def test_agg_period_index(): s1 = Series(np.random.rand(len(index)), index=index) s2 = Series(np.random.rand(len(index)), index=index) series = [('s1', s1), ('s2', s2)] - df = DataFrame.from_items(series) + df = DataFrame.from_dict(OrderedDict(series)) grouped = df.groupby(df.index.month) list(grouped) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index c9d079421532f..a57c3c41b3637 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -3,6 +3,7 @@ from warnings import catch_warnings import pytest +from collections import OrderedDict from pandas import DataFrame, Series import pandas as pd @@ -457,7 +458,8 @@ def test_dataframe_dummies_preserve_categorical_dtype(self, dtype): @pytest.mark.parametrize('sparse', [True, False]) def test_get_dummies_dont_sparsify_all_columns(self, sparse): # GH18914 - df = DataFrame.from_items([('GDP', [1, 2]), ('Nation', ['AB', 'CD'])]) + df = DataFrame.from_dict(OrderedDict([('GDP', [1, 2]), + ('Nation', ['AB', 'CD'])])) df = get_dummies(df, columns=['Nation'], sparse=sparse) df2 = df.reindex(columns=['GDP']) From 0e58964c9039f969a4cf09ea296599c7794d9e8b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 7 Feb 2018 03:09:33 -0800 Subject: [PATCH 061/214] Implement get_day_of_year, tests (#19555) --- pandas/_libs/tslibs/ccalendar.pxd | 1 + pandas/_libs/tslibs/ccalendar.pyx | 43 ++++++++++++++++++++++----- pandas/_libs/tslibs/fields.pyx | 13 ++------ pandas/_libs/tslibs/period.pyx | 5 ++-- pandas/_libs/tslibs/timestamps.pyx | 20 +++---------- pandas/core/indexes/datetimes.py | 4 +-- pandas/tests/tslibs/__init__.py | 0 pandas/tests/tslibs/test_ccalendar.py | 18 +++++++++++ setup.py | 1 + 9 files changed, 67 insertions(+), 38 deletions(-) create mode 100644 pandas/tests/tslibs/__init__.py create mode 100644 pandas/tests/tslibs/test_ccalendar.py diff --git a/pandas/_libs/tslibs/ccalendar.pxd b/pandas/_libs/tslibs/ccalendar.pxd index a1bbeea1cb69a..42473a97a7150 100644 --- a/pandas/_libs/tslibs/ccalendar.pxd +++ b/pandas/_libs/tslibs/ccalendar.pxd @@ -10,3 +10,4 @@ cdef int dayofweek(int y, int m, int m) nogil cdef bint is_leapyear(int64_t year) nogil cpdef int32_t get_days_in_month(int year, Py_ssize_t month) nogil cpdef int32_t get_week_of_year(int year, int month, int day) nogil +cpdef int32_t get_day_of_year(int year, int month, int day) nogil diff --git a/pandas/_libs/tslibs/ccalendar.pyx b/pandas/_libs/tslibs/ccalendar.pyx index ae52f7dd30165..613e111443636 100644 --- a/pandas/_libs/tslibs/ccalendar.pyx +++ b/pandas/_libs/tslibs/ccalendar.pyx @@ -142,17 +142,13 @@ cpdef int32_t get_week_of_year(int year, int month, int day) nogil: Assumes the inputs describe a valid date. 
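As an aside to the ccalendar refactor above: a minimal pure-Python sketch of the same day-of-year / ISO-week arithmetic, hedged and illustrative only. The function names here are hypothetical and the values are cross-checked against the standard library rather than against ccalendar itself:

    # Illustrative sketch, not part of the diff: the day-of-year / ISO-week
    # values that ccalendar.pyx computes in Cython, expressed with the
    # standard library for cross-checking.
    from datetime import date

    def day_of_year(year, month, day):
        # 1-based ordinal day within the year (cf. get_day_of_year)
        return (date(year, month, day) - date(year, 1, 1)).days + 1

    def week_of_year(year, month, day):
        # ISO 8601 week number (cf. get_week_of_year)
        return date(year, month, day).isocalendar()[1]

    assert day_of_year(2001, 3, 1) == 60
    assert day_of_year(2004, 3, 1) == 61        # leap year
    assert day_of_year(1907, 12, 31) == 365
    assert week_of_year(2015, 1, 1) == 1        # 2015-01-01 was a Thursday

The expected day-of-year values mirror the assertions added in pandas/tests/tslibs/test_ccalendar.py later in this patch.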
""" cdef: - bint isleap, isleap_prev - int32_t mo_off + bint isleap int32_t doy, dow int woy isleap = is_leapyear(year) - isleap_prev = is_leapyear(year - 1) - - mo_off = _month_offset[isleap * 13 + month - 1] - doy = mo_off + day + doy = get_day_of_year(year, month, day) dow = dayofweek(year, month, day) # estimate @@ -162,7 +158,7 @@ cpdef int32_t get_week_of_year(int year, int month, int day) nogil: # verify if woy < 0: - if (woy > -2) or (woy == -2 and isleap_prev): + if (woy > -2) or (woy == -2 and is_leapyear(year - 1)): woy = 53 else: woy = 52 @@ -171,3 +167,36 @@ cpdef int32_t get_week_of_year(int year, int month, int day) nogil: woy = 1 return woy + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef int32_t get_day_of_year(int year, int month, int day) nogil: + """Return the ordinal day-of-year for the given day. + + Parameters + ---------- + year : int + month : int + day : int + + Returns + ------- + day_of_year : int32_t + + Notes + ----- + Assumes the inputs describe a valid date. + """ + cdef: + bint isleap + int32_t mo_off + int32_t doy, dow + int woy + + isleap = is_leapyear(year) + + mo_off = _month_offset[isleap * 13 + month - 1] + + day_of_year = mo_off + day + return day_of_year diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index a8a865eec38dd..7a4b9775bd56e 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -15,7 +15,7 @@ cnp.import_array() from ccalendar cimport (get_days_in_month, is_leapyear, dayofweek, - get_week_of_year) + get_week_of_year, get_day_of_year) from np_datetime cimport (pandas_datetimestruct, pandas_timedeltastruct, dt64_to_dtstruct, td64_to_tdstruct) from nattype cimport NPY_NAT @@ -374,15 +374,7 @@ def get_date_field(ndarray[int64_t] dtindex, object field): cdef: Py_ssize_t i, count = 0 ndarray[int32_t] out - ndarray[int32_t, ndim=2] _month_offset - int isleap, isleap_prev pandas_datetimestruct dts - int mo_off, doy, dow - - _month_offset = np.array( - [[0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365], - [0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366]], - dtype=np.int32 ) count = len(dtindex) out = np.empty(count, dtype='i4') @@ -482,8 +474,7 @@ def get_date_field(ndarray[int64_t] dtindex, object field): continue dt64_to_dtstruct(dtindex[i], &dts) - isleap = is_leapyear(dts.year) - out[i] = _month_offset[isleap, dts.month -1] + dts.day + out[i] = get_day_of_year(dts.year, dts.month, dts.day) return out elif field == 'dow': diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 5098e5c9100ff..e82c9c613c62a 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -22,7 +22,7 @@ from cpython.datetime cimport PyDateTime_Check, PyDateTime_IMPORT PyDateTime_IMPORT from np_datetime cimport (pandas_datetimestruct, dtstruct_to_dt64, - dt64_to_dtstruct, is_leapyear) + dt64_to_dtstruct) cimport util from util cimport is_period_object, is_string_object, INT32_MIN @@ -34,11 +34,12 @@ from timezones cimport is_utc, is_tzlocal, get_utcoffset, get_dst_info from timedeltas cimport delta_to_nanoseconds from ccalendar import MONTH_NUMBERS +from ccalendar cimport is_leapyear from frequencies cimport (get_freq_code, get_base_alias, get_to_timestamp_base, get_freq_str, get_rule_month) from parsing import parse_time_string, NAT_SENTINEL -from resolution import resolution, Resolution +from resolution import Resolution from nattype import nat_strings, NaT, iNaT from nattype cimport _nat_scalar_rules, NPY_NAT diff --git 
a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index b9be9c16eb6c3..47179a4e1d761 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -29,8 +29,7 @@ from nattype import NaT from nattype cimport NPY_NAT from np_datetime import OutOfBoundsDatetime from np_datetime cimport (reverse_ops, cmp_scalar, check_dts_bounds, - pandas_datetimestruct, dt64_to_dtstruct, - is_leapyear) + pandas_datetimestruct, dt64_to_dtstruct) from timedeltas import Timedelta from timedeltas cimport delta_to_nanoseconds from timezones cimport ( @@ -291,14 +290,6 @@ cdef class _Timestamp(datetime): val = tz_convert_single(self.value, 'UTC', self.tz) return val - cpdef int _get_field(self, field): - cdef: - int64_t val - ndarray[int32_t] out - val = self._maybe_convert_value_to_local() - out = get_date_field(np.array([val], dtype=np.int64), field) - return int(out[0]) - cpdef bint _get_start_end_field(self, str field): cdef: int64_t val @@ -695,14 +686,11 @@ class Timestamp(_Timestamp): @property def dayofyear(self): - return self._get_field('doy') + return ccalendar.get_day_of_year(self.year, self.month, self.day) @property def week(self): - if self.freq is None: - # fastpath for non-business - return ccalendar.get_week_of_year(self.year, self.month, self.day) - return self._get_field('woy') + return ccalendar.get_week_of_year(self.year, self.month, self.day) weekofyear = week @@ -764,7 +752,7 @@ class Timestamp(_Timestamp): @property def is_leap_year(self): - return bool(is_leapyear(self.year)) + return bool(ccalendar.is_leapyear(self.year)) def tz_localize(self, tz, ambiguous='raise', errors='raise'): """ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index e09fa87477122..61c941c3d2333 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -55,7 +55,7 @@ from pandas._libs import (lib, index as libindex, tslib as libts, join as libjoin, Timestamp) from pandas._libs.tslibs import (timezones, conversion, fields, parsing, - period as libperiod) + resolution as libresolution) # -------- some conversion wrapper functions @@ -1795,7 +1795,7 @@ def is_normalized(self): @cache_readonly def _resolution(self): - return libperiod.resolution(self.asi8, self.tz) + return libresolution.resolution(self.asi8, self.tz) def insert(self, loc, item): """ diff --git a/pandas/tests/tslibs/__init__.py b/pandas/tests/tslibs/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/tslibs/test_ccalendar.py b/pandas/tests/tslibs/test_ccalendar.py new file mode 100644 index 0000000000000..b5d562a7b5a9c --- /dev/null +++ b/pandas/tests/tslibs/test_ccalendar.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +from datetime import datetime + +import numpy as np + +from pandas._libs.tslibs import ccalendar + + +def test_get_day_of_year(): + assert ccalendar.get_day_of_year(2001, 3, 1) == 60 + assert ccalendar.get_day_of_year(2004, 3, 1) == 61 + assert ccalendar.get_day_of_year(1907, 12, 31) == 365 + assert ccalendar.get_day_of_year(2004, 12, 31) == 366 + + dt = datetime.fromordinal(1 + np.random.randint(365 * 4000)) + result = ccalendar.get_day_of_year(dt.year, dt.month, dt.day) + expected = (dt - dt.replace(month=1, day=1)).days + 1 + assert result == expected diff --git a/setup.py b/setup.py index 5397a1b84dc4d..2332503e558ed 100755 --- a/setup.py +++ b/setup.py @@ -515,6 +515,7 @@ def pxd(name): 'pyxfile': '_libs/tslibs/period', 'pxdfiles': ['_libs/src/util', '_libs/missing', + 
'_libs/tslibs/ccalendar', '_libs/tslibs/timedeltas', '_libs/tslibs/timezones', '_libs/tslibs/nattype'], From ccf967721c8b4c4189f71e3a65c617dcdb52f41c Mon Sep 17 00:00:00 2001 From: Alex Rychyk Date: Wed, 7 Feb 2018 13:12:58 +0200 Subject: [PATCH 062/214] fixed bug in df.aggregate passing non-existent columns (#19552) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/base.py | 4 ++++ pandas/tests/test_resample.py | 18 ++++++++++++++++-- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 7322bd9fe3327..c48f6d19e3b10 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -647,6 +647,7 @@ Groupby/Resample/Rolling - Bug in :func:`DataFrame.groupby` where aggregation by ``first``/``last``/``min``/``max`` was causing timestamps to lose precision (:issue:`19526`) - Bug in :func:`DataFrame.transform` where particular aggregation functions were being incorrectly cast to match the dtype(s) of the grouped data (:issue:`19200`) - Bug in :func:`DataFrame.groupby` passing the `on=` kwarg, and subsequently using ``.apply()`` (:issue:`17813`) +- Bug in :func:`DataFrame.resample().aggregate` not raising a `ValueError` when aggregating a non-existent column (:issue:`16766`) Sparse ^^^^^^ diff --git a/pandas/core/base.py b/pandas/core/base.py index d5b204dba063e..0969717d85e4f 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -392,6 +392,10 @@ def nested_renaming_depr(level=4): elif isinstance(obj, ABCSeries): nested_renaming_depr() + elif isinstance(obj, ABCDataFrame) and \ + k not in obj.columns: + raise ValueError( + "Column '{col}' does not exist!".format(col=k)) arg = new_arg diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index 2de890ea459f0..9feba3fd042dd 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -20,7 +20,6 @@ from pandas.core.dtypes.generic import ABCSeries, ABCDataFrame from pandas.compat import range, lrange, zip, product, OrderedDict -from pandas.core.base import SpecificationError from pandas.errors import UnsupportedFunctionCall from pandas.core.groupby import DataError import pandas.core.common as com @@ -614,7 +613,7 @@ def f(): t[['A']].agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) - pytest.raises(SpecificationError, f) + pytest.raises(ValueError, f) def test_agg_nested_dicts(self): @@ -659,6 +658,21 @@ def f(): 'B': {'rb': ['mean', 'std']}}) assert_frame_equal(result, expected, check_like=True) + def test_try_aggregate_non_existing_column(self): + # GH 16766 + data = [ + {'dt': datetime(2017, 6, 1, 0), 'x': 1.0, 'y': 2.0}, + {'dt': datetime(2017, 6, 1, 1), 'x': 2.0, 'y': 2.0}, + {'dt': datetime(2017, 6, 1, 2), 'x': 3.0, 'y': 1.5} + ] + df = DataFrame(data).set_index('dt') + + # Error as we don't have 'z' column + with pytest.raises(ValueError): + df.resample('30T').agg({'x': ['mean'], + 'y': ['median'], + 'z': ['sum']}) + def test_selection_api_validation(self): # GH 13500 index = date_range(datetime(2005, 1, 1), From 4e1fcbafc5b49aad9a3f6ac25fabdb4bf191d175 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 7 Feb 2018 07:30:16 -0500 Subject: [PATCH 063/214] ERR: raise KeyError on invalid column name in aggregate (#19566) xref #19552 --- doc/source/whatsnew/v0.23.0.txt | 2 +- pandas/core/base.py | 2 +- pandas/tests/test_resample.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 
c48f6d19e3b10..eaa8841b79a78 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -647,7 +647,7 @@ Groupby/Resample/Rolling - Bug in :func:`DataFrame.groupby` where aggregation by ``first``/``last``/``min``/``max`` was causing timestamps to lose precision (:issue:`19526`) - Bug in :func:`DataFrame.transform` where particular aggregation functions were being incorrectly cast to match the dtype(s) of the grouped data (:issue:`19200`) - Bug in :func:`DataFrame.groupby` passing the `on=` kwarg, and subsequently using ``.apply()`` (:issue:`17813`) -- Bug in :func:`DataFrame.resample().aggregate` not raising a `ValueError` when aggregating a non-existent column (:issue:`16766`) +- Bug in :func:`DataFrame.resample().aggregate` not raising a ``KeyError`` when aggregating a non-existent column (:issue:`16766`, :issue:`19566`) Sparse ^^^^^^ diff --git a/pandas/core/base.py b/pandas/core/base.py index 0969717d85e4f..3d8f5f265e3db 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -394,7 +394,7 @@ def nested_renaming_depr(level=4): nested_renaming_depr() elif isinstance(obj, ABCDataFrame) and \ k not in obj.columns: - raise ValueError( + raise KeyError( "Column '{col}' does not exist!".format(col=k)) arg = new_arg diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index 9feba3fd042dd..23cc18de34778 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -613,7 +613,7 @@ def f(): t[['A']].agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) - pytest.raises(ValueError, f) + pytest.raises(KeyError, f) def test_agg_nested_dicts(self): @@ -668,7 +668,7 @@ def test_try_aggregate_non_existing_column(self): df = DataFrame(data).set_index('dt') # Error as we don't have 'z' column - with pytest.raises(ValueError): + with pytest.raises(KeyError): df.resample('30T').agg({'x': ['mean'], 'y': ['median'], 'z': ['sum']}) From a7d1103081e4b49f64a0f872c40ed6c16d275506 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 7 Feb 2018 04:32:37 -0800 Subject: [PATCH 064/214] Frame ops prelims - de-duplicate, remove unused kwargs (#19522) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/frame.py | 49 ++++++++++------------------ pandas/core/indexes/base.py | 47 ++++++++++++-------------- pandas/core/ops.py | 8 +++-- pandas/core/sparse/frame.py | 14 +++----- pandas/tests/frame/test_operators.py | 13 ++++++++ 6 files changed, 61 insertions(+), 71 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index eaa8841b79a78..80c5352701540 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -582,6 +582,7 @@ Numeric - Bug in :class:`Index` multiplication and division methods where operating with a ``Series`` would return an ``Index`` object instead of a ``Series`` object (:issue:`19042`) - Bug in the :class:`DataFrame` constructor in which data containing very large positive or very large negative numbers was causing ``OverflowError`` (:issue:`18584`) - Bug in :class:`Index` constructor with ``dtype='uint64'`` where int-like floats were not coerced to :class:`UInt64Index` (:issue:`18400`) +- Bug in :class:`DataFrame` flex arithmetic (e.g. 
`df.add(other, fill_value=foo)`) with a `fill_value` other than ``None`` failed to raise ``NotImplementedError`` in corner cases where either the frame or ``other`` has length zero (:issue:`19522`) Indexing diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3d1983f65d70d..b0ead3f0c7f00 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3915,8 +3915,7 @@ def reorder_levels(self, order, axis=0): # ---------------------------------------------------------------------- # Arithmetic / combination related - def _combine_frame(self, other, func, fill_value=None, level=None, - try_cast=True): + def _combine_frame(self, other, func, fill_value=None, level=None): this, other = self.align(other, join='outer', level=level, copy=False) new_index, new_columns = this.index, this.columns @@ -3968,52 +3967,40 @@ def f(i): def _combine_series(self, other, func, fill_value=None, axis=None, level=None, try_cast=True): + if fill_value is not None: + raise NotImplementedError("fill_value {fill} not supported." + .format(fill=fill_value)) + if axis is not None: axis = self._get_axis_name(axis) if axis == 'index': - return self._combine_match_index(other, func, level=level, - fill_value=fill_value, - try_cast=try_cast) + return self._combine_match_index(other, func, level=level) else: return self._combine_match_columns(other, func, level=level, - fill_value=fill_value, try_cast=try_cast) - return self._combine_series_infer(other, func, level=level, - fill_value=fill_value, - try_cast=try_cast) - - def _combine_series_infer(self, other, func, level=None, - fill_value=None, try_cast=True): - if len(other) == 0: - return self * np.nan + else: + if not len(other): + return self * np.nan - if len(self) == 0: - # Ambiguous case, use _series so works with DataFrame - return self._constructor(data=self._series, index=self.index, - columns=self.columns) + if not len(self): + # Ambiguous case, use _series so works with DataFrame + return self._constructor(data=self._series, index=self.index, + columns=self.columns) - return self._combine_match_columns(other, func, level=level, - fill_value=fill_value, - try_cast=try_cast) + # default axis is columns + return self._combine_match_columns(other, func, level=level, + try_cast=try_cast) - def _combine_match_index(self, other, func, level=None, - fill_value=None, try_cast=True): + def _combine_match_index(self, other, func, level=None): left, right = self.align(other, join='outer', axis=0, level=level, copy=False) - if fill_value is not None: - raise NotImplementedError("fill_value %r not supported." % - fill_value) return self._constructor(func(left.values.T, right.values).T, index=left.index, columns=self.columns, copy=False) - def _combine_match_columns(self, other, func, level=None, - fill_value=None, try_cast=True): + def _combine_match_columns(self, other, func, level=None, try_cast=True): left, right = self.align(other, join='outer', axis=1, level=level, copy=False) - if fill_value is not None: - raise NotImplementedError("fill_value %r not supported" % - fill_value) new_data = left._data.eval(func=func, other=right, axes=[left.columns, self.index], diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 1e1bb0d49b3df..15df77bf772dc 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -80,6 +80,26 @@ def _try_get_item(x): return x +def _make_invalid_op(name): + """ + Return a binary method that always raises a TypeError. 
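As an aside to the helper being hoisted to module level here: a small sketch of the same closure-factory pattern in plain Python. The class and operator names below are hypothetical, not pandas', and this is an illustration of the idea rather than the library's implementation:

    # Illustrative sketch, not part of the diff: build one disabled dunder
    # method per operator name and attach it to a class, in the spirit of
    # _make_invalid_op / _add_numeric_methods_disabled.
    def make_invalid_op(name):
        def invalid_op(self, other=None):
            raise TypeError("cannot perform {name} with this type: "
                            "{typ}".format(name=name, typ=type(self)))
        invalid_op.__name__ = name
        return invalid_op

    class Immutable(object):
        pass

    for op_name in ('__add__', '__mul__'):
        setattr(Immutable, op_name, make_invalid_op(op_name))

    try:
        Immutable() + 1
    except TypeError as exc:
        print(exc)  # cannot perform __add__ with this type: ...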
+ + Parameters + ---------- + name : str + + Returns + ------- + invalid_op : function + """ + def invalid_op(self, other=None): + raise TypeError("cannot perform {name} with this index type: " + "{typ}".format(name=name, typ=type(self))) + + invalid_op.__name__ = name + return invalid_op + + class InvalidIndexError(Exception): pass @@ -3916,30 +3936,12 @@ def _evaluate_compare(self, other): @classmethod def _add_numeric_methods_add_sub_disabled(cls): """ add in the numeric add/sub methods to disable """ - - def _make_invalid_op(name): - def invalid_op(self, other=None): - raise TypeError("cannot perform {name} with this index type: " - "{typ}".format(name=name, typ=type(self))) - - invalid_op.__name__ = name - return invalid_op - cls.__add__ = cls.__radd__ = __iadd__ = _make_invalid_op('__add__') # noqa cls.__sub__ = __isub__ = _make_invalid_op('__sub__') # noqa @classmethod def _add_numeric_methods_disabled(cls): """ add in numeric methods to disable other than add/sub """ - - def _make_invalid_op(name): - def invalid_op(self, other=None): - raise TypeError("cannot perform {name} with this index type: " - "{typ}".format(name=name, typ=type(self))) - - invalid_op.__name__ = name - return invalid_op - cls.__pow__ = cls.__rpow__ = _make_invalid_op('__pow__') cls.__mul__ = cls.__rmul__ = _make_invalid_op('__mul__') cls.__floordiv__ = cls.__rfloordiv__ = _make_invalid_op('__floordiv__') @@ -4147,15 +4149,6 @@ def logical_func(self, *args, **kwargs): @classmethod def _add_logical_methods_disabled(cls): """ add in logical methods to disable """ - - def _make_invalid_op(name): - def invalid_op(self, other=None): - raise TypeError("cannot perform {name} with this index type: " - "{typ}".format(name=name, typ=type(self))) - - invalid_op.__name__ = name - return invalid_op - cls.all = _make_invalid_op('all') cls.any = _make_invalid_op('any') diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 6db84aedce7e7..effa35695fcd1 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -1106,12 +1106,13 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): if isinstance(other, ABCDataFrame): # Another DataFrame return self._combine_frame(other, na_op, fill_value, level) elif isinstance(other, ABCSeries): - return self._combine_series(other, na_op, fill_value, axis, level) + return self._combine_series(other, na_op, fill_value, axis, level, + try_cast=True) else: if fill_value is not None: self = self.fillna(fill_value) - return self._combine_const(other, na_op) + return self._combine_const(other, na_op, try_cast=True) f.__name__ = name @@ -1172,7 +1173,8 @@ def f(self, other): if isinstance(other, ABCDataFrame): # Another DataFrame return self._compare_frame(other, func, str_rep) elif isinstance(other, ABCSeries): - return self._combine_series_infer(other, func, try_cast=False) + return self._combine_series(other, func, + axis=None, try_cast=False) else: # straight boolean comparisons we want to allow all columns diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 91dc44e3f185e..122c2b11f25f9 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -540,8 +540,7 @@ def xs(self, key, axis=0, copy=False): # ---------------------------------------------------------------------- # Arithmetic-related methods - def _combine_frame(self, other, func, fill_value=None, level=None, - try_cast=True): + def _combine_frame(self, other, func, fill_value=None, level=None): this, other = self.align(other, join='outer', level=level, copy=False) 
new_index, new_columns = this.index, this.columns @@ -584,12 +583,9 @@ def _combine_frame(self, other, func, fill_value=None, level=None, default_fill_value=new_fill_value ).__finalize__(self) - def _combine_match_index(self, other, func, level=None, fill_value=None, - try_cast=True): + def _combine_match_index(self, other, func, level=None): new_data = {} - if fill_value is not None: - raise NotImplementedError("'fill_value' argument is not supported") if level is not None: raise NotImplementedError("'level' argument is not supported") @@ -605,6 +601,7 @@ def _combine_match_index(self, other, func, level=None, fill_value=None, new_data[col] = func(series.values, other.values) # fill_value is a function of our operator + fill_value = None if isna(other.fill_value) or isna(self.default_fill_value): fill_value = np.nan else: @@ -615,15 +612,12 @@ def _combine_match_index(self, other, func, level=None, fill_value=None, new_data, index=new_index, columns=self.columns, default_fill_value=fill_value).__finalize__(self) - def _combine_match_columns(self, other, func, level=None, fill_value=None, - try_cast=True): + def _combine_match_columns(self, other, func, level=None, try_cast=True): # patched version of DataFrame._combine_match_columns to account for # NumPy circumventing __rsub__ with float64 types, e.g.: 3.0 - series, # where 3.0 is numpy.float64 and series is a SparseSeries. Still # possible for this to happen, which is bothersome - if fill_value is not None: - raise NotImplementedError("'fill_value' argument is not supported") if level is not None: raise NotImplementedError("'level' argument is not supported") diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index bf895be8bc813..26974b6398694 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -381,6 +381,19 @@ def test_arith_flex_frame(self): with tm.assert_raises_regex(NotImplementedError, 'fill_value'): self.frame.add(self.frame.iloc[0], axis='index', fill_value=3) + def test_arith_flex_zero_len_raises(self): + # GH#19522 passing fill_value to frame flex arith methods should + # raise even in the zero-length special cases + ser_len0 = pd.Series([]) + df_len0 = pd.DataFrame([], columns=['A', 'B']) + df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) + + with tm.assert_raises_regex(NotImplementedError, 'fill_value'): + df.add(ser_len0, fill_value='E') + + with tm.assert_raises_regex(NotImplementedError, 'fill_value'): + df_len0.sub(df['A'], axis=None, fill_value=3) + def test_binary_ops_align(self): # test aligning binary ops From 6b0c7e72b141831b7a9a5651f9e19eef53ec9e76 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 7 Feb 2018 08:06:46 -0500 Subject: [PATCH 065/214] API/BUG: .apply will correctly infer output shape when axis=1 (#18577) closes #16353 closes #17348 closes #17437 closes #18573 closes #17970 closes #17892 closes #17602 closes #18775 closes #18901 closes #18919 --- doc/source/basics.rst | 10 +- doc/source/whatsnew/v0.23.0.txt | 73 ++++- pandas/core/apply.py | 288 ++++++++++++------ pandas/core/frame.py | 136 ++++++++- pandas/core/sparse/frame.py | 42 ++- pandas/io/formats/style.py | 4 +- pandas/tests/frame/test_apply.py | 386 ++++++++++++++++++++++-- pandas/tests/sparse/frame/test_apply.py | 92 ++++++ pandas/tests/sparse/frame/test_frame.py | 46 --- 9 files changed, 885 insertions(+), 192 deletions(-) create mode 100644 pandas/tests/sparse/frame/test_apply.py diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 
18da53506f018..fb9e5a6cc75cb 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -793,8 +793,14 @@ The :meth:`~DataFrame.apply` method will also dispatch on a string method name. df.apply('mean') df.apply('mean', axis=1) -Depending on the return type of the function passed to :meth:`~DataFrame.apply`, -the result will either be of lower dimension or the same dimension. +The return type of the function passed to :meth:`~DataFrame.apply` affects the +type of the ultimate output from DataFrame.apply + +* If the applied function returns a ``Series``, the ultimate output is a ``DataFrame``. + The columns match the index of the ``Series`` returned by the applied function. +* If the applied function returns any other type, the ultimate output is a ``Series``. +* A ``result_type`` kwarg is accepted with the options: ``reduce``, ``broadcast``, and ``expand``. + These will determine how list-likes return results expand (or not) to a ``DataFrame``. :meth:`~DataFrame.apply` combined with some cleverness can be used to answer many questions about a data set. For example, suppose we wanted to extract the date where the diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 80c5352701540..1c6b698605521 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -142,7 +142,7 @@ Previous Behavior: 4 NaN dtype: float64 -Current Behavior +Current Behavior: .. ipython:: python @@ -167,7 +167,7 @@ Previous Behavior: 3 2.5 dtype: float64 -Current Behavior +Current Behavior: .. ipython:: python @@ -332,6 +332,73 @@ Convert to an xarray DataArray p.to_xarray() +.. _whatsnew_0230.api_breaking.apply: + +Apply Changes +~~~~~~~~~~~~~ + +:func:`DataFrame.apply` was inconsistent when applying an arbitrary user-defined-function that returned a list-like with ``axis=1``. Several bugs and inconsistencies +are resolved. If the applied function returns a Series, then pandas will return a DataFrame; otherwise a Series will be returned, this includes the case +where a list-like (e.g. ``tuple`` or ``list`` is returned), (:issue:`16353`, :issue:`17437`, :issue:`17970`, :issue:`17348`, :issue:`17892`, :issue:`18573`, +:issue:`17602`, :issue:`18775`, :issue:`18901`, :issue:`18919`) + +.. ipython:: python + + df = pd.DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1, columns=['A', 'B', 'C']) + df + +Previous Behavior. If the returned shape happened to match the original columns, this would return a ``DataFrame``. +If the return shape did not match, a ``Series`` with lists was returned. + +.. code-block:: python + + In [3]: df.apply(lambda x: [1, 2, 3], axis=1) + Out[3]: + A B C + 0 1 2 3 + 1 1 2 3 + 2 1 2 3 + 3 1 2 3 + 4 1 2 3 + 5 1 2 3 + + In [4]: df.apply(lambda x: [1, 2], axis=1) + Out[4]: + 0 [1, 2] + 1 [1, 2] + 2 [1, 2] + 3 [1, 2] + 4 [1, 2] + 5 [1, 2] + dtype: object + + +New Behavior. The behavior is consistent. These will *always* return a ``Series``. + +.. ipython:: python + + df.apply(lambda x: [1, 2, 3], axis=1) + df.apply(lambda x: [1, 2], axis=1) + +To have expanded columns, you can use ``result_type='expand'`` + +.. ipython:: python + + df.apply(lambda x: [1, 2, 3], axis=1, result_type='expand') + +To have broadcast the result across, you can use ``result_type='broadcast'``. The shape +must match the original columns. + +.. ipython:: python + + df.apply(lambda x: [1, 2, 3], axis=1, result_type='broadcast') + +Returning a ``Series`` allows one to control the exact return structure and column names: + +.. 
ipython:: python + + df.apply(lambda x: Series([1, 2, 3], index=x.index), axis=1) + .. _whatsnew_0230.api_breaking.build_changes: @@ -456,6 +523,8 @@ Deprecations - The ``is_copy`` attribute is deprecated and will be removed in a future version (:issue:`18801`). - ``IntervalIndex.from_intervals`` is deprecated in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) - :func:``DataFrame.from_items`` is deprecated. Use :func:``DataFrame.from_dict()`` instead, or :func:``DataFrame.from_dict(OrderedDict())`` if you wish to preserve the key order (:issue:`17320`) +- The ``broadcast`` parameter of ``.apply()`` is removed in favor of ``result_type='broadcast'`` (:issue:`18577`) +- The ``reduce`` parameter of ``.apply()`` is removed in favor of ``result_type='reduce'`` (:issue:`18577`) .. _whatsnew_0230.prior_deprecations: diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 4cdec54b9a07a..c65943fbbb201 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1,15 +1,20 @@ +import warnings import numpy as np from pandas import compat from pandas._libs import reduction +from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.common import ( is_extension_type, is_sequence) +from pandas.util._decorators import cache_readonly from pandas.io.formats.printing import pprint_thing -def frame_apply(obj, func, axis=0, broadcast=False, - raw=False, reduce=None, args=(), **kwds): +def frame_apply(obj, func, axis=0, broadcast=None, + raw=False, reduce=None, result_type=None, + ignore_failures=False, + args=None, kwds=None): """ construct and return a row or column based frame apply object """ axis = obj._get_axis_number(axis) @@ -19,20 +24,49 @@ def frame_apply(obj, func, axis=0, broadcast=False, klass = FrameColumnApply return klass(obj, func, broadcast=broadcast, - raw=raw, reduce=reduce, args=args, kwds=kwds) + raw=raw, reduce=reduce, result_type=result_type, + ignore_failures=ignore_failures, + args=args, kwds=kwds) class FrameApply(object): - def __init__(self, obj, func, broadcast, raw, reduce, args, kwds): + def __init__(self, obj, func, broadcast, raw, reduce, result_type, + ignore_failures, args, kwds): self.obj = obj - self.broadcast = broadcast self.raw = raw - self.reduce = reduce - self.args = args - - self.ignore_failures = kwds.pop('ignore_failures', False) - self.kwds = kwds + self.ignore_failures = ignore_failures + self.args = args or () + self.kwds = kwds or {} + + if result_type not in [None, 'reduce', 'broadcast', 'expand']: + raise ValueError("invalid value for result_type, must be one " + "of {None, 'reduce', 'broadcast', 'expand'}") + + if broadcast is not None: + warnings.warn("The broadcast argument is deprecated and will " + "be removed in a future version. You can specify " + "result_type='broadcast' to broadcast the result " + "to the original dimensions", + FutureWarning, stacklevel=4) + if broadcast: + result_type = 'broadcast' + + if reduce is not None: + warnings.warn("The reduce argument is deprecated and will " + "be removed in a future version. 
You can specify " + "result_type='reduce' to try to reduce the result " + "to the original dimensions", + FutureWarning, stacklevel=4) + if reduce: + + if result_type is not None: + raise ValueError( + "cannot pass both reduce=True and result_type") + + result_type = 'reduce' + + self.result_type = result_type # curry if needed if kwds or args and not isinstance(func, np.ufunc): @@ -43,6 +77,11 @@ def f(x): self.f = f + # results + self.result = None + self.res_index = None + self.res_columns = None + @property def columns(self): return self.obj.columns @@ -51,10 +90,14 @@ def columns(self): def index(self): return self.obj.index - @property + @cache_readonly def values(self): return self.obj.values + @cache_readonly + def dtypes(self): + return self.obj.dtypes + @property def agg_axis(self): return self.obj._get_agg_axis(self.axis) @@ -68,8 +111,7 @@ def get_result(self): # string dispatch if isinstance(self.f, compat.string_types): - if self.axis: - self.kwds['axis'] = self.axis + self.kwds['axis'] = self.axis return getattr(self.obj, self.f)(*self.args, **self.kwds) # ufunc @@ -80,25 +122,37 @@ def get_result(self): columns=self.columns, copy=False) # broadcasting - if self.broadcast: + if self.result_type == 'broadcast': return self.apply_broadcast() # one axis empty - if not all(self.obj.shape): + elif not all(self.obj.shape): return self.apply_empty_result() # raw - if self.raw and not self.obj._is_mixed_type: + elif self.raw and not self.obj._is_mixed_type: return self.apply_raw() return self.apply_standard() def apply_empty_result(self): - from pandas import Series - reduce = self.reduce + """ + we have an empty result; at least 1 axis is 0 + + we will try to apply the function to an empty + series in order to see if this is a reduction function + """ + + # we are not asked to reduce or infer reduction + # so just return a copy of the existing object + if self.result_type not in ['reduce', None]: + return self.obj.copy() + + # we may need to infer + reduce = self.result_type == 'reduce' - if reduce is None: - reduce = False + from pandas import Series + if not reduce: EMPTY_SERIES = Series([]) try: @@ -113,6 +167,8 @@ def apply_empty_result(self): return self.obj.copy() def apply_raw(self): + """ apply to the values as a numpy array """ + try: result = reduction.reduce(self.values, self.f, axis=self.axis) except Exception: @@ -125,49 +181,70 @@ def apply_raw(self): else: return Series(result, index=self.agg_axis) - def apply_standard(self): - from pandas import Series + def apply_broadcast(self, target): + result_values = np.empty_like(target.values) + + # axis which we want to compare compliance + result_compare = target.shape[0] + + for i, col in enumerate(target.columns): + res = self.f(target[col]) + ares = np. asarray(res).ndim + + # must be a scalar or 1d + if ares > 1: + raise ValueError("too many dims to broadcast") + elif ares == 1: + + # must match return dim + if result_compare != len(res): + raise ValueError("cannot broadcast result") - reduce = self.reduce - if reduce is None: - reduce = True + result_values[:, i] = res + + # we *always* preserve the original index / columns + result = self.obj._constructor(result_values, + index=target.index, + columns=target.columns) + return result + + def apply_standard(self): # try to reduce first (by default) # this only matters if the reduction in values is of different dtype # e.g. 
if we want to apply to a SparseFrame, then can't directly reduce - if reduce: - values = self.values - # we cannot reduce using non-numpy dtypes, - # as demonstrated in gh-12244 - if not is_extension_type(values): + # we cannot reduce using non-numpy dtypes, + # as demonstrated in gh-12244 + if (self.result_type in ['reduce', None] and + not self.dtypes.apply(is_extension_type).any()): - # Create a dummy Series from an empty array - index = self.obj._get_axis(self.axis) - empty_arr = np.empty(len(index), dtype=values.dtype) - - dummy = Series(empty_arr, index=index, dtype=values.dtype) + # Create a dummy Series from an empty array + from pandas import Series + values = self.values + index = self.obj._get_axis(self.axis) + labels = self.agg_axis + empty_arr = np.empty(len(index), dtype=values.dtype) + dummy = Series(empty_arr, index=index, dtype=values.dtype) - try: - labels = self.agg_axis - result = reduction.reduce(values, self.f, - axis=self.axis, - dummy=dummy, - labels=labels) - return Series(result, index=labels) - except Exception: - pass + try: + result = reduction.reduce(values, self.f, + axis=self.axis, + dummy=dummy, + labels=labels) + return Series(result, index=labels) + except Exception: + pass # compute the result using the series generator - results, res_index, res_columns = self._apply_series_generator() + self.apply_series_generator() # wrap results - return self.wrap_results(results, res_index, res_columns) + return self.wrap_results() - def _apply_series_generator(self): + def apply_series_generator(self): series_gen = self.series_generator res_index = self.result_index - res_columns = self.result_columns i = None keys = [] @@ -201,40 +278,23 @@ def _apply_series_generator(self): pprint_thing(k), ) raise - return results, res_index, res_columns + self.results = results + self.res_index = res_index + self.res_columns = self.result_columns - def wrap_results(self, results, res_index, res_columns): - from pandas import Series + def wrap_results(self): + results = self.results + # see if we can infer the results if len(results) > 0 and is_sequence(results[0]): - if not isinstance(results[0], Series): - index = res_columns - else: - index = None - result = self.obj._constructor(data=results, index=index) - result.columns = res_index + return self.wrap_results_for_axis() - if self.axis == 1: - result = result.T - result = result._convert( - datetime=True, timedelta=True, copy=False) - - else: - - result = Series(results) - result.index = res_index - - return result - - def _apply_broadcast(self, target): - result_values = np.empty_like(target.values) - columns = target.columns - for i, col in enumerate(columns): - result_values[:, i] = self.f(target[col]) + # dict of scalars + from pandas import Series + result = Series(results) + result.index = self.res_index - result = self.obj._constructor(result_values, index=target.index, - columns=target.columns) return result @@ -251,7 +311,7 @@ def get_result(self): return super(FrameRowApply, self).get_result() def apply_broadcast(self): - return self._apply_broadcast(self.obj) + return super(FrameRowApply, self).apply_broadcast(self.obj) @property def series_generator(self): @@ -266,29 +326,37 @@ def result_index(self): def result_columns(self): return self.index + def wrap_results_for_axis(self): + """ return the results for the rows """ -class FrameColumnApply(FrameApply): - axis = 1 + results = self.results + result = self.obj._constructor(data=results) - def __init__(self, obj, func, broadcast, raw, reduce, args, kwds): - 
super(FrameColumnApply, self).__init__(obj, func, broadcast, - raw, reduce, args, kwds) + if not isinstance(results[0], ABCSeries): + try: + result.index = self.res_columns + except ValueError: + pass - # skip if we are mixed datelike and trying reduce across axes - # GH6125 - if self.reduce: - if self.obj._is_mixed_type and self.obj._is_datelike_mixed_type: - self.reduce = False + try: + result.columns = self.res_index + except ValueError: + pass + + return result + + +class FrameColumnApply(FrameApply): + axis = 1 def apply_broadcast(self): - return self._apply_broadcast(self.obj.T).T + result = super(FrameColumnApply, self).apply_broadcast(self.obj.T) + return result.T @property def series_generator(self): - from pandas import Series - dtype = object if self.obj._is_mixed_type else None - return (Series._from_array(arr, index=self.columns, name=name, - dtype=dtype) + constructor = self.obj._constructor_sliced + return (constructor(arr, index=self.columns, name=name) for i, (arr, name) in enumerate(zip(self.values, self.index))) @@ -299,3 +367,39 @@ def result_index(self): @property def result_columns(self): return self.columns + + def wrap_results_for_axis(self): + """ return the results for the columns """ + results = self.results + + # we have requested to expand + if self.result_type == 'expand': + result = self.infer_to_same_shape() + + # we have a non-series and don't want inference + elif not isinstance(results[0], ABCSeries): + from pandas import Series + + result = Series(results) + result.index = self.res_index + + # we may want to infer results + else: + result = self.infer_to_same_shape() + + return result + + def infer_to_same_shape(self): + """ infer the results to the same shape as the input object """ + results = self.results + + result = self.obj._constructor(data=results) + result = result.T + + # set the index + result.index = self.res_index + + # infer dtypes + result = result.infer_objects() + + return result diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b0ead3f0c7f00..9487f51919108 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4820,8 +4820,8 @@ def aggregate(self, func, axis=0, *args, **kwargs): agg = aggregate - def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, - args=(), **kwds): + def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, + result_type=None, args=(), **kwds): """Applies function along input axis of DataFrame. Objects passed to functions are Series objects having index @@ -4836,9 +4836,14 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, axis : {0 or 'index', 1 or 'columns'}, default 0 * 0 or 'index': apply function to each column * 1 or 'columns': apply function to each row - broadcast : boolean, default False + broadcast : boolean, optional For aggregation functions, return object of same size with values propagated + + .. deprecated:: 0.23.0 + This argument will be removed in a future version, replaced + by result_type='broadcast'. + raw : boolean, default False If False, convert each row or column into a Series. If raw=True the passed function will receive ndarray objects instead. If you are @@ -4852,6 +4857,24 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, while guessing, exceptions raised by func will be ignored). If reduce is True a Series will always be returned, and if False a DataFrame will always be returned. + + .. 
deprecated:: 0.23.0 + This argument will be removed in a future version, replaced + by result_type='reduce'. + + result_type : {'expand', 'reduce', 'broadcast, None} + These only act when axis=1 {columns} + * 'expand' : list-like results will be turned into columns. + * 'reduce' : return a Series if possible rather than expanding + list-like results. This is the opposite to 'expand'. + * 'broadcast' : results will be broadcast to the original shape + of the frame, the original index & columns will be retained. + * None : list-like results will be returned as a list + in a single column. However if the apply function + returns a Series these are expanded to columns. + + .. versionadded:: 0.23.0 + args : tuple Positional arguments to pass to function in addition to the array/series @@ -4867,9 +4890,96 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, Examples -------- - >>> df.apply(numpy.sqrt) # returns DataFrame - >>> df.apply(numpy.sum, axis=0) # equiv to df.sum(0) - >>> df.apply(numpy.sum, axis=1) # equiv to df.sum(1) + + We use this DataFrame to illustrate + + >>> df = DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1, + ... columns=['A', 'B', 'C']) + >>> df + A B C + 0 1 2 3 + 1 1 2 3 + 2 1 2 3 + 3 1 2 3 + 4 1 2 3 + 5 1 2 3 + + Using a ufunc + + >>> df.apply(np.sqrt) + A B C + 0 1.0 1.414214 1.732051 + 1 1.0 1.414214 1.732051 + 2 1.0 1.414214 1.732051 + 3 1.0 1.414214 1.732051 + 4 1.0 1.414214 1.732051 + 5 1.0 1.414214 1.732051 + + Using a reducing function on either axis + + >>> df.apply(np.sum, axis=0) + A 6 + B 12 + C 18 + dtype: int64 + + >>> df.apply(np.sum, axis=1) + 0 6 + 1 6 + 2 6 + 3 6 + 4 6 + 5 6 + dtype: int64 + + Retuning a list-like will result in a Series + + >>> df.apply(lambda x: [1, 2], axis=1) + 0 [1, 2] + 1 [1, 2] + 2 [1, 2] + 3 [1, 2] + 4 [1, 2] + 5 [1, 2] + + Passing result_type='expand' will expand list-like results + to columns of a Dataframe + + >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand') + 0 1 + 0 1 2 + 1 1 2 + 2 1 2 + 3 1 2 + 4 1 2 + 5 1 2 + + Return a Series inside the function is similar to passing + Passing result_type='expand'. The resulting column names + will be the Series index. + + >>> df.apply(lambda x: Series([1, 2], index=['foo', 'bar']), axis=1) + foo bar + 0 1 2 + 1 1 2 + 2 1 2 + 3 1 2 + 4 1 2 + 5 1 2 + + + Passing result_type='broadcast' will take a same shape + result, whether list-like or scalar and broadcast it + along the axis. The resulting column names will be the originals. + + >>> df.apply(lambda x: [1, 2, 3], axis=1, result_type='broadcast') + A B C + 0 1 2 3 + 1 1 2 3 + 2 1 2 3 + 3 1 2 3 + 4 1 2 3 + 5 1 2 3 See also -------- @@ -4888,7 +4998,9 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, broadcast=broadcast, raw=raw, reduce=reduce, - args=args, **kwds) + result_type=result_type, + args=args, + kwds=kwds) return op.get_result() def applymap(self, func): @@ -5592,12 +5704,16 @@ def f(x): # numeric_only and yet we have tried a # column-by-column reduction, where we have mixed type. 
# So let's just do what we can - result = self.apply(f, reduce=False, - ignore_failures=True) + from pandas.core.apply import frame_apply + opa = frame_apply(self, + func=f, + result_type='expand', + ignore_failures=True) + result = opa.get_result() if result.ndim == self.ndim: result = result.iloc[0] return result - except: + except Exception: pass if filter_type is None or filter_type == 'numeric': diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 122c2b11f25f9..371377ce2899c 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -829,7 +829,8 @@ def notna(self): return self._apply_columns(lambda x: x.notna()) notnull = notna - def apply(self, func, axis=0, broadcast=False, reduce=False): + def apply(self, func, axis=0, broadcast=None, reduce=None, + result_type=None): """ Analogous to DataFrame.apply, for SparseDataFrame @@ -842,6 +843,35 @@ def apply(self, func, axis=0, broadcast=False, reduce=False): For aggregation functions, return object of same size with values propagated + .. deprecated:: 0.23.0 + This argument will be removed in a future version, replaced + by result_type='broadcast'. + + reduce : boolean or None, default None + Try to apply reduction procedures. If the DataFrame is empty, + apply will use reduce to determine whether the result should be a + Series or a DataFrame. If reduce is None (the default), apply's + return value will be guessed by calling func an empty Series (note: + while guessing, exceptions raised by func will be ignored). If + reduce is True a Series will always be returned, and if False a + DataFrame will always be returned. + + .. deprecated:: 0.23.0 + This argument will be removed in a future version, replaced + by result_type='reduce'. + + result_type : {'expand', 'reduce', 'broadcast, None} + These only act when axis=1 {columns} + * 'expand' : list-like results will be turned into columns + * 'reduce' : return a Series if possible rather than expanding + list-like results. This is the opposite to 'expand' + * 'broadcast' : scalar results will be broadcast to all columns + * None : list-like results will be returned as a list + in a single column. However if the apply function + returns a Series these are expanded to columns. + + .. 
versionadded:: 0.23.0 + Returns ------- applied : Series or SparseDataFrame @@ -865,12 +895,10 @@ def apply(self, func, axis=0, broadcast=False, reduce=False): op = frame_apply(self, func=func, axis=axis, - reduce=reduce) - - if broadcast: - return op.apply_broadcast() - - return op.apply_standard() + reduce=reduce, + broadcast=broadcast, + result_type=result_type) + return op.get_result() def applymap(self, func): """ diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 20e72dd6bde91..525f487d8aa39 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -509,7 +509,9 @@ def _apply(self, func, axis=0, subset=None, **kwargs): subset = _non_reducing_slice(subset) data = self.data.loc[subset] if axis is not None: - result = data.apply(func, axis=axis, **kwargs) + result = data.apply(func, axis=axis, + result_type='expand', **kwargs) + result.columns = data.columns else: result = func(data, **kwargs) if not isinstance(result, pd.DataFrame): diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index d69ddcd8f14d4..d1ad9f71e6350 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -82,24 +82,30 @@ def test_apply_empty(self): rs = xp.apply(lambda x: x['a'], axis=1) assert_frame_equal(xp, rs) + def test_apply_with_reduce_empty(self): # reduce with an empty DataFrame x = [] - result = self.empty.apply(x.append, axis=1, reduce=False) + result = self.empty.apply(x.append, axis=1, result_type='expand') assert_frame_equal(result, self.empty) - result = self.empty.apply(x.append, axis=1, reduce=True) + result = self.empty.apply(x.append, axis=1, result_type='reduce') assert_series_equal(result, Series( [], index=pd.Index([], dtype=object))) empty_with_cols = DataFrame(columns=['a', 'b', 'c']) - result = empty_with_cols.apply(x.append, axis=1, reduce=False) + result = empty_with_cols.apply(x.append, axis=1, result_type='expand') assert_frame_equal(result, empty_with_cols) - result = empty_with_cols.apply(x.append, axis=1, reduce=True) + result = empty_with_cols.apply(x.append, axis=1, result_type='reduce') assert_series_equal(result, Series( [], index=pd.Index([], dtype=object))) # Ensure that x.append hasn't been called assert x == [] + def test_apply_deprecate_reduce(self): + with warnings.catch_warnings(record=True): + x = [] + self.empty.apply(x.append, axis=1, result_type='reduce') + def test_apply_standard_nonunique(self): df = DataFrame( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c']) @@ -121,17 +127,79 @@ def test_with_string_args(self): expected = getattr(self.frame, arg)(axis=1) tm.assert_series_equal(result, expected) + def test_apply_broadcast_deprecated(self): + with tm.assert_produces_warning(FutureWarning): + self.frame.apply(np.mean, broadcast=True) + def test_apply_broadcast(self): - broadcasted = self.frame.apply(np.mean, broadcast=True) - agged = self.frame.apply(np.mean) - for col, ts in compat.iteritems(broadcasted): - assert (ts == agged[col]).all() + # scalars + result = self.frame.apply(np.mean, result_type='broadcast') + expected = DataFrame([self.frame.mean()], index=self.frame.index) + tm.assert_frame_equal(result, expected) + + result = self.frame.apply(np.mean, axis=1, result_type='broadcast') + m = self.frame.mean(axis=1) + expected = DataFrame({c: m for c in self.frame.columns}) + tm.assert_frame_equal(result, expected) + + # lists + result = self.frame.apply( + lambda x: list(range(len(self.frame.columns))), + axis=1, + result_type='broadcast') + m = 
list(range(len(self.frame.columns))) + expected = DataFrame([m] * len(self.frame.index), + dtype='float64', + index=self.frame.index, + columns=self.frame.columns) + tm.assert_frame_equal(result, expected) - broadcasted = self.frame.apply(np.mean, axis=1, broadcast=True) - agged = self.frame.apply(np.mean, axis=1) - for idx in broadcasted.index: - assert (broadcasted.xs(idx) == agged[idx]).all() + result = self.frame.apply(lambda x: list(range(len(self.frame.index))), + result_type='broadcast') + m = list(range(len(self.frame.index))) + expected = DataFrame({c: m for c in self.frame.columns}, + dtype='float64', + index=self.frame.index) + tm.assert_frame_equal(result, expected) + + # preserve columns + df = DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1, + columns=list('ABC')) + result = df.apply(lambda x: [1, 2, 3], + axis=1, + result_type='broadcast') + tm.assert_frame_equal(result, df) + + df = DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1, + columns=list('ABC')) + result = df.apply(lambda x: Series([1, 2, 3], index=list('abc')), + axis=1, + result_type='broadcast') + expected = df.copy() + tm.assert_frame_equal(result, expected) + + def test_apply_broadcast_error(self): + df = DataFrame( + np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['A', 'B', 'C']) + + # > 1 ndim + with pytest.raises(ValueError): + df.apply(lambda x: np.array([1, 2]).reshape(-1, 2), + axis=1, + result_type='broadcast') + + # cannot broadcast + with pytest.raises(ValueError): + df.apply(lambda x: [1, 2], + axis=1, + result_type='broadcast') + + with pytest.raises(ValueError): + df.apply(lambda x: Series([1, 2]), + axis=1, + result_type='broadcast') def test_apply_raw(self): result0 = self.frame.apply(np.mean, raw=True) @@ -208,7 +276,7 @@ def _checkit(axis=0, raw=False): _check(no_index, lambda x: x) _check(no_index, lambda x: x.mean()) - result = no_cols.apply(lambda x: x.mean(), broadcast=True) + result = no_cols.apply(lambda x: x.mean(), result_type='broadcast') assert isinstance(result, DataFrame) def test_apply_with_args_kwds(self): @@ -350,33 +418,37 @@ def test_apply_attach_name(self): result = self.frame.apply(lambda x: np.repeat(x.name, len(x)), axis=1) - expected = DataFrame(np.tile(self.frame.index, - (len(self.frame.columns), 1)).T, - index=self.frame.index, - columns=self.frame.columns) - assert_frame_equal(result, expected) + expected = Series(np.repeat(t[0], len(self.frame.columns)) + for t in self.frame.itertuples()) + expected.index = self.frame.index + assert_series_equal(result, expected) def test_apply_multi_index(self): - s = DataFrame([[1, 2], [3, 4], [5, 6]]) - s.index = MultiIndex.from_arrays([['a', 'a', 'b'], ['c', 'd', 'd']]) - s.columns = ['col1', 'col2'] - res = s.apply(lambda x: Series({'min': min(x), 'max': max(x)}), 1) - assert isinstance(res.index, MultiIndex) + index = MultiIndex.from_arrays([['a', 'a', 'b'], ['c', 'd', 'd']]) + s = DataFrame([[1, 2], [3, 4], [5, 6]], + index=index, + columns=['col1', 'col2']) + result = s.apply( + lambda x: Series({'min': min(x), 'max': max(x)}), 1) + expected = DataFrame([[1, 2], [3, 4], [5, 6]], + index=index, + columns=['min', 'max']) + assert_frame_equal(result, expected, check_like=True) def test_apply_dict(self): # GH 8735 A = DataFrame([['foo', 'bar'], ['spam', 'eggs']]) - A_dicts = pd.Series([dict([(0, 'foo'), (1, 'spam')]), - dict([(0, 'bar'), (1, 'eggs')])]) + A_dicts = Series([dict([(0, 'foo'), (1, 'spam')]), + dict([(0, 'bar'), (1, 'eggs')])]) B = DataFrame([[0, 1], [2, 3]]) - B_dicts = 
pd.Series([dict([(0, 0), (1, 2)]), dict([(0, 1), (1, 3)])]) + B_dicts = Series([dict([(0, 0), (1, 2)]), dict([(0, 1), (1, 3)])]) fn = lambda x: x.to_dict() for df, dicts in [(A, A_dicts), (B, B_dicts)]: - reduce_true = df.apply(fn, reduce=True) - reduce_false = df.apply(fn, reduce=False) - reduce_none = df.apply(fn, reduce=None) + reduce_true = df.apply(fn, result_type='reduce') + reduce_false = df.apply(fn, result_type='expand') + reduce_none = df.apply(fn) assert_series_equal(reduce_true, dicts) assert_frame_equal(reduce_false, df) @@ -465,8 +537,8 @@ def test_frame_apply_dont_convert_datetime64(self): assert df.x1.dtype == 'M8[ns]' - # See gh-12244 def test_apply_non_numpy_dtype(self): + # See gh-12244 df = DataFrame({'dt': pd.date_range( "2015-01-01", periods=3, tz='Europe/Brussels')}) result = df.apply(lambda x: x) @@ -482,6 +554,256 @@ def test_apply_non_numpy_dtype(self): assert_frame_equal(result, df) +class TestInferOutputShape(object): + # the user has supplied an opaque UDF where + # they are transforming the input that requires + # us to infer the output + + def test_infer_row_shape(self): + # gh-17437 + # if row shape is changing, infer it + df = pd.DataFrame(np.random.rand(10, 2)) + result = df.apply(np.fft.fft, axis=0) + assert result.shape == (10, 2) + + result = df.apply(np.fft.rfft, axis=0) + assert result.shape == (6, 2) + + def test_with_dictlike_columns(self): + # gh 17602 + df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + result = df.apply(lambda x: {'s': x['a'] + x['b']}, + axis=1) + expected = Series([{'s': 3} for t in df.itertuples()]) + assert_series_equal(result, expected) + + df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'), + pd.Timestamp('2017-05-02 00:00:00')] + result = df.apply(lambda x: {'s': x['a'] + x['b']}, + axis=1) + assert_series_equal(result, expected) + + # compose a series + result = (df['a'] + df['b']).apply(lambda x: {'s': x}) + expected = Series([{'s': 3}, {'s': 3}]) + assert_series_equal(result, expected) + + # gh-18775 + df = DataFrame() + df["author"] = ["X", "Y", "Z"] + df["publisher"] = ["BBC", "NBC", "N24"] + df["date"] = pd.to_datetime(['17-10-2010 07:15:30', + '13-05-2011 08:20:35', + '15-01-2013 09:09:09']) + result = df.apply(lambda x: {}, axis=1) + expected = Series([{}, {}, {}]) + assert_series_equal(result, expected) + + def test_with_dictlike_columns_with_infer(self): + # gh 17602 + df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + result = df.apply(lambda x: {'s': x['a'] + x['b']}, + axis=1, result_type='expand') + expected = DataFrame({'s': [3, 3]}) + assert_frame_equal(result, expected) + + df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'), + pd.Timestamp('2017-05-02 00:00:00')] + result = df.apply(lambda x: {'s': x['a'] + x['b']}, + axis=1, result_type='expand') + assert_frame_equal(result, expected) + + def test_with_listlike_columns(self): + # gh-17348 + df = DataFrame({'a': Series(np.random.randn(4)), + 'b': ['a', 'list', 'of', 'words'], + 'ts': date_range('2016-10-01', periods=4, freq='H')}) + + result = df[['a', 'b']].apply(tuple, axis=1) + expected = Series([t[1:] for t in df[['a', 'b']].itertuples()]) + assert_series_equal(result, expected) + + result = df[['a', 'ts']].apply(tuple, axis=1) + expected = Series([t[1:] for t in df[['a', 'ts']].itertuples()]) + assert_series_equal(result, expected) + + # gh-18919 + df = DataFrame({'x': Series([['a', 'b'], ['q']]), + 'y': Series([['z'], ['q', 't']])}) + df.index = MultiIndex.from_tuples([('i0', 'j0'), ('i1', 'j1')]) + + result = df.apply( + lambda row: [el for el in 
row['x'] if el in row['y']], + axis=1) + expected = Series([[], ['q']], index=df.index) + assert_series_equal(result, expected) + + def test_infer_output_shape_columns(self): + # gh-18573 + + df = DataFrame({'number': [1., 2.], + 'string': ['foo', 'bar'], + 'datetime': [pd.Timestamp('2017-11-29 03:30:00'), + pd.Timestamp('2017-11-29 03:45:00')]}) + result = df.apply(lambda row: (row.number, row.string), axis=1) + expected = Series([t[2:] for t in df.itertuples()]) + assert_series_equal(result, expected) + + def test_infer_output_shape_listlike_columns(self): + # gh-16353 + + df = DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C']) + + result = df.apply(lambda x: [1, 2, 3], axis=1) + expected = Series([[1, 2, 3] for t in df.itertuples()]) + assert_series_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1) + expected = Series([[1, 2] for t in df.itertuples()]) + assert_series_equal(result, expected) + + # gh-17970 + df = DataFrame({"a": [1, 2, 3]}, index=list('abc')) + + result = df.apply(lambda row: np.ones(1), axis=1) + expected = Series([np.ones(1) for t in df.itertuples()], + index=df.index) + assert_series_equal(result, expected) + + result = df.apply(lambda row: np.ones(2), axis=1) + expected = Series([np.ones(2) for t in df.itertuples()], + index=df.index) + assert_series_equal(result, expected) + + # gh-17892 + df = pd.DataFrame({'a': [pd.Timestamp('2010-02-01'), + pd.Timestamp('2010-02-04'), + pd.Timestamp('2010-02-05'), + pd.Timestamp('2010-02-06')], + 'b': [9, 5, 4, 3], + 'c': [5, 3, 4, 2], + 'd': [1, 2, 3, 4]}) + + def fun(x): + return (1, 2) + + result = df.apply(fun, axis=1) + expected = Series([(1, 2) for t in df.itertuples()]) + assert_series_equal(result, expected) + + def test_consistent_coerce_for_shapes(self): + # we want column names to NOT be propagated + # just because the shape matches the input shape + df = DataFrame(np.random.randn(4, 3), columns=['A', 'B', 'C']) + + result = df.apply(lambda x: [1, 2, 3], axis=1) + expected = Series([[1, 2, 3] for t in df.itertuples()]) + assert_series_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1) + expected = Series([[1, 2] for t in df.itertuples()]) + assert_series_equal(result, expected) + + def test_consistent_names(self): + # if a Series is returned, we should use the resulting index names + df = DataFrame( + np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['A', 'B', 'C']) + + result = df.apply(lambda x: Series([1, 2, 3], + index=['test', 'other', 'cols']), + axis=1) + expected = DataFrame( + np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['test', 'other', 'cols']) + assert_frame_equal(result, expected) + + result = df.apply( + lambda x: pd.Series([1, 2], index=['test', 'other']), axis=1) + expected = DataFrame( + np.tile(np.arange(2, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['test', 'other']) + assert_frame_equal(result, expected) + + def test_result_type(self): + # result_type should be consistent no matter which + # path we take in the code + df = DataFrame( + np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['A', 'B', 'C']) + + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type='expand') + expected = df.copy() + expected.columns = [0, 1, 2] + assert_frame_equal(result, expected) + + result = df.apply(lambda x: [1, 2], axis=1, result_type='expand') + expected = df[['A', 'B']].copy() + expected.columns = [0, 1] + assert_frame_equal(result, expected) + + # broadcast result + result = 
df.apply(lambda x: [1, 2, 3], axis=1, result_type='broadcast') + expected = df.copy() + assert_frame_equal(result, expected) + + columns = ['other', 'col', 'names'] + result = df.apply( + lambda x: pd.Series([1, 2, 3], + index=columns), + axis=1, + result_type='broadcast') + expected = df.copy() + assert_frame_equal(result, expected) + + # series result + result = df.apply(lambda x: Series([1, 2, 3], index=x.index), axis=1) + expected = df.copy() + assert_frame_equal(result, expected) + + # series result with other index + columns = ['other', 'col', 'names'] + result = df.apply( + lambda x: pd.Series([1, 2, 3], index=columns), + axis=1) + expected = df.copy() + expected.columns = columns + assert_frame_equal(result, expected) + + @pytest.mark.parametrize("result_type", ['foo', 1]) + def test_result_type_error(self, result_type): + # allowed result_type + df = DataFrame( + np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['A', 'B', 'C']) + + with pytest.raises(ValueError): + df.apply(lambda x: [1, 2, 3], + axis=1, + result_type=result_type) + + @pytest.mark.parametrize( + "box", + [lambda x: list(x), + lambda x: tuple(x), + lambda x: np.array(x, dtype='int64')], + ids=['list', 'tuple', 'array']) + def test_consistency_for_boxed(self, box): + # passing an array or list should not affect the output shape + df = DataFrame( + np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, + columns=['A', 'B', 'C']) + + result = df.apply(lambda x: box([1, 2]), axis=1) + expected = Series([box([1, 2]) for t in df.itertuples()]) + assert_series_equal(result, expected) + + result = df.apply(lambda x: box([1, 2]), axis=1, result_type='expand') + expected = DataFrame( + np.tile(np.arange(2, dtype='int64'), 6).reshape(6, -1) + 1) + assert_frame_equal(result, expected) + + def zip_frames(*frames): """ take a list of frames, zip the columns together for each @@ -657,13 +979,13 @@ def test_non_callable_aggregates(self): # Function aggregate result = df.agg({'A': 'count'}) - expected = pd.Series({'A': 2}) + expected = Series({'A': 2}) assert_series_equal(result, expected) # Non-function aggregate result = df.agg({'A': 'size'}) - expected = pd.Series({'A': 3}) + expected = Series({'A': 3}) assert_series_equal(result, expected) diff --git a/pandas/tests/sparse/frame/test_apply.py b/pandas/tests/sparse/frame/test_apply.py new file mode 100644 index 0000000000000..07e4b1bf7c913 --- /dev/null +++ b/pandas/tests/sparse/frame/test_apply.py @@ -0,0 +1,92 @@ +import pytest +import numpy as np +from pandas import SparseDataFrame, DataFrame, Series, bdate_range +from pandas.core import nanops +from pandas.util import testing as tm + + +@pytest.fixture +def dates(): + return bdate_range('1/1/2011', periods=10) + + +@pytest.fixture +def empty(): + return SparseDataFrame() + + +@pytest.fixture +def frame(dates): + data = {'A': [np.nan, np.nan, np.nan, 0, 1, 2, 3, 4, 5, 6], + 'B': [0, 1, 2, np.nan, np.nan, np.nan, 3, 4, 5, 6], + 'C': np.arange(10, dtype=np.float64), + 'D': [0, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan]} + + return SparseDataFrame(data, index=dates) + + +@pytest.fixture +def fill_frame(frame): + values = frame.values.copy() + values[np.isnan(values)] = 2 + + return SparseDataFrame(values, columns=['A', 'B', 'C', 'D'], + default_fill_value=2, + index=frame.index) + + +def test_apply(frame): + applied = frame.apply(np.sqrt) + assert isinstance(applied, SparseDataFrame) + tm.assert_almost_equal(applied.values, np.sqrt(frame.values)) + + # agg / broadcast + with 
tm.assert_produces_warning(FutureWarning): + broadcasted = frame.apply(np.sum, broadcast=True) + assert isinstance(broadcasted, SparseDataFrame) + + with tm.assert_produces_warning(FutureWarning): + exp = frame.to_dense().apply(np.sum, broadcast=True) + tm.assert_frame_equal(broadcasted.to_dense(), exp) + + applied = frame.apply(np.sum) + tm.assert_series_equal(applied, + frame.to_dense().apply(nanops.nansum)) + + +def test_apply_fill(fill_frame): + applied = fill_frame.apply(np.sqrt) + assert applied['A'].fill_value == np.sqrt(2) + + +def test_apply_empty(empty): + assert empty.apply(np.sqrt) is empty + + +def test_apply_nonuq(): + orig = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=['a', 'a', 'c']) + sparse = orig.to_sparse() + res = sparse.apply(lambda s: s[0], axis=1) + exp = orig.apply(lambda s: s[0], axis=1) + + # dtype must be kept + assert res.dtype == np.int64 + + # ToDo: apply must return subclassed dtype + assert isinstance(res, Series) + tm.assert_series_equal(res.to_dense(), exp) + + # df.T breaks + sparse = orig.T.to_sparse() + res = sparse.apply(lambda s: s[0], axis=0) # noqa + exp = orig.T.apply(lambda s: s[0], axis=0) + + # TODO: no non-unique columns supported in sparse yet + # tm.assert_series_equal(res.to_dense(), exp) + + +def test_applymap(frame): + # just test that it works + result = frame.applymap(lambda x: x * 2) + assert isinstance(result, SparseDataFrame) diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 54f567bcd2a8c..29fad3c8eefaf 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -621,52 +621,6 @@ def test_append(self): tm.assert_sp_frame_equal(appended.iloc[:, :3], self.frame.iloc[:, :3], exact_indices=False) - def test_apply(self): - applied = self.frame.apply(np.sqrt) - assert isinstance(applied, SparseDataFrame) - tm.assert_almost_equal(applied.values, np.sqrt(self.frame.values)) - - applied = self.fill_frame.apply(np.sqrt) - assert applied['A'].fill_value == np.sqrt(2) - - # agg / broadcast - broadcasted = self.frame.apply(np.sum, broadcast=True) - assert isinstance(broadcasted, SparseDataFrame) - - exp = self.frame.to_dense().apply(np.sum, broadcast=True) - tm.assert_frame_equal(broadcasted.to_dense(), exp) - - assert self.empty.apply(np.sqrt) is self.empty - - from pandas.core import nanops - applied = self.frame.apply(np.sum) - tm.assert_series_equal(applied, - self.frame.to_dense().apply(nanops.nansum)) - - def test_apply_nonuq(self): - orig = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=['a', 'a', 'c']) - sparse = orig.to_sparse() - res = sparse.apply(lambda s: s[0], axis=1) - exp = orig.apply(lambda s: s[0], axis=1) - # dtype must be kept - assert res.dtype == np.int64 - # ToDo: apply must return subclassed dtype - assert isinstance(res, pd.Series) - tm.assert_series_equal(res.to_dense(), exp) - - # df.T breaks - sparse = orig.T.to_sparse() - res = sparse.apply(lambda s: s[0], axis=0) # noqa - exp = orig.T.apply(lambda s: s[0], axis=0) - # TODO: no non-unique columns supported in sparse yet - # tm.assert_series_equal(res.to_dense(), exp) - - def test_applymap(self): - # just test that it works - result = self.frame.applymap(lambda x: x * 2) - assert isinstance(result, SparseDataFrame) - def test_astype(self): sparse = pd.SparseDataFrame({'A': SparseArray([1, 2, 3, 4], dtype=np.int64), From 50528421ae79b27a26b32ba715b17271c8dfda7e Mon Sep 17 00:00:00 2001 From: cbertinato Date: Wed, 7 Feb 2018 10:25:38 -0500 Subject: [PATCH 066/214] 
BUG: Fixes rounding error in Timestamp.floor() (#19240) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/_libs/tslibs/timestamps.pyx | 60 +++++++++++++------ pandas/core/indexes/datetimelike.py | 17 +----- .../indexes/datetimes/test_scalar_compat.py | 21 +++++++ .../tests/scalar/timestamp/test_unary_ops.py | 25 +++++++- 5 files changed, 90 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 1c6b698605521..a7300f7d1ceb0 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -620,6 +620,7 @@ Datetimelike - Bug in :func:`~DataFrame.pct_change` using ``periods`` and ``freq`` returned different length outputs (:issue:`7292`) - Bug in comparison of :class:`DatetimeIndex` against ``None`` or ``datetime.date`` objects raising ``TypeError`` for ``==`` and ``!=`` comparisons instead of all-``False`` and all-``True``, respectively (:issue:`19301`) - Bug in :class:`Timestamp` and :func:`to_datetime` where a string representing a barely out-of-bounds timestamp would be incorrectly rounded down instead of raising ``OutOfBoundsDatetime`` (:issue:`19382`) +- Bug in :func:`Timestamp.floor` :func:`DatetimeIndex.floor` where time stamps far in the future and past were not rounded correctly (:issue:`19206`) - Timezones diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 47179a4e1d761..ed77916a1d887 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -58,6 +58,46 @@ cdef inline object create_timestamp_from_ts(int64_t value, return ts_base +def round_ns(values, rounder, freq): + """ + Applies rounding function at given frequency + + Parameters + ---------- + values : int, :obj:`ndarray` + rounder : function + freq : str, obj + + Returns + ------- + int or :obj:`ndarray` + """ + from pandas.tseries.frequencies import to_offset + unit = to_offset(freq).nanos + if unit < 1000: + # for nano rounding, work with the last 6 digits separately + # due to float precision + buff = 1000000 + r = (buff * (values // buff) + unit * + (rounder((values % buff) * (1 / float(unit)))).astype('i8')) + else: + if unit % 1000 != 0: + msg = 'Precision will be lost using frequency: {}' + warnings.warn(msg.format(freq)) + + # GH19206 + # to deal with round-off when unit is large + if unit >= 1e9: + divisor = 10 ** int(np.log10(unit / 1e7)) + else: + divisor = 10 + + r = (unit * rounder((values * (divisor / float(unit))) / divisor) + .astype('i8')) + + return r + + # This is PITA. Because we inherit from datetime, which has very specific # construction requirements, we need to do object instantiation in python # (see Timestamp class above). 
This will serve as a C extension type that @@ -581,28 +621,12 @@ class Timestamp(_Timestamp): return create_timestamp_from_ts(ts.value, ts.dts, ts.tzinfo, freq) def _round(self, freq, rounder): - - cdef: - int64_t unit, r, value, buff = 1000000 - object result - - from pandas.tseries.frequencies import to_offset - unit = to_offset(freq).nanos if self.tz is not None: value = self.tz_localize(None).value else: value = self.value - if unit < 1000 and unit % 1000 != 0: - # for nano rounding, work with the last 6 digits separately - # due to float precision - r = (buff * (value // buff) + unit * - (rounder((value % buff) / float(unit))).astype('i8')) - elif unit >= 1000 and unit % 1000 != 0: - msg = 'Precision will be lost using frequency: {}' - warnings.warn(msg.format(freq)) - r = (unit * rounder(value / float(unit)).astype('i8')) - else: - r = (unit * rounder(value / float(unit)).astype('i8')) + + r = round_ns(value, rounder, freq) result = Timestamp(r, unit='ns') if self.tz is not None: result = result.tz_localize(self.tz) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 8e77c7a7fa48c..4a526955d9bf4 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -36,6 +36,7 @@ from pandas._libs import lib, iNaT, NaT from pandas._libs.tslibs.period import Period from pandas._libs.tslibs.timedeltas import delta_to_nanoseconds +from pandas._libs.tslibs.timestamps import round_ns from pandas.core.indexes.base import Index, _index_shared_docs from pandas.util._decorators import Appender, cache_readonly @@ -90,23 +91,9 @@ class TimelikeOps(object): """) def _round(self, freq, rounder): - - from pandas.tseries.frequencies import to_offset - unit = to_offset(freq).nanos # round the local times values = _ensure_datetimelike_to_i8(self) - if unit < 1000 and unit % 1000 != 0: - # for nano rounding, work with the last 6 digits separately - # due to float precision - buff = 1000000 - result = (buff * (values // buff) + unit * - (rounder((values % buff) / float(unit))).astype('i8')) - elif unit >= 1000 and unit % 1000 != 0: - msg = 'Precision will be lost using frequency: {}' - warnings.warn(msg.format(freq)) - result = (unit * rounder(values / float(unit)).astype('i8')) - else: - result = (unit * rounder(values / float(unit)).astype('i8')) + result = round_ns(values, rounder, freq) result = self._maybe_mask_results(result, fill_value=NaT) attribs = self._get_attributes_dict() diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index 111f68ba14775..83e7a0cd68d63 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -126,6 +126,27 @@ def test_round(self, tz): ts = '2016-10-17 12:00:00.001501031' DatetimeIndex([ts]).round('1010ns') + @pytest.mark.parametrize('test_input, rounder, freq, expected', [ + (['2117-01-01 00:00:45'], 'floor', '15s', ['2117-01-01 00:00:45']), + (['2117-01-01 00:00:45'], 'ceil', '15s', ['2117-01-01 00:00:45']), + (['2117-01-01 00:00:45.000000012'], 'floor', '10ns', + ['2117-01-01 00:00:45.000000010']), + (['1823-01-01 00:00:01.000000012'], 'ceil', '10ns', + ['1823-01-01 00:00:01.000000020']), + (['1823-01-01 00:00:01'], 'floor', '1s', ['1823-01-01 00:00:01']), + (['1823-01-01 00:00:01'], 'ceil', '1s', ['1823-01-01 00:00:01']), + (('NaT', '1823-01-01 00:00:01'), 'floor', '1s', + ('NaT', '1823-01-01 00:00:01')), + (('NaT', '1823-01-01 00:00:01'), 'ceil', '1s', + ('NaT', 
'1823-01-01 00:00:01')) + ]) + def test_ceil_floor_edge(self, tz, test_input, rounder, freq, expected): + dt = DatetimeIndex(list(test_input)) + func = getattr(dt, rounder) + result = func(freq) + expected = DatetimeIndex(list(expected)) + assert expected.equals(result) + # ---------------------------------------------------------------- # DatetimeIndex.normalize diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index 70c7308dd3991..8a6989c909cb2 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -10,7 +10,7 @@ from pandas.compat import PY3 from pandas._libs.tslibs.frequencies import _INVALID_FREQ_ERROR -from pandas import Timestamp +from pandas import Timestamp, NaT class TestTimestampUnaryOps(object): @@ -93,6 +93,29 @@ def test_round_frequencies(self, freq, expected): result = stamp.round(freq=freq) assert result == expected + @pytest.mark.parametrize('test_input, rounder, freq, expected', [ + ('2117-01-01 00:00:45', 'floor', '15s', '2117-01-01 00:00:45'), + ('2117-01-01 00:00:45', 'ceil', '15s', '2117-01-01 00:00:45'), + ('2117-01-01 00:00:45.000000012', 'floor', '10ns', + '2117-01-01 00:00:45.000000010'), + ('1823-01-01 00:00:01.000000012', 'ceil', '10ns', + '1823-01-01 00:00:01.000000020'), + ('1823-01-01 00:00:01', 'floor', '1s', '1823-01-01 00:00:01'), + ('1823-01-01 00:00:01', 'ceil', '1s', '1823-01-01 00:00:01'), + ('NaT', 'floor', '1s', 'NaT'), + ('NaT', 'ceil', '1s', 'NaT') + ]) + def test_ceil_floor_edge(self, test_input, rounder, freq, expected): + dt = Timestamp(test_input) + func = getattr(dt, rounder) + result = func(freq) + + if dt is NaT: + assert result is NaT + else: + expected = Timestamp(expected) + assert result == expected + def test_ceil(self): dt = Timestamp('20130101 09:10:11') result = dt.ceil('D') From d0c13a3b768fb1b5c511d66a19761e2b43f3e4ed Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 8 Feb 2018 01:57:19 +0100 Subject: [PATCH 067/214] DOC: some clean-up of the apply docs (follow-up #18577) (#19573) --- doc/source/basics.rst | 16 ++++++++------- doc/source/whatsnew/v0.23.0.txt | 23 +++++++++++---------- pandas/core/frame.py | 36 ++++++++++++++++++--------------- pandas/core/sparse/frame.py | 18 ++++++++++------- 4 files changed, 52 insertions(+), 41 deletions(-) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index fb9e5a6cc75cb..749d4be11ad45 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -774,9 +774,9 @@ We encourage you to view the source code of :meth:`~DataFrame.pipe`. Row or Column-wise Function Application ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Arbitrary functions can be applied along the axes of a DataFrame or Panel +Arbitrary functions can be applied along the axes of a DataFrame using the :meth:`~DataFrame.apply` method, which, like the descriptive -statistics methods, take an optional ``axis`` argument: +statistics methods, takes an optional ``axis`` argument: .. ipython:: python @@ -794,13 +794,15 @@ The :meth:`~DataFrame.apply` method will also dispatch on a string method name. df.apply('mean', axis=1) The return type of the function passed to :meth:`~DataFrame.apply` affects the -type of the ultimate output from DataFrame.apply +type of the final output from ``DataFrame.apply`` for the default behaviour: -* If the applied function returns a ``Series``, the ultimate output is a ``DataFrame``. 
+* If the applied function returns a ``Series``, the final output is a ``DataFrame``. The columns match the index of the ``Series`` returned by the applied function. -* If the applied function returns any other type, the ultimate output is a ``Series``. -* A ``result_type`` kwarg is accepted with the options: ``reduce``, ``broadcast``, and ``expand``. - These will determine how list-likes return results expand (or not) to a ``DataFrame``. +* If the applied function returns any other type, the final output is a ``Series``. + +This default behaviour can be overridden using the ``result_type``, which +accepts three options: ``reduce``, ``broadcast``, and ``expand``. +These will determine how list-likes return values expand (or not) to a ``DataFrame``. :meth:`~DataFrame.apply` combined with some cleverness can be used to answer many questions about a data set. For example, suppose we wanted to extract the date where the diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index a7300f7d1ceb0..7782e5f1ffa56 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -334,20 +334,20 @@ Convert to an xarray DataArray .. _whatsnew_0230.api_breaking.apply: -Apply Changes -~~~~~~~~~~~~~ +Changes to make output of ``DataFrame.apply`` consistent +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :func:`DataFrame.apply` was inconsistent when applying an arbitrary user-defined-function that returned a list-like with ``axis=1``. Several bugs and inconsistencies are resolved. If the applied function returns a Series, then pandas will return a DataFrame; otherwise a Series will be returned, this includes the case -where a list-like (e.g. ``tuple`` or ``list`` is returned), (:issue:`16353`, :issue:`17437`, :issue:`17970`, :issue:`17348`, :issue:`17892`, :issue:`18573`, -:issue:`17602`, :issue:`18775`, :issue:`18901`, :issue:`18919`) +where a list-like (e.g. ``tuple`` or ``list`` is returned) (:issue:`16353`, :issue:`17437`, :issue:`17970`, :issue:`17348`, :issue:`17892`, :issue:`18573`, +:issue:`17602`, :issue:`18775`, :issue:`18901`, :issue:`18919`). .. ipython:: python df = pd.DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1, columns=['A', 'B', 'C']) df -Previous Behavior. If the returned shape happened to match the original columns, this would return a ``DataFrame``. +Previous Behavior: if the returned shape happened to match the length of original columns, this would return a ``DataFrame``. If the return shape did not match, a ``Series`` with lists was returned. .. code-block:: python @@ -373,7 +373,7 @@ If the return shape did not match, a ``Series`` with lists was returned. dtype: object -New Behavior. The behavior is consistent. These will *always* return a ``Series``. +New Behavior: When the applied function returns a list-like, this will now *always* return a ``Series``. .. ipython:: python @@ -386,8 +386,9 @@ To have expanded columns, you can use ``result_type='expand'`` df.apply(lambda x: [1, 2, 3], axis=1, result_type='expand') -To have broadcast the result across, you can use ``result_type='broadcast'``. The shape -must match the original columns. +To broadcast the result across the original columns (the old behaviour for +list-likes of the correct length), you can use ``result_type='broadcast'``. +The shape must match the original columns. .. ipython:: python @@ -397,7 +398,7 @@ Returning a ``Series`` allows one to control the exact return structure and colu .. 
mn names. .. 
ipython:: python - df.apply(lambda x: Series([1, 2, 3], index=x.index), axis=1) + df.apply(lambda x: Series([1, 2, 3], index=['D', 'E', 'F']]), axis=1) .. _whatsnew_0230.api_breaking.build_changes: @@ -523,8 +524,8 @@ Deprecations - The ``is_copy`` attribute is deprecated and will be removed in a future version (:issue:`18801`). - ``IntervalIndex.from_intervals`` is deprecated in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) - :func:``DataFrame.from_items`` is deprecated. Use :func:``DataFrame.from_dict()`` instead, or :func:``DataFrame.from_dict(OrderedDict())`` if you wish to preserve the key order (:issue:`17320`) -- The ``broadcast`` parameter of ``.apply()`` is removed in favor of ``result_type='broadcast'`` (:issue:`18577`) -- The ``reduce`` parameter of ``.apply()`` is removed in favor of ``result_type='reduce'`` (:issue:`18577`) +- The ``broadcast`` parameter of ``.apply()`` is deprecated in favor of ``result_type='broadcast'`` (:issue:`18577`) +- The ``reduce`` parameter of ``.apply()`` is deprecated in favor of ``result_type='reduce'`` (:issue:`18577`) .. _whatsnew_0230.prior_deprecations: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9487f51919108..28923f0fbf240 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4822,12 +4822,12 @@ def aggregate(self, func, axis=0, *args, **kwargs): def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, result_type=None, args=(), **kwds): - """Applies function along input axis of DataFrame. + """Applies function along an axis of the DataFrame. Objects passed to functions are Series objects having index either the DataFrame's index (axis=0) or the columns (axis=1). - Return type depends on whether passed function aggregates, or the - reduce argument if the DataFrame is empty. + Final return type depends on the return type of the applied function, + or on the `result_type` argument. Parameters ---------- @@ -4863,15 +4863,18 @@ def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, by result_type='reduce'. result_type : {'expand', 'reduce', 'broadcast, None} - These only act when axis=1 {columns} + These only act when axis=1 {columns}: + * 'expand' : list-like results will be turned into columns. * 'reduce' : return a Series if possible rather than expanding list-like results. This is the opposite to 'expand'. * 'broadcast' : results will be broadcast to the original shape of the frame, the original index & columns will be retained. - * None : list-like results will be returned as a list - in a single column. However if the apply function - returns a Series these are expanded to columns. + + The default behaviour (None) depends on the return value of the + applied function: list-like results will be returned as a Series + of those. However if the apply function returns a Series these + are expanded to columns. .. versionadded:: 0.23.0 @@ -4893,8 +4896,8 @@ def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, We use this DataFrame to illustrate - >>> df = DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1, - ... columns=['A', 'B', 'C']) + >>> df = pd.DataFrame(np.tile(np.arange(3), 6).reshape(6, -1) + 1, + ... 
columns=['A', 'B', 'C']) >>> df A B C 0 1 2 3 @@ -4904,7 +4907,8 @@ def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, 4 1 2 3 5 1 2 3 - Using a ufunc + Using a numpy universal function (in this case the same as + ``np.sqrt(df)``): >>> df.apply(np.sqrt) A B C @@ -4954,8 +4958,8 @@ def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, 4 1 2 5 1 2 - Return a Series inside the function is similar to passing - Passing result_type='expand'. The resulting column names + Returning a Series inside the function is similar to passing + ``result_type='expand'``. The resulting column names will be the Series index. >>> df.apply(lambda x: Series([1, 2], index=['foo', 'bar']), axis=1) @@ -4967,10 +4971,10 @@ def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, 4 1 2 5 1 2 - - Passing result_type='broadcast' will take a same shape - result, whether list-like or scalar and broadcast it - along the axis. The resulting column names will be the originals. + Passing ``result_type='broadcast'`` will ensure the same shape + result, whether list-like or scalar is returned by the function, + and broadcast it along the axis. The resulting column names will + be the originals. >>> df.apply(lambda x: [1, 2, 3], axis=1, result_type='broadcast') A B C diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 371377ce2899c..19b126216db81 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -861,14 +861,18 @@ def apply(self, func, axis=0, broadcast=None, reduce=None, by result_type='reduce'. result_type : {'expand', 'reduce', 'broadcast, None} - These only act when axis=1 {columns} - * 'expand' : list-like results will be turned into columns + These only act when axis=1 {columns}: + + * 'expand' : list-like results will be turned into columns. * 'reduce' : return a Series if possible rather than expanding - list-like results. This is the opposite to 'expand' - * 'broadcast' : scalar results will be broadcast to all columns - * None : list-like results will be returned as a list - in a single column. However if the apply function - returns a Series these are expanded to columns. + list-like results. This is the opposite to 'expand'. + * 'broadcast' : results will be broadcast to the original shape + of the frame, the original index & columns will be retained. + + The default behaviour (None) depends on the return value of the + applied function: list-like results will be returned as a Series + of those. However if the apply function returns a Series these + are expanded to columns. .. versionadded:: 0.23.0 From 31973f573ffd60eafeb56b4f20f33e8aa223bbd6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 7 Feb 2018 17:09:48 -0800 Subject: [PATCH 068/214] Remove duplicated logic from period_helper (#19540) --- pandas/_libs/src/period_helper.c | 519 +++++------------------------- pandas/_libs/src/period_helper.h | 29 +- pandas/_libs/tslibs/ccalendar.pyx | 3 +- pandas/_libs/tslibs/period.pyx | 178 +++++++--- 4 files changed, 216 insertions(+), 513 deletions(-) diff --git a/pandas/_libs/src/period_helper.c b/pandas/_libs/src/period_helper.c index 8f1c527a68455..570f20b790750 100644 --- a/pandas/_libs/src/period_helper.c +++ b/pandas/_libs/src/period_helper.c @@ -14,6 +14,7 @@ See end of file for stuff pandas uses (search for 'pandas'). 
*/ #include "period_helper.h" +#include "../datetime/np_datetime.h" /* ------------------------------------------------------------------ * Code derived from scikits.timeseries @@ -37,193 +38,39 @@ static int floordiv(int x, int divisor) { } } -/* Table with day offsets for each month (0-based, without and with leap) */ -static int month_offset[2][13] = { - {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365}, - {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366}}; - -/* Table of number of days in a month (0-based, without and with leap) */ -static int days_in_month[2][12] = { - {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, - {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}}; - -/* Return 1/0 iff year points to a leap year. - * Assumes GREGORIAN_CALENDAR */ -static int dInfoCalc_Leapyear(npy_int64 year) { - return (year % 4 == 0) && ((year % 100 != 0) || (year % 400 == 0)); -} - -/* Return the day of the week for the given absolute date. */ -static int dInfoCalc_DayOfWeek(npy_int64 absdate) { - int day_of_week; - - if (absdate >= 1) { - day_of_week = (absdate - 1) % 7; - } else { - day_of_week = 6 - ((-absdate) % 7); - } - return day_of_week; -} static int monthToQuarter(int month) { return ((month - 1) / 3) + 1; } -/* Return the year offset, that is the absolute date of the day - 31.12.(year-1) - Assumes GREGORIAN_CALENDAR - - This is equivalent to: - - (datetime(year, 1, 1) - datetime(1970, 1, 1)).days - - Note: - For the Julian calendar we shift the absdate (which is measured - using the Gregorian Epoch) value by two days because the Epoch - (0001-01-01) in the Julian calendar lies 2 days before the Epoch in - the Gregorian calendar. */ -static int dInfoCalc_YearOffset(npy_int64 year) { - year--; - if (year >= 0 || -1 / 4 == -1) - return year * 365 + year / 4 - year / 100 + year / 400; - else - return year * 365 + (year - 3) / 4 - (year - 99) / 100 + - (year - 399) / 400; -} - -/* Set the instance's value using the given date and time. +/* Find the absdate (days elapsed since datetime(1, 1, 1) + * for the given year/month/day. * Assumes GREGORIAN_CALENDAR */ -static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, int year, - int month, int day, int hour, - int minute, double second) { +static npy_int64 dInfoCalc_SetFromDateAndTime(int year, int month, int day) { /* Calculate the absolute date */ - { - int leap; - npy_int64 absdate; - int yearoffset; - - /* Range check */ - Py_AssertWithArg(year > -(INT_MAX / 366) && year < (INT_MAX / 366), - PyExc_ValueError, "year out of range: %i", year); - - /* Is it a leap year ? 
*/ - leap = dInfoCalc_Leapyear(year); + pandas_datetimestruct dts; + npy_int64 unix_date; - /* Negative month values indicate months relative to the years end */ - if (month < 0) month += 13; - Py_AssertWithArg(month >= 1 && month <= 12, PyExc_ValueError, - "month out of range (1-12): %i", month); - - /* Negative values indicate days relative to the months end */ - if (day < 0) day += days_in_month[leap][month - 1] + 1; - Py_AssertWithArg(day >= 1 && day <= days_in_month[leap][month - 1], - PyExc_ValueError, "day out of range: %i", day); - - yearoffset = dInfoCalc_YearOffset(year); - if (yearoffset == INT_ERR_CODE) goto onError; - - absdate = day + month_offset[leap][month - 1] + yearoffset; - - dinfo->absdate = absdate; - - dinfo->year = year; - dinfo->month = month; - dinfo->quarter = ((month - 1) / 3) + 1; - dinfo->day = day; - - dinfo->day_of_week = dInfoCalc_DayOfWeek(absdate); - dinfo->day_of_year = (short)(absdate - yearoffset); - } - - /* Calculate the absolute time */ - { - Py_AssertWithArg(hour >= 0 && hour <= 23, PyExc_ValueError, - "hour out of range (0-23): %i", hour); - Py_AssertWithArg(minute >= 0 && minute <= 59, PyExc_ValueError, - "minute out of range (0-59): %i", minute); - Py_AssertWithArg( - second >= (double)0.0 && - (second < (double)60.0 || - (hour == 23 && minute == 59 && second < (double)61.0)), - PyExc_ValueError, - "second out of range (0.0 - <60.0; <61.0 for 23:59): %f", second); - - dinfo->abstime = (double)(hour * 3600 + minute * 60) + second; - - dinfo->hour = hour; - dinfo->minute = minute; - dinfo->second = second; - } - return 0; - -onError: - return INT_ERR_CODE; + memset(&dts, 0, sizeof(pandas_datetimestruct)); + dts.year = year; + dts.month = month; + dts.day = day; + unix_date = pandas_datetimestruct_to_datetime(PANDAS_FR_D, &dts); + return ORD_OFFSET + unix_date; } /* Sets the date part of the date_info struct - Assumes GREGORIAN_CALENDAR - - XXX This could also be done using some integer arithmetics rather - than with this iterative approach... 
*/ + Assumes GREGORIAN_CALENDAR */ static int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, npy_int64 absdate) { - register npy_int64 year; - npy_int64 yearoffset; - int leap, dayoffset; - int *monthoffset; - - /* Approximate year */ - year = (npy_int64)(((double)absdate) / 365.2425); - - if (absdate > 0) year++; - - /* Apply corrections to reach the correct year */ - while (1) { - /* Calculate the year offset */ - yearoffset = dInfoCalc_YearOffset(year); - if (yearoffset == INT_ERR_CODE) goto onError; - - /* Backward correction: absdate must be greater than the - yearoffset */ - if (yearoffset >= absdate) { - year--; - continue; - } + pandas_datetimestruct dts; - dayoffset = absdate - yearoffset; - leap = dInfoCalc_Leapyear(year); + pandas_datetime_to_datetimestruct(absdate - ORD_OFFSET, PANDAS_FR_D, &dts); + dinfo->year = dts.year; + dinfo->month = dts.month; + dinfo->day = dts.day; - /* Forward correction: non leap years only have 365 days */ - if (dayoffset > 365 && !leap) { - year++; - continue; - } - break; - } - - dinfo->year = year; - - /* Now iterate to find the month */ - monthoffset = month_offset[leap]; - { - register int month; - - for (month = 1; month < 13; month++) { - if (monthoffset[month] >= dayoffset) break; - } - - dinfo->month = month; - dinfo->quarter = monthToQuarter(month); - dinfo->day = dayoffset - month_offset[leap][month - 1]; - } - - dinfo->day_of_week = dInfoCalc_DayOfWeek(absdate); - dinfo->day_of_year = dayoffset; dinfo->absdate = absdate; - return 0; - -onError: - return INT_ERR_CODE; } /////////////////////////////////////////////// @@ -358,9 +205,6 @@ PANDAS_INLINE npy_int64 transform_via_day(npy_int64 ordinal, char relation, asfreq_info *af_info, freq_conv_func first_func, freq_conv_func second_func) { - // printf("transform_via_day(%ld, %ld, %d)\n", ordinal, - // af_info->intraday_conversion_factor, - // af_info->intraday_conversion_upsample); npy_int64 result; result = (*first_func)(ordinal, relation, af_info); @@ -373,28 +217,26 @@ static npy_int64 DtoB_weekday(npy_int64 absdate) { return (((absdate) / 7) * 5) + (absdate) % 7 - BDAY_OFFSET; } -static npy_int64 DtoB_WeekendToMonday(npy_int64 absdate, int day_of_week) { - if (day_of_week > 4) { - // change to Monday after weekend - absdate += (7 - day_of_week); - } - return DtoB_weekday(absdate); -} +static npy_int64 DtoB(struct date_info *dinfo, int roll_back) { + int day_of_week = dayofweek(dinfo->year, dinfo->month, dinfo->day); + npy_int64 absdate = dinfo->absdate; -static npy_int64 DtoB_WeekendToFriday(npy_int64 absdate, int day_of_week) { - if (day_of_week > 4) { - // change to friday before weekend - absdate -= (day_of_week - 4); + if (roll_back == 1) { + if (day_of_week > 4) { + // change to friday before weekend + absdate -= (day_of_week - 4); + } + } else { + if (day_of_week > 4) { + // change to Monday after weekend + absdate += (7 - day_of_week); + } } return DtoB_weekday(absdate); } static npy_int64 absdate_from_ymd(int y, int m, int d) { - struct date_info tempDate; - if (dInfoCalc_SetFromDateAndTime(&tempDate, y, m, d, 0, 0, 0)) { - return INT_ERR_CODE; - } - return tempDate.absdate; + return dInfoCalc_SetFromDateAndTime(y, m, d); } //************ FROM DAILY *************** @@ -403,8 +245,7 @@ static npy_int64 asfreq_DTtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) { struct date_info dinfo; ordinal = downsample_daytime(ordinal, af_info, 0); - if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET)) - return INT_ERR_CODE; + dInfoCalc_SetFromAbsDate(&dinfo, 
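# A rough Python sketch of the ordinal bookkeeping used above (assuming
# ORD_OFFSET is the number of days from 0001-01-01 to the Unix epoch, the
# same day-1 convention as datetime.date.toordinal()):
from datetime import date

ORD_OFFSET = date(1970, 1, 1).toordinal()           # 719163
unix_days = (date(2018, 2, 8) - date(1970, 1, 1)).days
absdate = ORD_OFFSET + unix_days                    # days since 0001-01-01, 1-based
assert absdate == date(2018, 2, 8).toordinal()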
ordinal + ORD_OFFSET); if (dinfo.month > af_info->to_a_year_end) { return (npy_int64)(dinfo.year + 1 - BASE_YEAR); } else { @@ -415,8 +256,7 @@ static npy_int64 asfreq_DTtoA(npy_int64 ordinal, char relation, static npy_int64 DtoQ_yq(npy_int64 ordinal, asfreq_info *af_info, int *year, int *quarter) { struct date_info dinfo; - if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET)) - return INT_ERR_CODE; + dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET); if (af_info->to_q_year_end != 12) { dinfo.month -= af_info->to_q_year_end; if (dinfo.month <= 0) { @@ -424,11 +264,10 @@ static npy_int64 DtoQ_yq(npy_int64 ordinal, asfreq_info *af_info, int *year, } else { dinfo.year += 1; } - dinfo.quarter = monthToQuarter(dinfo.month); } *year = dinfo.year; - *quarter = dinfo.quarter; + *quarter = monthToQuarter(dinfo.month); return 0; } @@ -439,10 +278,7 @@ static npy_int64 asfreq_DTtoQ(npy_int64 ordinal, char relation, ordinal = downsample_daytime(ordinal, af_info, 0); - if (DtoQ_yq(ordinal, af_info, &year, &quarter) == INT_ERR_CODE) { - return INT_ERR_CODE; - } - + DtoQ_yq(ordinal, af_info, &year, &quarter); return (npy_int64)((year - BASE_YEAR) * 4 + quarter - 1); } @@ -452,8 +288,7 @@ static npy_int64 asfreq_DTtoM(npy_int64 ordinal, char relation, ordinal = downsample_daytime(ordinal, af_info, 0); - if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET)) - return INT_ERR_CODE; + dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET); return (npy_int64)((dinfo.year - BASE_YEAR) * 12 + dinfo.month - 1); } @@ -467,17 +302,15 @@ static npy_int64 asfreq_DTtoW(npy_int64 ordinal, char relation, static npy_int64 asfreq_DTtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { struct date_info dinfo; + int roll_back; ordinal = downsample_daytime(ordinal, af_info, 0); - if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET)) - return INT_ERR_CODE; + dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET); - if (relation == 'S') { - return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); - } else { - return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); - } + // This usage defines roll_back the opposite way from the others + roll_back = (relation == 'S') ? 1 : 0; + return DtoB(&dinfo, roll_back); } // all intra day calculations are now done within one function @@ -570,15 +403,12 @@ static npy_int64 asfreq_WtoW(npy_int64 ordinal, char relation, static npy_int64 asfreq_WtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { struct date_info dinfo; - if (dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_WtoDT(ordinal, relation, af_info) + ORD_OFFSET)) - return INT_ERR_CODE; + int roll_back; + dInfoCalc_SetFromAbsDate( + &dinfo, asfreq_WtoDT(ordinal, relation, af_info) + ORD_OFFSET); - if (relation == 'S') { - return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); - } else { - return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); - } + roll_back = (relation == 'S') ? 
0 : 1; + return DtoB(&dinfo, roll_back); } //************ FROM MONTHLY *************** @@ -596,8 +426,7 @@ static npy_int64 asfreq_MtoDT(npy_int64 ordinal, char relation, ordinal += 1; } MtoD_ym(ordinal, &y, &m); - if ((absdate = absdate_from_ymd(y, m, 1)) == INT_ERR_CODE) - return INT_ERR_CODE; + absdate = absdate_from_ymd(y, m, 1); ordinal = absdate - ORD_OFFSET; if (relation == 'E') { @@ -628,16 +457,13 @@ static npy_int64 asfreq_MtoW(npy_int64 ordinal, char relation, static npy_int64 asfreq_MtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { struct date_info dinfo; + int roll_back; - if (dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_MtoDT(ordinal, relation, af_info) + ORD_OFFSET)) - return INT_ERR_CODE; + dInfoCalc_SetFromAbsDate( + &dinfo, asfreq_MtoDT(ordinal, relation, af_info) + ORD_OFFSET); - if (relation == 'S') { - return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); - } else { - return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); - } + roll_back = (relation == 'S') ? 0 : 1; + return DtoB(&dinfo, roll_back); } //************ FROM QUARTERLY *************** @@ -667,8 +493,7 @@ static npy_int64 asfreq_QtoDT(npy_int64 ordinal, char relation, QtoD_ym(ordinal, &y, &m, af_info); - if ((absdate = absdate_from_ymd(y, m, 1)) == INT_ERR_CODE) - return INT_ERR_CODE; + absdate = absdate_from_ymd(y, m, 1); if (relation == 'E') { absdate -= 1; @@ -704,15 +529,12 @@ static npy_int64 asfreq_QtoW(npy_int64 ordinal, char relation, static npy_int64 asfreq_QtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { struct date_info dinfo; - if (dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_QtoDT(ordinal, relation, af_info) + ORD_OFFSET)) - return INT_ERR_CODE; + int roll_back; + dInfoCalc_SetFromAbsDate( + &dinfo, asfreq_QtoDT(ordinal, relation, af_info) + ORD_OFFSET); - if (relation == 'S') { - return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); - } else { - return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); - } + roll_back = (relation == 'S') ? 0 : 1; + return DtoB(&dinfo, roll_back); } //************ FROM ANNUAL *************** @@ -737,10 +559,6 @@ static npy_int64 asfreq_AtoDT(npy_int64 year, char relation, absdate = absdate_from_ymd(year, month, 1); - if (absdate == INT_ERR_CODE) { - return INT_ERR_CODE; - } - if (relation == 'E') { absdate -= 1; } @@ -775,15 +593,12 @@ static npy_int64 asfreq_AtoW(npy_int64 ordinal, char relation, static npy_int64 asfreq_AtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { struct date_info dinfo; - if (dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_AtoDT(ordinal, relation, af_info) + ORD_OFFSET)) - return INT_ERR_CODE; + int roll_back; + dInfoCalc_SetFromAbsDate( + &dinfo, asfreq_AtoDT(ordinal, relation, af_info) + ORD_OFFSET); - if (relation == 'S') { - return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); - } else { - return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); - } + roll_back = (relation == 'S') ? 
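# A minimal Python sketch of the weekend handling shared by the *toB
# conversions above: roll_back == 1 moves a weekend date back to the
# preceding Friday, roll_back == 0 moves it forward to the following Monday,
# and the business ordinal then counts five days per week (the BDAY_OFFSET
# normalisation is left out of this sketch).
def to_business_ordinal(absdate, roll_back):
    day_of_week = (absdate - 1) % 7          # assumes absdate 1 is a Monday
    if day_of_week > 4:                      # Saturday (5) or Sunday (6)
        if roll_back:
            absdate -= day_of_week - 4       # back to Friday
        else:
            absdate += 7 - day_of_week       # forward to Monday
    return (absdate // 7) * 5 + absdate % 7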
0 : 1; + return DtoB(&dinfo, roll_back); } static npy_int64 nofunc(npy_int64 ordinal, char relation, @@ -815,10 +630,6 @@ void get_asfreq_info(int fromFreq, int toFreq, asfreq_info *af_info) { get_freq_group_index(max_value(fromGroup, FR_DAY)), get_freq_group_index(max_value(toGroup, FR_DAY))); - // printf("get_asfreq_info(%d, %d) %ld, %d\n", fromFreq, toFreq, - // af_info->intraday_conversion_factor, - // af_info->intraday_conversion_upsample); - switch (fromGroup) { case FR_WK: af_info->from_week_end = calc_week_end(fromFreq, fromGroup); @@ -1014,8 +825,6 @@ freq_conv_func get_asfreq_func(int fromFreq, int toFreq) { } double get_abs_time(int freq, npy_int64 date_ordinal, npy_int64 ordinal) { - // printf("get_abs_time %d %lld %lld\n", freq, date_ordinal, ordinal); - int freq_index, day_index, base_index; npy_int64 per_day, start_ord; double unit, result; @@ -1028,23 +837,15 @@ double get_abs_time(int freq, npy_int64 date_ordinal, npy_int64 ordinal) { day_index = get_freq_group_index(FR_DAY); base_index = get_freq_group_index(FR_SEC); - // printf(" indices: day %d, freq %d, base %d\n", day_index, freq_index, - // base_index); - per_day = get_daytime_conversion_factor(day_index, freq_index); unit = get_daytime_conversion_factor(freq_index, base_index); - // printf(" per_day: %lld, unit: %f\n", per_day, unit); - if (base_index < freq_index) { unit = 1 / unit; - // printf(" corrected unit: %f\n", unit); } start_ord = date_ordinal * per_day; - // printf("start_ord: %lld\n", start_ord); result = (double)(unit * (ordinal - start_ord)); - // printf(" result: %f\n", result); return result; } @@ -1062,9 +863,6 @@ static int dInfoCalc_SetFromAbsTime(struct date_info *dinfo, double abstime) { dinfo->hour = hour; dinfo->minute = minute; dinfo->second = second; - - dinfo->abstime = abstime; - return 0; } @@ -1073,19 +871,16 @@ static int dInfoCalc_SetFromAbsTime(struct date_info *dinfo, double abstime) { static int dInfoCalc_SetFromAbsDateTime(struct date_info *dinfo, npy_int64 absdate, double abstime) { /* Bounds check */ - Py_AssertWithArg(abstime >= 0.0 && abstime <= SECONDS_PER_DAY, - PyExc_ValueError, - "abstime out of range (0.0 - 86400.0): %f", abstime); + // The calling function is responsible for ensuring that + // abstime >= 0.0 && abstime <= 86400 /* Calculate the date */ - if (dInfoCalc_SetFromAbsDate(dinfo, absdate)) goto onError; + dInfoCalc_SetFromAbsDate(dinfo, absdate); /* Calculate the time */ - if (dInfoCalc_SetFromAbsTime(dinfo, abstime)) goto onError; + dInfoCalc_SetFromAbsTime(dinfo, abstime); return 0; -onError: - return INT_ERR_CODE; } /* ------------------------------------------------------------------ @@ -1102,19 +897,8 @@ npy_int64 asfreq(npy_int64 period_ordinal, int freq1, int freq2, get_asfreq_info(freq1, freq2, &finfo); - // printf("\n%x %d %d %ld %ld\n", func, freq1, freq2, - // finfo.intraday_conversion_factor, -finfo.intraday_conversion_factor); - val = (*func)(period_ordinal, relation, &finfo); - - if (val == INT_ERR_CODE) { - // Py_Error(PyExc_ValueError, "Unable to convert to desired - // frequency."); - goto onError; - } return val; -onError: - return INT_ERR_CODE; } /* generate an ordinal in period space */ @@ -1155,9 +939,7 @@ npy_int64 get_period_ordinal(int year, int month, int day, int hour, int minute, } if (freq == FR_HR) { - if ((absdays = absdate_from_ymd(year, month, day)) == INT_ERR_CODE) { - goto onError; - } + absdays = absdate_from_ymd(year, month, day); delta = (absdays - ORD_OFFSET); return (npy_int64)(delta * 24 + hour); } @@ -1171,9 +953,7 @@ 
npy_int64 get_period_ordinal(int year, int month, int day, int hour, int minute, } if (freq == FR_BUS) { - if ((days = absdate_from_ymd(year, month, day)) == INT_ERR_CODE) { - goto onError; - } + days = absdate_from_ymd(year, month, day); // calculate the current week assuming sunday as last day of a week weeks = (days - BASE_WEEK_TO_DAY_OFFSET) / DAYS_PER_WEEK; // calculate the current weekday (in range 1 .. 7) @@ -1187,10 +967,7 @@ npy_int64 get_period_ordinal(int year, int month, int day, int hour, int minute, } if (freq_group == FR_WK) { - if ((ordinal = (npy_int64)absdate_from_ymd(year, month, day)) == - INT_ERR_CODE) { - goto onError; - } + ordinal = (npy_int64)absdate_from_ymd(year, month, day); day_adj = freq - FR_WK; return (ordinal - (1 + day_adj)) / 7 + 1 - WEEK_OFFSET; } @@ -1246,32 +1023,6 @@ npy_int64 get_python_ordinal(npy_int64 period_ordinal, int freq) { } -// function to generate a nice string representation of the period -// object, originally from DateObject_strftime - -char *c_strftime(struct date_info *tmp, char *fmt) { - struct tm c_date; - char *result; - struct date_info dinfo = *tmp; - int result_len = strlen(fmt) + 50; - - c_date.tm_sec = (int)dinfo.second; - c_date.tm_min = dinfo.minute; - c_date.tm_hour = dinfo.hour; - c_date.tm_mday = dinfo.day; - c_date.tm_mon = dinfo.month - 1; - c_date.tm_year = dinfo.year - 1900; - c_date.tm_wday = (dinfo.day_of_week + 1) % 7; - c_date.tm_yday = dinfo.day_of_year - 1; - c_date.tm_isdst = -1; - - result = malloc(result_len * sizeof(char)); - - strftime(result, result_len, fmt, &c_date); - - return result; -} - int get_yq(npy_int64 ordinal, int freq, int *quarter, int *year) { asfreq_info af_info; int qtr_freq; @@ -1290,12 +1041,11 @@ int get_yq(npy_int64 ordinal, int freq, int *quarter, int *year) { } get_asfreq_info(FR_DAY, qtr_freq, &af_info); - if (DtoQ_yq(daily_ord, &af_info, year, quarter) == INT_ERR_CODE) return -1; - + DtoQ_yq(daily_ord, &af_info, year, quarter); return 0; } -static int _quarter_year(npy_int64 ordinal, int freq, int *year, int *quarter) { +int _quarter_year(npy_int64 ordinal, int freq, int *year, int *quarter) { asfreq_info af_info; int qtr_freq; @@ -1308,37 +1058,13 @@ static int _quarter_year(npy_int64 ordinal, int freq, int *year, int *quarter) { get_asfreq_info(FR_DAY, qtr_freq, &af_info); - if (DtoQ_yq(ordinal, &af_info, year, quarter) == INT_ERR_CODE) - return INT_ERR_CODE; + DtoQ_yq(ordinal, &af_info, year, quarter); if ((qtr_freq % 1000) > 12) *year -= 1; return 0; } -static int _ISOWeek(struct date_info *dinfo) { - int week; - - /* Estimate */ - week = (dinfo->day_of_year - 1) - dinfo->day_of_week + 3; - if (week >= 0) week = week / 7 + 1; - - /* Verify */ - if (week < 0) { - /* The day lies in last week of the previous year */ - if ((week > -2) || (week == -2 && dInfoCalc_Leapyear(dinfo->year - 1))) - week = 53; - else - week = 52; - } else if (week == 53) { - /* Check if the week belongs to year or year+1 */ - if (31 - dinfo->day + dinfo->day_of_week < 3) { - week = 1; - } - } - - return week; -} int get_date_info(npy_int64 ordinal, int freq, struct date_info *dinfo) { npy_int64 absdate = get_python_ordinal(ordinal, freq); @@ -1353,101 +1079,6 @@ int get_date_info(npy_int64 ordinal, int freq, struct date_info *dinfo) { absdate += 1; } - if (dInfoCalc_SetFromAbsDateTime(dinfo, absdate, abstime)) - return INT_ERR_CODE; - + dInfoCalc_SetFromAbsDateTime(dinfo, absdate, abstime); return 0; } - -int pyear(npy_int64 ordinal, int freq) { - struct date_info dinfo; - get_date_info(ordinal, freq, 
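# The removed _ISOWeek helper computes the ISO week number; the same value
# is available from the standard library, which gives a quick cross-check
# for the replacement (a sketch using only the standard library):
from datetime import date

assert date(2018, 1, 1).isocalendar()[1] == 1    # a Monday, first ISO week of 2018
assert date(2016, 1, 1).isocalendar()[1] == 53   # a Friday, still ISO week 53 of 2015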
&dinfo); - return dinfo.year; -} - -int pqyear(npy_int64 ordinal, int freq) { - int year, quarter; - if (_quarter_year(ordinal, freq, &year, &quarter) == INT_ERR_CODE) - return INT_ERR_CODE; - return year; -} - -int pquarter(npy_int64 ordinal, int freq) { - int year, quarter; - if (_quarter_year(ordinal, freq, &year, &quarter) == INT_ERR_CODE) - return INT_ERR_CODE; - return quarter; -} - -int pmonth(npy_int64 ordinal, int freq) { - struct date_info dinfo; - if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) - return INT_ERR_CODE; - return dinfo.month; -} - -int pday(npy_int64 ordinal, int freq) { - struct date_info dinfo; - if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) - return INT_ERR_CODE; - return dinfo.day; -} - -int pweekday(npy_int64 ordinal, int freq) { - struct date_info dinfo; - if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) - return INT_ERR_CODE; - return dinfo.day_of_week; -} - -int pday_of_week(npy_int64 ordinal, int freq) { - struct date_info dinfo; - if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) - return INT_ERR_CODE; - return dinfo.day_of_week; -} - -int pday_of_year(npy_int64 ordinal, int freq) { - struct date_info dinfo; - if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) - return INT_ERR_CODE; - return dinfo.day_of_year; -} - -int pweek(npy_int64 ordinal, int freq) { - struct date_info dinfo; - if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) - return INT_ERR_CODE; - return _ISOWeek(&dinfo); -} - -int phour(npy_int64 ordinal, int freq) { - struct date_info dinfo; - if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) - return INT_ERR_CODE; - return dinfo.hour; -} - -int pminute(npy_int64 ordinal, int freq) { - struct date_info dinfo; - if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) - return INT_ERR_CODE; - return dinfo.minute; -} - -int psecond(npy_int64 ordinal, int freq) { - struct date_info dinfo; - if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) - return INT_ERR_CODE; - return (int)dinfo.second; -} - -int pdays_in_month(npy_int64 ordinal, int freq) { - int days; - struct date_info dinfo; - if (get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) - return INT_ERR_CODE; - - days = days_in_month[dInfoCalc_Leapyear(dinfo.year)][dinfo.month - 1]; - return days; -} diff --git a/pandas/_libs/src/period_helper.h b/pandas/_libs/src/period_helper.h index d3d32f81d1f66..2c74659346b15 100644 --- a/pandas/_libs/src/period_helper.h +++ b/pandas/_libs/src/period_helper.h @@ -24,15 +24,6 @@ frequency conversion routines. 
* declarations from period here */ -#define SECONDS_PER_DAY ((double)86400.0) - -#define Py_AssertWithArg(x, errortype, errorstr, a1) \ - { \ - if (!(x)) { \ - PyErr_Format(errortype, errorstr, a1); \ - goto onError; \ - } \ - } #define Py_Error(errortype, errorstr) \ { \ PyErr_SetString(errortype, errorstr); \ @@ -124,17 +115,13 @@ typedef struct asfreq_info { typedef struct date_info { npy_int64 absdate; - double abstime; double second; int minute; int hour; int day; int month; - int quarter; int year; - int day_of_week; - int day_of_year; } date_info; typedef npy_int64 (*freq_conv_func)(npy_int64, char, asfreq_info *); @@ -155,22 +142,8 @@ int get_date_info(npy_int64 ordinal, int freq, struct date_info *dinfo); freq_conv_func get_asfreq_func(int fromFreq, int toFreq); void get_asfreq_info(int fromFreq, int toFreq, asfreq_info *af_info); -int pyear(npy_int64 ordinal, int freq); -int pqyear(npy_int64 ordinal, int freq); -int pquarter(npy_int64 ordinal, int freq); -int pmonth(npy_int64 ordinal, int freq); -int pday(npy_int64 ordinal, int freq); -int pweekday(npy_int64 ordinal, int freq); -int pday_of_week(npy_int64 ordinal, int freq); -int pday_of_year(npy_int64 ordinal, int freq); -int pweek(npy_int64 ordinal, int freq); -int phour(npy_int64 ordinal, int freq); -int pminute(npy_int64 ordinal, int freq); -int psecond(npy_int64 ordinal, int freq); -int pdays_in_month(npy_int64 ordinal, int freq); - -char *c_strftime(struct date_info *dinfo, char *fmt); int get_yq(npy_int64 ordinal, int freq, int *quarter, int *year); +int _quarter_year(npy_int64 ordinal, int freq, int *year, int *quarter); void initialize_daytime_conversion_factor_matrix(void); diff --git a/pandas/_libs/tslibs/ccalendar.pyx b/pandas/_libs/tslibs/ccalendar.pyx index 613e111443636..9bd315b43ea9e 100644 --- a/pandas/_libs/tslibs/ccalendar.pyx +++ b/pandas/_libs/tslibs/ccalendar.pyx @@ -191,8 +191,7 @@ cpdef int32_t get_day_of_year(int year, int month, int day) nogil: cdef: bint isleap int32_t mo_off - int32_t doy, dow - int woy + int day_of_year isleap = is_leapyear(year) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index e82c9c613c62a..ba17b3d345ac8 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -11,7 +11,9 @@ from numpy cimport int64_t, import_array, ndarray import numpy as np import_array() -from libc.stdlib cimport free +from libc.stdlib cimport free, malloc +from libc.time cimport strftime, tm +from libc.string cimport strlen from pandas.compat import PY2 @@ -33,6 +35,8 @@ from timestamps import Timestamp from timezones cimport is_utc, is_tzlocal, get_utcoffset, get_dst_info from timedeltas cimport delta_to_nanoseconds +cimport ccalendar +from ccalendar cimport dayofweek, get_day_of_year from ccalendar import MONTH_NUMBERS from ccalendar cimport is_leapyear from frequencies cimport (get_freq_code, get_base_alias, @@ -49,17 +53,12 @@ from pandas.tseries import frequencies cdef extern from "period_helper.h": ctypedef struct date_info: - int64_t absdate - double abstime double second int minute int hour int day int month - int quarter int year - int day_of_week - int day_of_year ctypedef struct asfreq_info: int from_week_end @@ -85,28 +84,43 @@ cdef extern from "period_helper.h": int freq) nogil except INT32_MIN int get_date_info(int64_t ordinal, int freq, - date_info *dinfo) nogil except INT32_MIN - - int pyear(int64_t ordinal, int freq) except INT32_MIN - int pqyear(int64_t ordinal, int freq) except INT32_MIN - int pquarter(int64_t ordinal, int freq) 
except INT32_MIN - int pmonth(int64_t ordinal, int freq) except INT32_MIN - int pday(int64_t ordinal, int freq) except INT32_MIN - int pweekday(int64_t ordinal, int freq) except INT32_MIN - int pday_of_week(int64_t ordinal, int freq) except INT32_MIN - # TODO: pday_of_week and pweekday are identical. Make one an alias instead - # of importing them separately. - int pday_of_year(int64_t ordinal, int freq) except INT32_MIN - int pweek(int64_t ordinal, int freq) except INT32_MIN - int phour(int64_t ordinal, int freq) except INT32_MIN - int pminute(int64_t ordinal, int freq) except INT32_MIN - int psecond(int64_t ordinal, int freq) except INT32_MIN - int pdays_in_month(int64_t ordinal, int freq) except INT32_MIN - char *c_strftime(date_info *dinfo, char *fmt) + date_info *dinfo) nogil + int get_yq(int64_t ordinal, int freq, int *quarter, int *year) + int _quarter_year(int64_t ordinal, int freq, int *year, int *quarter) + initialize_daytime_conversion_factor_matrix() + +@cython.cdivision +cdef char* c_strftime(date_info *dinfo, char *fmt): + """ + function to generate a nice string representation of the period + object, originally from DateObject_strftime + """ + cdef: + tm c_date + char *result + int result_len = strlen(fmt) + 50 + + c_date.tm_sec = dinfo.second + c_date.tm_min = dinfo.minute + c_date.tm_hour = dinfo.hour + c_date.tm_mday = dinfo.day + c_date.tm_mon = dinfo.month - 1 + c_date.tm_year = dinfo.year - 1900 + c_date.tm_wday = (dayofweek(dinfo.year, dinfo.month, dinfo.day) + 1) % 7 + c_date.tm_yday = get_day_of_year(dinfo.year, dinfo.month, dinfo.day) - 1 + c_date.tm_isdst = -1 + + result = malloc(result_len * sizeof(char)) + + strftime(result, result_len, fmt, &c_date) + + return result + + # ---------------------------------------------------------------------- # Period logic @@ -367,19 +381,105 @@ cdef object _period_strftime(int64_t value, int freq, object fmt): return result + +# ---------------------------------------------------------------------- # period accessors ctypedef int (*accessor)(int64_t ordinal, int freq) except INT32_MIN +cdef int pyear(int64_t ordinal, int freq): + cdef: + date_info dinfo + get_date_info(ordinal, freq, &dinfo) + return dinfo.year + + +cdef int pqyear(int64_t ordinal, int freq): + cdef: + int year, quarter + _quarter_year(ordinal, freq, &year, &quarter) + return year + + +cdef int pquarter(int64_t ordinal, int freq): + cdef: + int year, quarter + _quarter_year(ordinal, freq, &year, &quarter) + return quarter + + +cdef int pmonth(int64_t ordinal, int freq): + cdef: + date_info dinfo + get_date_info(ordinal, freq, &dinfo) + return dinfo.month + + +cdef int pday(int64_t ordinal, int freq): + cdef: + date_info dinfo + get_date_info(ordinal, freq, &dinfo) + return dinfo.day + + +cdef int pweekday(int64_t ordinal, int freq): + cdef: + date_info dinfo + get_date_info(ordinal, freq, &dinfo) + return dayofweek(dinfo.year, dinfo.month, dinfo.day) + + +cdef int pday_of_year(int64_t ordinal, int freq): + cdef: + date_info dinfo + get_date_info(ordinal, freq, &dinfo) + return get_day_of_year(dinfo.year, dinfo.month, dinfo.day) + + +cdef int pweek(int64_t ordinal, int freq): + cdef: + date_info dinfo + get_date_info(ordinal, freq, &dinfo) + return ccalendar.get_week_of_year(dinfo.year, dinfo.month, dinfo.day) + + +cdef int phour(int64_t ordinal, int freq): + cdef: + date_info dinfo + get_date_info(ordinal, freq, &dinfo) + return dinfo.hour + + +cdef int pminute(int64_t ordinal, int freq): + cdef: + date_info dinfo + get_date_info(ordinal, freq, &dinfo) + 
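# These cdef accessors feed the user-facing Period/PeriodIndex fields; a
# small usage sketch of the values involved (2018-02-08 is a Thursday, so
# dayofweek == 3 and dayofyear == 39):
import pandas as pd

p = pd.Period('2018-02-08 15:30', freq='T')
print(p.year, p.quarter, p.month, p.day)          # 2018 1 2 8
print(p.dayofweek, p.dayofyear, p.daysinmonth)    # 3 39 28
print(p.hour, p.minute)                           # 15 30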
return dinfo.minute + + +cdef int psecond(int64_t ordinal, int freq): + cdef: + date_info dinfo + get_date_info(ordinal, freq, &dinfo) + return dinfo.second + + +cdef int pdays_in_month(int64_t ordinal, int freq): + cdef: + date_info dinfo + get_date_info(ordinal, freq, &dinfo) + return ccalendar.get_days_in_month(dinfo.year, dinfo.month) + + def get_period_field_arr(int code, ndarray[int64_t] arr, int freq): cdef: Py_ssize_t i, sz ndarray[int64_t] out accessor f - f = _get_accessor_func(code) - if f is NULL: + func = _get_accessor_func(code) + if func is NULL: raise ValueError('Unrecognized period code: %d' % code) sz = len(arr) @@ -389,36 +489,36 @@ def get_period_field_arr(int code, ndarray[int64_t] arr, int freq): if arr[i] == iNaT: out[i] = -1 continue - out[i] = f(arr[i], freq) + out[i] = func(arr[i], freq) return out cdef accessor _get_accessor_func(int code): if code == 0: - return &pyear + return pyear elif code == 1: - return &pqyear + return pqyear elif code == 2: - return &pquarter + return pquarter elif code == 3: - return &pmonth + return pmonth elif code == 4: - return &pday + return pday elif code == 5: - return &phour + return phour elif code == 6: - return &pminute + return pminute elif code == 7: - return &psecond + return psecond elif code == 8: - return &pweek + return pweek elif code == 9: - return &pday_of_year + return pday_of_year elif code == 10: - return &pweekday + return pweekday elif code == 11: - return &pdays_in_month + return pdays_in_month return NULL From affb5d91555a4d4e57e7ca3696ecc2187d3ddb32 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 7 Feb 2018 17:17:24 -0800 Subject: [PATCH 069/214] CI: Run ASV on Travis for failed benchmarks (#19236) --- .travis.yml | 8 +++++ asv_bench/benchmarks/algorithms.py | 4 ++- asv_bench/benchmarks/categoricals.py | 10 +++++-- asv_bench/benchmarks/frame_methods.py | 9 ++++-- asv_bench/benchmarks/gil.py | 43 +++++++++++++-------------- asv_bench/benchmarks/groupby.py | 6 ++-- asv_bench/benchmarks/indexing.py | 19 ++++++++---- asv_bench/benchmarks/io/hdf.py | 21 ++++++++----- asv_bench/benchmarks/join_merge.py | 23 +++++++++----- asv_bench/benchmarks/offset.py | 7 +++-- asv_bench/benchmarks/panel_ctor.py | 10 +++++-- asv_bench/benchmarks/panel_methods.py | 11 +++++-- asv_bench/benchmarks/reindex.py | 4 --- asv_bench/benchmarks/reshape.py | 4 +-- asv_bench/benchmarks/strings.py | 5 +++- asv_bench/benchmarks/timeseries.py | 6 ++-- ci/asv.sh | 35 ++++++++++++++++++++++ ci/requirements-3.6_ASV.build | 5 ++++ ci/requirements-3.6_ASV.run | 25 ++++++++++++++++ ci/requirements-3.6_ASV.sh | 7 +++++ ci/script_multi.sh | 3 ++ ci/script_single.sh | 3 ++ 22 files changed, 199 insertions(+), 69 deletions(-) create mode 100755 ci/asv.sh create mode 100644 ci/requirements-3.6_ASV.build create mode 100644 ci/requirements-3.6_ASV.run create mode 100755 ci/requirements-3.6_ASV.sh diff --git a/.travis.yml b/.travis.yml index bd5cac8955c8d..4cbe7f86bd2fa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -73,6 +73,10 @@ matrix: env: - JOB="3.6_NUMPY_DEV" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" # In allow_failures + - dist: trusty + env: + - JOB="3.6_ASV" ASV=true + # In allow_failures - dist: trusty env: - JOB="3.6_DOC" DOC=true @@ -93,6 +97,9 @@ matrix: - dist: trusty env: - JOB="3.6_NUMPY_DEV" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" + - dist: trusty + env: + - JOB="3.6_ASV" ASV=true - dist: trusty env: - JOB="3.6_DOC" DOC=true @@ -128,6 +135,7 @@ script: - 
ci/script_single.sh - ci/script_multi.sh - ci/lint.sh + - ci/asv.sh - echo "checking imports" - source activate pandas && python ci/check_imports.py - echo "script done" diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 45d62163ae80b..cccd38ef11251 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -1,3 +1,4 @@ +import warnings from importlib import import_module import numpy as np @@ -83,7 +84,8 @@ def setup(self): self.all = self.uniques.repeat(10) def time_match_string(self): - pd.match(self.all, self.uniques) + with warnings.catch_warnings(record=True): + pd.match(self.all, self.uniques) class Hashing(object): diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 1613ca1b97f4b..7743921003353 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,3 +1,5 @@ +import warnings + import numpy as np import pandas as pd import pandas.util.testing as tm @@ -119,11 +121,15 @@ def setup(self): self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str) self.s_str_cat = self.s_str.astype('category') - self.s_str_cat_ordered = self.s_str.astype('category', ordered=True) + with warnings.catch_warnings(record=True): + self.s_str_cat_ordered = self.s_str.astype('category', + ordered=True) self.s_int = pd.Series(np.random.randint(0, ncats, size=N)) self.s_int_cat = self.s_int.astype('category') - self.s_int_cat_ordered = self.s_int.astype('category', ordered=True) + with warnings.catch_warnings(record=True): + self.s_int_cat_ordered = self.s_int.astype('category', + ordered=True) def time_rank_string(self): self.s_str.rank() diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 4cecf12a27042..4ff71c706cd34 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -1,4 +1,6 @@ import string +import warnings + import numpy as np import pandas.util.testing as tm from pandas import (DataFrame, Series, MultiIndex, date_range, period_range, @@ -15,7 +17,8 @@ def setup(self): self.df = DataFrame(np.random.randn(10000, 25)) self.df['foo'] = 'bar' self.df['bar'] = 'baz' - self.df = self.df.consolidate() + with warnings.catch_warnings(record=True): + self.df = self.df.consolidate() def time_frame_get_numeric_data(self): self.df._get_numeric_data() @@ -141,8 +144,8 @@ class Repr(object): def setup(self): nrows = 10000 data = np.random.randn(nrows, 10) - idx = MultiIndex.from_arrays(np.tile(np.random.randn(3, nrows / 100), - 100)) + arrays = np.tile(np.random.randn(3, int(nrows / 100)), 100) + idx = MultiIndex.from_arrays(arrays) self.df3 = DataFrame(data, index=idx) self.df4 = DataFrame(data, index=np.random.randn(nrows)) self.df_tall = DataFrame(np.random.randn(nrows, 10)) diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 7d63d78084270..21c1ccf46e1c4 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -1,9 +1,13 @@ import numpy as np import pandas.util.testing as tm -from pandas import (DataFrame, Series, rolling_median, rolling_mean, - rolling_min, rolling_max, rolling_var, rolling_skew, - rolling_kurt, rolling_std, read_csv, factorize, date_range) +from pandas import DataFrame, Series, read_csv, factorize, date_range from pandas.core.algorithms import take_1d +try: + from pandas import (rolling_median, rolling_mean, rolling_min, rolling_max, + rolling_var, rolling_skew, rolling_kurt, rolling_std) + 
have_rolling_methods = True +except ImportError: + have_rolling_methods = False try: from pandas._libs import algos except ImportError: @@ -171,8 +175,7 @@ def run(period): class ParallelRolling(object): goal_time = 0.2 - params = ['rolling_median', 'rolling_mean', 'rolling_min', 'rolling_max', - 'rolling_var', 'rolling_skew', 'rolling_kurt', 'rolling_std'] + params = ['median', 'mean', 'min', 'max', 'var', 'skew', 'kurt', 'std'] param_names = ['method'] def setup(self, method): @@ -181,34 +184,28 @@ def setup(self, method): win = 100 arr = np.random.rand(100000) if hasattr(DataFrame, 'rolling'): - rolling = {'rolling_median': 'median', - 'rolling_mean': 'mean', - 'rolling_min': 'min', - 'rolling_max': 'max', - 'rolling_var': 'var', - 'rolling_skew': 'skew', - 'rolling_kurt': 'kurt', - 'rolling_std': 'std'} df = DataFrame(arr).rolling(win) @test_parallel(num_threads=2) def parallel_rolling(): - getattr(df, rolling[method])() + getattr(df, method)() self.parallel_rolling = parallel_rolling - else: - rolling = {'rolling_median': rolling_median, - 'rolling_mean': rolling_mean, - 'rolling_min': rolling_min, - 'rolling_max': rolling_max, - 'rolling_var': rolling_var, - 'rolling_skew': rolling_skew, - 'rolling_kurt': rolling_kurt, - 'rolling_std': rolling_std} + elif have_rolling_methods: + rolling = {'median': rolling_median, + 'mean': rolling_mean, + 'min': rolling_min, + 'max': rolling_max, + 'var': rolling_var, + 'skew': rolling_skew, + 'kurt': rolling_kurt, + 'std': rolling_std} @test_parallel(num_threads=2) def parallel_rolling(): rolling[method](arr, win) self.parallel_rolling = parallel_rolling + else: + raise NotImplementedError def time_rolling(self, method): self.parallel_rolling() diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 4dfd215e6dc3a..8aa67d8bc6a6a 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -1,3 +1,4 @@ +import warnings from string import ascii_letters from itertools import product from functools import partial @@ -340,7 +341,8 @@ def time_dt_size(self): self.df.groupby(['dates']).size() def time_dt_timegrouper_size(self): - self.df.groupby(TimeGrouper(key='dates', freq='M')).size() + with warnings.catch_warnings(record=True): + self.df.groupby(TimeGrouper(key='dates', freq='M')).size() def time_category_size(self): self.draws.groupby(self.cats).size() @@ -467,7 +469,7 @@ class SumMultiLevel(object): def setup(self): N = 50 - self.df = DataFrame({'A': range(N) * 2, + self.df = DataFrame({'A': list(range(N)) * 2, 'B': range(N * 2), 'C': 1}).set_index(['A', 'B']) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index b35f00db2b054..77e013e1e4fb0 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -1,3 +1,5 @@ +import warnings + import numpy as np import pandas.util.testing as tm from pandas import (Series, DataFrame, MultiIndex, Int64Index, Float64Index, @@ -91,7 +93,8 @@ def time_getitem_pos_slice(self, index): self.s[:80000] def time_get_value(self, index): - self.s.get_value(self.lbl) + with warnings.catch_warnings(record=True): + self.s.get_value(self.lbl) def time_getitem_scalar(self, index): self.s[self.lbl] @@ -112,7 +115,8 @@ def setup(self): self.bool_obj_indexer = self.bool_indexer.astype(object) def time_get_value(self): - self.df.get_value(self.idx_scalar, self.col_scalar) + with warnings.catch_warnings(record=True): + self.df.get_value(self.idx_scalar, self.col_scalar) def time_ix(self): self.df.ix[self.idx_scalar, 
self.col_scalar] @@ -231,11 +235,13 @@ class PanelIndexing(object): goal_time = 0.2 def setup(self): - self.p = Panel(np.random.randn(100, 100, 100)) - self.inds = range(0, 100, 10) + with warnings.catch_warnings(record=True): + self.p = Panel(np.random.randn(100, 100, 100)) + self.inds = range(0, 100, 10) def time_subset(self): - self.p.ix[(self.inds, self.inds, self.inds)] + with warnings.catch_warnings(record=True): + self.p.ix[(self.inds, self.inds, self.inds)] class MethodLookup(object): @@ -295,7 +301,8 @@ def setup(self): def time_insert(self): np.random.seed(1234) for i in range(100): - self.df.insert(0, i, np.random.randn(self.N)) + self.df.insert(0, i, np.random.randn(self.N), + allow_duplicates=True) def time_assign_with_setitem(self): np.random.seed(1234) diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py index 5c0e9586c1cb5..4b6e1d69af92d 100644 --- a/asv_bench/benchmarks/io/hdf.py +++ b/asv_bench/benchmarks/io/hdf.py @@ -1,3 +1,5 @@ +import warnings + import numpy as np from pandas import DataFrame, Panel, date_range, HDFStore, read_hdf import pandas.util.testing as tm @@ -105,22 +107,25 @@ class HDFStorePanel(BaseIO): def setup(self): self.fname = '__test__.h5' - self.p = Panel(np.random.randn(20, 1000, 25), - items=['Item%03d' % i for i in range(20)], - major_axis=date_range('1/1/2000', periods=1000), - minor_axis=['E%03d' % i for i in range(25)]) - self.store = HDFStore(self.fname) - self.store.append('p1', self.p) + with warnings.catch_warnings(record=True): + self.p = Panel(np.random.randn(20, 1000, 25), + items=['Item%03d' % i for i in range(20)], + major_axis=date_range('1/1/2000', periods=1000), + minor_axis=['E%03d' % i for i in range(25)]) + self.store = HDFStore(self.fname) + self.store.append('p1', self.p) def teardown(self): self.store.close() self.remove(self.fname) def time_read_store_table_panel(self): - self.store.select('p1') + with warnings.catch_warnings(record=True): + self.store.select('p1') def time_write_store_table_panel(self): - self.store.append('p2', self.p) + with warnings.catch_warnings(record=True): + self.store.append('p2', self.p) class HDF(BaseIO): diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 5b40a29d54683..de0a3b33da147 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -1,3 +1,4 @@ +import warnings import string import numpy as np @@ -26,7 +27,8 @@ def setup(self): self.mdf1['obj2'] = 'bar' self.mdf1['int1'] = 5 try: - self.mdf1.consolidate(inplace=True) + with warnings.catch_warnings(record=True): + self.mdf1.consolidate(inplace=True) except: pass self.mdf2 = self.mdf1.copy() @@ -75,16 +77,23 @@ class ConcatPanels(object): param_names = ['axis', 'ignore_index'] def setup(self, axis, ignore_index): - panel_c = Panel(np.zeros((10000, 200, 2), dtype=np.float32, order='C')) - self.panels_c = [panel_c] * 20 - panel_f = Panel(np.zeros((10000, 200, 2), dtype=np.float32, order='F')) - self.panels_f = [panel_f] * 20 + with warnings.catch_warnings(record=True): + panel_c = Panel(np.zeros((10000, 200, 2), + dtype=np.float32, + order='C')) + self.panels_c = [panel_c] * 20 + panel_f = Panel(np.zeros((10000, 200, 2), + dtype=np.float32, + order='F')) + self.panels_f = [panel_f] * 20 def time_c_ordered(self, axis, ignore_index): - concat(self.panels_c, axis=axis, ignore_index=ignore_index) + with warnings.catch_warnings(record=True): + concat(self.panels_c, axis=axis, ignore_index=ignore_index) def time_f_ordered(self, axis, ignore_index): - 
concat(self.panels_f, axis=axis, ignore_index=ignore_index) + with warnings.catch_warnings(record=True): + concat(self.panels_f, axis=axis, ignore_index=ignore_index) class ConcatDataFrames(object): diff --git a/asv_bench/benchmarks/offset.py b/asv_bench/benchmarks/offset.py index 034e861e7fc01..e161b887ee86f 100644 --- a/asv_bench/benchmarks/offset.py +++ b/asv_bench/benchmarks/offset.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import warnings from datetime import datetime import numpy as np @@ -76,7 +77,8 @@ def setup(self, offset): self.data = pd.Series(rng) def time_add_offset(self, offset): - self.data + offset + with warnings.catch_warnings(record=True): + self.data + offset class OffsetDatetimeIndexArithmetic(object): @@ -90,7 +92,8 @@ def setup(self, offset): self.data = pd.date_range(start='1/1/2000', periods=N, freq='T') def time_add_offset(self, offset): - self.data + offset + with warnings.catch_warnings(record=True): + self.data + offset class OffestDatetimeArithmetic(object): diff --git a/asv_bench/benchmarks/panel_ctor.py b/asv_bench/benchmarks/panel_ctor.py index 456fe959c5aa3..ce946c76ed199 100644 --- a/asv_bench/benchmarks/panel_ctor.py +++ b/asv_bench/benchmarks/panel_ctor.py @@ -1,3 +1,4 @@ +import warnings from datetime import datetime, timedelta from pandas import DataFrame, DatetimeIndex, date_range @@ -19,7 +20,8 @@ def setup(self): self.data_frames[x] = df def time_from_dict(self): - Panel.from_dict(self.data_frames) + with warnings.catch_warnings(record=True): + Panel.from_dict(self.data_frames) class SameIndexes(object): @@ -34,7 +36,8 @@ def setup(self): self.data_frames = dict(enumerate([df] * 100)) def time_from_dict(self): - Panel.from_dict(self.data_frames) + with warnings.catch_warnings(record=True): + Panel.from_dict(self.data_frames) class TwoIndexes(object): @@ -53,4 +56,5 @@ def setup(self): self.data_frames = dict(enumerate(dfs)) def time_from_dict(self): - Panel.from_dict(self.data_frames) + with warnings.catch_warnings(record=True): + Panel.from_dict(self.data_frames) diff --git a/asv_bench/benchmarks/panel_methods.py b/asv_bench/benchmarks/panel_methods.py index 9ee1949b311db..a5b1a92e9cf67 100644 --- a/asv_bench/benchmarks/panel_methods.py +++ b/asv_bench/benchmarks/panel_methods.py @@ -1,3 +1,5 @@ +import warnings + import numpy as np from .pandas_vb_common import Panel, setup # noqa @@ -10,10 +12,13 @@ class PanelMethods(object): param_names = ['axis'] def setup(self, axis): - self.panel = Panel(np.random.randn(100, 1000, 100)) + with warnings.catch_warnings(record=True): + self.panel = Panel(np.random.randn(100, 1000, 100)) def time_pct_change(self, axis): - self.panel.pct_change(1, axis=axis) + with warnings.catch_warnings(record=True): + self.panel.pct_change(1, axis=axis) def time_shift(self, axis): - self.panel.shift(1, axis=axis) + with warnings.catch_warnings(record=True): + self.panel.shift(1, axis=axis) diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index 69a1a604b1ccc..413427a16f40b 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -167,10 +167,6 @@ def setup(self): col_array2 = col_array.copy() col_array2[:, :10000] = np.nan self.col_array_list = list(col_array) - self.col_array_list2 = list(col_array2) def time_lib_fast_zip(self): lib.fast_zip(self.col_array_list) - - def time_lib_fast_zip_fillna(self): - lib.fast_zip_fillna(self.col_array_list2) diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index bd3b580d9d130..9044b080c45f9 100644 
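# The recurring idiom in these benchmark updates: keep timing code paths
# that now raise FutureWarning without letting the warning escape the run.
# A standalone sketch of the same pattern (the warning here is a stand-in
# for a deprecated pandas call):
import warnings

def time_deprecated_call():
    with warnings.catch_warnings(record=True):
        warnings.warn("deprecated API", FutureWarning)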
--- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -104,9 +104,9 @@ def setup(self): self.letters = list('ABCD') yrvars = [l + str(num) for l, num in product(self.letters, range(1, nyrs + 1))] - + columns = [str(i) for i in range(nidvars)] + yrvars self.df = DataFrame(np.random.randn(N, nidvars + len(yrvars)), - columns=list(range(nidvars)) + yrvars) + columns=columns) self.df['id'] = self.df.index def time_wide_to_long_big(self): diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 4435327e1eb38..b203c8b0fa5c9 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -1,3 +1,5 @@ +import warnings + import numpy as np from pandas import Series import pandas.util.testing as tm @@ -23,7 +25,8 @@ def time_endswith(self): self.s.str.endswith('A') def time_extract(self): - self.s.str.extract('(\\w*)A(\\w*)') + with warnings.catch_warnings(record=True): + self.s.str.extract('(\\w*)A(\\w*)') def time_findall(self): self.s.str.findall('[A-Z]+') diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index ea2f077f980d0..e1a6bc7a68e9d 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -1,3 +1,4 @@ +import warnings from datetime import timedelta import numpy as np @@ -74,7 +75,8 @@ def setup(self): freq='S')) def time_infer_dst(self): - self.index.tz_localize('US/Eastern', infer_dst=True) + with warnings.catch_warnings(record=True): + self.index.tz_localize('US/Eastern', infer_dst=True) class ResetIndex(object): @@ -365,7 +367,7 @@ class ToDatetimeCache(object): def setup(self, cache): N = 10000 - self.unique_numeric_seconds = range(N) + self.unique_numeric_seconds = list(range(N)) self.dup_numeric_seconds = [1000] * N self.dup_string_dates = ['2000-02-11'] * N self.dup_string_with_tz = ['2000-02-11 15:00:00-0800'] * N diff --git a/ci/asv.sh b/ci/asv.sh new file mode 100755 index 0000000000000..1e9a8d6380eb5 --- /dev/null +++ b/ci/asv.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +echo "inside $0" + +source activate pandas + +RET=0 + +if [ "$ASV" ]; then + echo "Check for failed asv benchmarks" + + cd asv_bench + + asv machine --yes + + time asv dev | tee failed_asv.txt + + echo "The following asvs benchmarks (if any) failed." + + cat failed_asv.txt | grep "failed" failed_asv.txt + + if [ $? = "0" ]; then + RET=1 + fi + + echo "DONE displaying failed asvs benchmarks." 
+ + rm failed_asv.txt + + echo "Check for failed asv benchmarks DONE" +else + echo "NOT checking for failed asv benchmarks" +fi + +exit $RET diff --git a/ci/requirements-3.6_ASV.build b/ci/requirements-3.6_ASV.build new file mode 100644 index 0000000000000..bc72eed2a0d4e --- /dev/null +++ b/ci/requirements-3.6_ASV.build @@ -0,0 +1,5 @@ +python=3.6* +python-dateutil +pytz +numpy=1.13* +cython diff --git a/ci/requirements-3.6_ASV.run b/ci/requirements-3.6_ASV.run new file mode 100644 index 0000000000000..6c45e3371e9cf --- /dev/null +++ b/ci/requirements-3.6_ASV.run @@ -0,0 +1,25 @@ +ipython +ipykernel +ipywidgets +sphinx=1.5* +nbconvert +nbformat +notebook +matplotlib +seaborn +scipy +lxml +beautifulsoup4 +html5lib +pytables +python-snappy +openpyxl +xlrd +xlwt +xlsxwriter +sqlalchemy +numexpr +bottleneck +statsmodels +xarray +pyqt diff --git a/ci/requirements-3.6_ASV.sh b/ci/requirements-3.6_ASV.sh new file mode 100755 index 0000000000000..8a46f85dbb6bc --- /dev/null +++ b/ci/requirements-3.6_ASV.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +source activate pandas + +echo "[install ASV_BUILD deps]" + +pip install git+https://github.com/spacetelescope/asv diff --git a/ci/script_multi.sh b/ci/script_multi.sh index c1fa756ece965..766e51625fbe6 100755 --- a/ci/script_multi.sh +++ b/ci/script_multi.sh @@ -37,6 +37,9 @@ if [ "$PIP_BUILD_TEST" ] || [ "$CONDA_BUILD_TEST" ]; then elif [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" +elif [ "$ASV" ]; then + echo "We are not running pytest as this is an asv-build" + elif [ "$COVERAGE" ]; then echo pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas diff --git a/ci/script_single.sh b/ci/script_single.sh index 005c648ee025f..153847ab2e8c9 100755 --- a/ci/script_single.sh +++ b/ci/script_single.sh @@ -22,6 +22,9 @@ if [ "$PIP_BUILD_TEST" ] || [ "$CONDA_BUILD_TEST" ]; then elif [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" +elif [ "$ASV" ]; then + echo "We are not running pytest as this is an asv-build" + elif [ "$COVERAGE" ]; then echo pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas From b2940a04d65e5e683283da472904ed5314d24efd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Feb 2018 19:20:28 -0600 Subject: [PATCH 070/214] BUG: Fixed merge on dtype equal categories (#19553) --- doc/source/whatsnew/v0.23.0.txt | 40 +++++++++++++++-------- pandas/core/indexes/category.py | 11 +++++-- pandas/core/reshape/merge.py | 10 +++++- pandas/tests/indexing/test_categorical.py | 17 ++++++++++ pandas/tests/reshape/merge/test_merge.py | 19 +++++++++++ 5 files changed, 80 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 7782e5f1ffa56..bed0c077c1348 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -598,6 +598,32 @@ Documentation Changes Bug Fixes ~~~~~~~~~ +Categorical +^^^^^^^^^^^ + +.. 
warning:: + + A class of bugs were introduced in pandas 0.21 with ``CategoricalDtype`` that + affects the correctness of operations like ``merge``, ``concat``, and + indexing when comparing multiple unordered ``Categorical`` arrays that have + the same categories, but in a different order. We highly recommend upgrading + or manually aligning your categories before doing these operations. + +- Bug in ``Categorical.equals`` returning the wrong result when comparing two + unordered ``Categorical`` arrays with the same categories, but in a different + order (:issue:`16603`) +- Bug in :func:`pandas.api.types.union_categoricals` returning the wrong result + when for unordered categoricals with the categories in a different order. + This affected :func:`pandas.concat` with Categorical data (:issue:`19096`). +- Bug in :func:`pandas.merge` returning the wrong result when joining on an + unordered ``Categorical`` that had the same categories but in a different + order (:issue:`19551`) +- Bug in :meth:`CategoricalIndex.get_indexer` returning the wrong result when + ``target`` was an unordered ``Categorical`` that had the same categories as + ``self`` but in a different order (:issue:`19551`) +- Bug in :meth:`Index.astype` with a categorical dtype where the resultant index is not converted to a :class:`CategoricalIndex` for all types of index (:issue:`18630`) +- Bug in :meth:`Series.astype` and ``Categorical.astype()`` where an existing categorical data does not get updated (:issue:`10696`, :issue:`18593`) +- Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19032`) Datetimelike ^^^^^^^^^^^^ @@ -745,20 +771,6 @@ Reshaping - Improved error message for :func:`DataFrame.merge` when there is no common merge key (:issue:`19427`) - - -Categorical -^^^^^^^^^^^ - -- -- Bug in :func:`pandas.api.types.union_categoricals` returning the wrong result - when all the categoricals had the same categories, but in a different order. - This affected :func:`pandas.concat` with Categorical data (:issue:`19096`). 
-- Bug in ``Categorical.equals`` between two unordered categories with the same categories, but in a different order (:issue:`16603`) -- Bug in :meth:`Index.astype` with a categorical dtype where the resultant index is not converted to a :class:`CategoricalIndex` for all types of index (:issue:`18630`) -- Bug in :meth:`Series.astype` and ``Categorical.astype()`` where an existing categorical data does not get updated (:issue:`10696`, :issue:`18593`) -- Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19032`) -- - Other ^^^^^ diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 2c7be2b21f959..b36bc1df23247 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -553,6 +553,8 @@ def _reindex_non_unique(self, target): @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): + from pandas.core.arrays.categorical import _recode_for_categories + method = missing.clean_reindex_fill_method(method) target = ibase._ensure_index(target) @@ -568,8 +570,13 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): if (isinstance(target, CategoricalIndex) and self.values.is_dtype_equal(target)): - # we have the same codes - codes = target.codes + if self.values.equals(target.values): + # we have the same codes + codes = target.codes + else: + codes = _recode_for_categories(target.codes, + target.categories, + self.values.categories) else: if isinstance(target, CategoricalIndex): code_indexer = self.categories.get_indexer(target.categories) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 9dbb327e3d956..4b99b0407cfcc 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -12,6 +12,7 @@ from pandas import (Categorical, DataFrame, Index, MultiIndex, Timedelta) +from pandas.core.arrays.categorical import _recode_for_categories from pandas.core.frame import _merge_doc from pandas.core.dtypes.common import ( is_datetime64tz_dtype, @@ -1540,8 +1541,15 @@ def _factorize_keys(lk, rk, sort=True): is_categorical_dtype(rk) and lk.is_dtype_equal(rk)): klass = libhashtable.Int64Factorizer + + if lk.categories.equals(rk.categories): + rk = rk.codes + else: + # Same categories in different orders -> recode + rk = _recode_for_categories(rk.codes, rk.categories, lk.categories) + lk = _ensure_int64(lk.codes) - rk = _ensure_int64(rk.codes) + rk = _ensure_int64(rk) elif is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk): klass = libhashtable.Int64Factorizer lk = _ensure_int64(com._values_from_object(lk)) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index f2182687d047f..634ad0d8160ed 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -432,6 +432,23 @@ def test_get_indexer_array(self): expected = np.array([0, 1], dtype='intp') tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_same_categories_same_order(self): + ci = CategoricalIndex(['a', 'b'], categories=['a', 'b']) + + result = ci.get_indexer(CategoricalIndex(['b', 'b'], + categories=['a', 'b'])) + expected = np.array([1, 1], dtype='intp') + tm.assert_numpy_array_equal(result, expected) + + def test_get_indexer_same_categories_different_order(self): + # https://github.com/pandas-dev/pandas/issues/19551 + ci = CategoricalIndex(['a', 
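# The merge fix above recodes the right-hand codes into the left-hand
# category order before factorizing. A rough standalone sketch of that
# recoding (illustrative only, not the actual _recode_for_categories
# implementation):
import numpy as np
import pandas as pd

left_categories = pd.Index(['A', 'B', 'C'])
right = pd.Categorical(['C', 'B', 'A'], categories=['C', 'B', 'A'])

# position of each right-hand category inside the left-hand ordering
indexer = left_categories.get_indexer(right.categories)    # [2, 1, 0]
recoded = np.where(right.codes != -1, indexer[right.codes], -1)
# right.codes is [0, 1, 2]; recoded is [2, 1, 0], i.e. the same values
# ['C', 'B', 'A'] expressed in the left-hand coding.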
'b'], categories=['a', 'b']) + + result = ci.get_indexer(CategoricalIndex(['b', 'b'], + categories=['b', 'a'])) + expected = np.array([1, 1], dtype='intp') + tm.assert_numpy_array_equal(result, expected) + def test_getitem_with_listlike(self): # GH 16115 cats = Categorical([Timestamp('12-31-1999'), diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 32f83ab972be5..101d34ebdb89f 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1643,6 +1643,25 @@ def test_merge_categorical(self): result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c') tm.assert_frame_equal(result, expected) + def tests_merge_categorical_unordered_equal(self): + # GH-19551 + df1 = DataFrame({ + 'Foo': Categorical(['A', 'B', 'C'], categories=['A', 'B', 'C']), + 'Left': ['A0', 'B0', 'C0'], + }) + + df2 = DataFrame({ + 'Foo': Categorical(['C', 'B', 'A'], categories=['C', 'B', 'A']), + 'Right': ['C1', 'B1', 'A1'], + }) + result = pd.merge(df1, df2, on=['Foo']) + expected = DataFrame({ + 'Foo': pd.Categorical(['A', 'B', 'C']), + 'Left': ['A0', 'B0', 'C0'], + 'Right': ['A1', 'B1', 'C1'], + }) + assert_frame_equal(result, expected) + def test_other_columns(self, left, right): # non-merge columns should preserve if possible right = right.assign(Z=right.Z.astype('category')) From 36f905285c0089228985cffc9f9f6c7d28789128 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Feb 2018 19:32:02 -0600 Subject: [PATCH 071/214] PERF: Correct signature for group_nth / group_object (#19579) --- asv_bench/benchmarks/groupby.py | 16 ++++++++++++++++ doc/source/whatsnew/v0.23.0.txt | 1 + pandas/_libs/groupby.pyx | 10 ++++++++-- 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 8aa67d8bc6a6a..61db39528a5fb 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -160,6 +160,22 @@ def time_series_nth(self, df): df[1].groupby(df[0]).nth(0) +class NthObject(object): + + goal_time = 0.2 + + def setup_cache(self): + df = DataFrame(np.random.randint(1, 100, (10000,)), columns=['g']) + df['obj'] = ['a'] * 5000 + ['b'] * 5000 + return df + + def time_nth(self, df): + df.groupby('g').nth(5) + + def time_nth_last(self, df): + df.groupby('g').last() + + class DateAttributes(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index bed0c077c1348..6c4fce35529ad 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -746,6 +746,7 @@ Groupby/Resample/Rolling - Bug in :func:`DataFrame.transform` where particular aggregation functions were being incorrectly cast to match the dtype(s) of the grouped data (:issue:`19200`) - Bug in :func:`DataFrame.groupby` passing the `on=` kwarg, and subsequently using ``.apply()`` (:issue:`17813`) - Bug in :func:`DataFrame.resample().aggregate` not raising a ``KeyError`` when aggregating a non-existent column (:issue:`16766`, :issue:`19566`) +- Fixed a performance regression for ``GroupBy.nth`` and ``GroupBy.last`` with some object columns (:issue:`19283`) Sparse ^^^^^^ diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 9cc15fb6692d9..55de700c9af52 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -36,7 +36,8 @@ def group_nth_object(ndarray[object, ndim=2] out, ndarray[int64_t] counts, ndarray[object, ndim=2] values, ndarray[int64_t] labels, - int64_t rank): + int64_t 
rank, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ @@ -47,6 +48,8 @@ def group_nth_object(ndarray[object, ndim=2] out, ndarray[int64_t, ndim=2] nobs ndarray[object, ndim=2] resx + assert min_count == -1, "'min_count' only used in add and prod" + nobs = np.zeros(( out).shape, dtype=np.int64) resx = np.empty(( out).shape, dtype=object) @@ -80,7 +83,8 @@ def group_nth_object(ndarray[object, ndim=2] out, def group_last_object(ndarray[object, ndim=2] out, ndarray[int64_t] counts, ndarray[object, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ @@ -91,6 +95,8 @@ def group_last_object(ndarray[object, ndim=2] out, ndarray[object, ndim=2] resx ndarray[int64_t, ndim=2] nobs + assert min_count == -1, "'min_count' only used in add and prod" + nobs = np.zeros(( out).shape, dtype=np.int64) resx = np.empty(( out).shape, dtype=object) From 8f4ad305dc3a98332cc9765200b0535669dfafa2 Mon Sep 17 00:00:00 2001 From: xpvpc <32843902+xpvpc@users.noreply.github.com> Date: Thu, 8 Feb 2018 12:17:12 +0100 Subject: [PATCH 072/214] DOC: doc/source/indexing.rst says pd.df.ix is deprecated, show warning in generated doc. (#19596) --- pandas/core/indexing.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 9463512ac11de..352ce921d1d44 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1300,6 +1300,9 @@ class _IXIndexer(_NDFrameIndexer): """A primarily label-location based indexer, with integer position fallback. + Warning: Starting in 0.20.0, the .ix indexer is deprecated, in + favor of the more strict .iloc and .loc indexers. + ``.ix[]`` supports mixed integer and label based access. It is primarily label based, but will fall back to integer positional access unless the corresponding axis is of integer type. From 432642eac39c8063d2cc06cd0175bd48463282ac Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 8 Feb 2018 03:20:39 -0800 Subject: [PATCH 073/214] Simplify argument passing in period_helper (#19550) --- pandas/_libs/src/period_helper.c | 410 ++++++++++--------------------- pandas/_libs/src/period_helper.h | 11 +- pandas/_libs/tslibs/period.pyx | 32 ++- 3 files changed, 157 insertions(+), 296 deletions(-) diff --git a/pandas/_libs/src/period_helper.c b/pandas/_libs/src/period_helper.c index 570f20b790750..f0e24fec685d0 100644 --- a/pandas/_libs/src/period_helper.c +++ b/pandas/_libs/src/period_helper.c @@ -82,11 +82,14 @@ static int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, // helpers for frequency conversion routines // -static int daytime_conversion_factors[][2] = { - {FR_DAY, 1}, {FR_HR, 24}, {FR_MIN, 60}, {FR_SEC, 60}, - {FR_MS, 1000}, {FR_US, 1000}, {FR_NS, 1000}, {0, 0}}; - -static npy_int64 **daytime_conversion_factor_matrix = NULL; +static npy_int64 daytime_conversion_factor_matrix[7][7] = { + {1, 24, 1440, 86400, 86400000, 86400000000, 86400000000000}, + {0, 1, 60, 3600, 3600000, 3600000000, 3600000000000}, + {0, 0, 1, 60, 60000, 60000000, 60000000000}, + {0, 0, 0, 1, 1000, 1000000, 1000000000}, + {0, 0, 0, 0, 1, 1000, 1000000}, + {0, 0, 0, 0, 0, 1, 1000}, + {0, 0, 0, 0, 0, 0, 1}}; PANDAS_INLINE int max_value(int a, int b) { return a > b ? 
a : b; } @@ -96,100 +99,24 @@ PANDAS_INLINE int get_freq_group(int freq) { return (freq / 1000) * 1000; } PANDAS_INLINE int get_freq_group_index(int freq) { return freq / 1000; } -static int calc_conversion_factors_matrix_size(void) { - int matrix_size = 0; - int index; - for (index = 0;; index++) { - int period_value = - get_freq_group_index(daytime_conversion_factors[index][0]); - if (period_value == 0) { - break; - } - matrix_size = max_value(matrix_size, period_value); - } - return matrix_size + 1; -} - -static void alloc_conversion_factors_matrix(int matrix_size) { - int row_index; - int column_index; - daytime_conversion_factor_matrix = - malloc(matrix_size * sizeof(**daytime_conversion_factor_matrix)); - for (row_index = 0; row_index < matrix_size; row_index++) { - daytime_conversion_factor_matrix[row_index] = - malloc(matrix_size * sizeof(**daytime_conversion_factor_matrix)); - for (column_index = 0; column_index < matrix_size; column_index++) { - daytime_conversion_factor_matrix[row_index][column_index] = 0; - } - } -} - -static npy_int64 calculate_conversion_factor(int start_value, int end_value) { - npy_int64 conversion_factor = 0; - int index; - for (index = 0;; index++) { - int freq_group = daytime_conversion_factors[index][0]; - - if (freq_group == 0) { - conversion_factor = 0; - break; - } - - if (freq_group == start_value) { - conversion_factor = 1; - } else { - conversion_factor *= daytime_conversion_factors[index][1]; - } - - if (freq_group == end_value) { - break; - } - } - return conversion_factor; -} - -static void populate_conversion_factors_matrix(void) { - int row_index_index; - int row_value, row_index; - int column_index_index; - int column_value, column_index; - - for (row_index_index = 0;; row_index_index++) { - row_value = daytime_conversion_factors[row_index_index][0]; - if (row_value == 0) { - break; - } - row_index = get_freq_group_index(row_value); - for (column_index_index = row_index_index;; column_index_index++) { - column_value = daytime_conversion_factors[column_index_index][0]; - if (column_value == 0) { - break; - } - column_index = get_freq_group_index(column_value); - - daytime_conversion_factor_matrix[row_index][column_index] = - calculate_conversion_factor(row_value, column_value); - } - } -} - -void initialize_daytime_conversion_factor_matrix() { - if (daytime_conversion_factor_matrix == NULL) { - int matrix_size = calc_conversion_factors_matrix_size(); - alloc_conversion_factors_matrix(matrix_size); - populate_conversion_factors_matrix(); - } -} PANDAS_INLINE npy_int64 get_daytime_conversion_factor(int from_index, int to_index) { - return daytime_conversion_factor_matrix[min_value(from_index, to_index)] - [max_value(from_index, to_index)]; + int row = min_value(from_index, to_index); + int col = max_value(from_index, to_index); + // row or col < 6 means frequency strictly lower than Daily, which + // do not use daytime_conversion_factors + if (row < 6) { + return 0; + } else if (col < 6) { + return 0; + } + return daytime_conversion_factor_matrix[row - 6][col - 6]; } PANDAS_INLINE npy_int64 upsample_daytime(npy_int64 ordinal, - asfreq_info *af_info, int atEnd) { - if (atEnd) { + asfreq_info *af_info) { + if (af_info->is_end) { return (ordinal + 1) * af_info->intraday_conversion_factor - 1; } else { return ordinal * af_info->intraday_conversion_factor; @@ -197,18 +124,18 @@ PANDAS_INLINE npy_int64 upsample_daytime(npy_int64 ordinal, } PANDAS_INLINE npy_int64 downsample_daytime(npy_int64 ordinal, - asfreq_info *af_info, int atEnd) { + asfreq_info 
*af_info) { return ordinal / (af_info->intraday_conversion_factor); } -PANDAS_INLINE npy_int64 transform_via_day(npy_int64 ordinal, char relation, +PANDAS_INLINE npy_int64 transform_via_day(npy_int64 ordinal, asfreq_info *af_info, freq_conv_func first_func, freq_conv_func second_func) { npy_int64 result; - result = (*first_func)(ordinal, relation, af_info); - result = (*second_func)(result, relation, af_info); + result = (*first_func)(ordinal, af_info); + result = (*second_func)(result, af_info); return result; } @@ -241,10 +168,9 @@ static npy_int64 absdate_from_ymd(int y, int m, int d) { //************ FROM DAILY *************** -static npy_int64 asfreq_DTtoA(npy_int64 ordinal, char relation, - asfreq_info *af_info) { +static npy_int64 asfreq_DTtoA(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; - ordinal = downsample_daytime(ordinal, af_info, 0); + ordinal = downsample_daytime(ordinal, af_info); dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET); if (dinfo.month > af_info->to_a_year_end) { return (npy_int64)(dinfo.year + 1 - BASE_YEAR); @@ -272,142 +198,110 @@ static npy_int64 DtoQ_yq(npy_int64 ordinal, asfreq_info *af_info, int *year, return 0; } -static npy_int64 asfreq_DTtoQ(npy_int64 ordinal, char relation, - asfreq_info *af_info) { +static npy_int64 asfreq_DTtoQ(npy_int64 ordinal, asfreq_info *af_info) { int year, quarter; - ordinal = downsample_daytime(ordinal, af_info, 0); + ordinal = downsample_daytime(ordinal, af_info); DtoQ_yq(ordinal, af_info, &year, &quarter); return (npy_int64)((year - BASE_YEAR) * 4 + quarter - 1); } -static npy_int64 asfreq_DTtoM(npy_int64 ordinal, char relation, - asfreq_info *af_info) { +static npy_int64 asfreq_DTtoM(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; - ordinal = downsample_daytime(ordinal, af_info, 0); + ordinal = downsample_daytime(ordinal, af_info); dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET); return (npy_int64)((dinfo.year - BASE_YEAR) * 12 + dinfo.month - 1); } -static npy_int64 asfreq_DTtoW(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - ordinal = downsample_daytime(ordinal, af_info, 0); +static npy_int64 asfreq_DTtoW(npy_int64 ordinal, asfreq_info *af_info) { + ordinal = downsample_daytime(ordinal, af_info); return (ordinal + ORD_OFFSET - (1 + af_info->to_week_end)) / 7 + 1 - WEEK_OFFSET; } -static npy_int64 asfreq_DTtoB(npy_int64 ordinal, char relation, - asfreq_info *af_info) { +static npy_int64 asfreq_DTtoB(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; int roll_back; - ordinal = downsample_daytime(ordinal, af_info, 0); + ordinal = downsample_daytime(ordinal, af_info); dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET); // This usage defines roll_back the opposite way from the others - roll_back = (relation == 'S') ? 
1 : 0; + roll_back = 1 - af_info->is_end; return DtoB(&dinfo, roll_back); } // all intra day calculations are now done within one function -static npy_int64 asfreq_DownsampleWithinDay(npy_int64 ordinal, char relation, +static npy_int64 asfreq_DownsampleWithinDay(npy_int64 ordinal, asfreq_info *af_info) { - return downsample_daytime(ordinal, af_info, relation == 'E'); + return downsample_daytime(ordinal, af_info); } -static npy_int64 asfreq_UpsampleWithinDay(npy_int64 ordinal, char relation, +static npy_int64 asfreq_UpsampleWithinDay(npy_int64 ordinal, asfreq_info *af_info) { - return upsample_daytime(ordinal, af_info, relation == 'E'); + return upsample_daytime(ordinal, af_info); } //************ FROM BUSINESS *************** -static npy_int64 asfreq_BtoDT(npy_int64 ordinal, char relation, - asfreq_info *af_info) { +static npy_int64 asfreq_BtoDT(npy_int64 ordinal, asfreq_info *af_info) { ordinal += BDAY_OFFSET; ordinal = (((ordinal - 1) / 5) * 7 + mod_compat(ordinal - 1, 5) + 1 - ORD_OFFSET); - return upsample_daytime(ordinal, af_info, relation != 'S'); + return upsample_daytime(ordinal, af_info); } -static npy_int64 asfreq_BtoA(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_BtoDT, - asfreq_DTtoA); +static npy_int64 asfreq_BtoA(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_BtoDT, asfreq_DTtoA); } -static npy_int64 asfreq_BtoQ(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_BtoDT, - asfreq_DTtoQ); +static npy_int64 asfreq_BtoQ(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_BtoDT, asfreq_DTtoQ); } -static npy_int64 asfreq_BtoM(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_BtoDT, - asfreq_DTtoM); +static npy_int64 asfreq_BtoM(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_BtoDT, asfreq_DTtoM); } -static npy_int64 asfreq_BtoW(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_BtoDT, - asfreq_DTtoW); +static npy_int64 asfreq_BtoW(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_BtoDT, asfreq_DTtoW); } //************ FROM WEEKLY *************** -static npy_int64 asfreq_WtoDT(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - ordinal += WEEK_OFFSET; - if (relation != 'S') { - ordinal += 1; - } - - ordinal = ordinal * 7 - 6 + af_info->from_week_end - ORD_OFFSET; - - if (relation != 'S') { - ordinal -= 1; - } - - return upsample_daytime(ordinal, af_info, relation != 'S'); +static npy_int64 asfreq_WtoDT(npy_int64 ordinal, asfreq_info *af_info) { + ordinal = (ordinal + WEEK_OFFSET) * 7 + + af_info->from_week_end - ORD_OFFSET + + (7 - 1) * (af_info->is_end - 1); + return upsample_daytime(ordinal, af_info); } -static npy_int64 asfreq_WtoA(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_WtoDT, - asfreq_DTtoA); +static npy_int64 asfreq_WtoA(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_WtoDT, asfreq_DTtoA); } -static npy_int64 asfreq_WtoQ(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_WtoDT, - asfreq_DTtoQ); +static 
npy_int64 asfreq_WtoQ(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_WtoDT, asfreq_DTtoQ); } -static npy_int64 asfreq_WtoM(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_WtoDT, - asfreq_DTtoM); +static npy_int64 asfreq_WtoM(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_WtoDT, asfreq_DTtoM); } -static npy_int64 asfreq_WtoW(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_WtoDT, - asfreq_DTtoW); +static npy_int64 asfreq_WtoW(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_WtoDT, asfreq_DTtoW); } -static npy_int64 asfreq_WtoB(npy_int64 ordinal, char relation, - asfreq_info *af_info) { +static npy_int64 asfreq_WtoB(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; - int roll_back; + int roll_back = af_info->is_end; dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_WtoDT(ordinal, relation, af_info) + ORD_OFFSET); + &dinfo, asfreq_WtoDT(ordinal, af_info) + ORD_OFFSET); - roll_back = (relation == 'S') ? 0 : 1; return DtoB(&dinfo, roll_back); } @@ -417,52 +311,38 @@ static void MtoD_ym(npy_int64 ordinal, int *y, int *m) { *m = mod_compat(ordinal, 12) + 1; } -static npy_int64 asfreq_MtoDT(npy_int64 ordinal, char relation, - asfreq_info *af_info) { +static npy_int64 asfreq_MtoDT(npy_int64 ordinal, asfreq_info *af_info) { npy_int64 absdate; int y, m; - if (relation == 'E') { - ordinal += 1; - } + ordinal += af_info->is_end; MtoD_ym(ordinal, &y, &m); absdate = absdate_from_ymd(y, m, 1); ordinal = absdate - ORD_OFFSET; - if (relation == 'E') { - ordinal -= 1; - } - - return upsample_daytime(ordinal, af_info, relation != 'S'); + ordinal -= af_info->is_end; + return upsample_daytime(ordinal, af_info); } -static npy_int64 asfreq_MtoA(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_MtoDT, - asfreq_DTtoA); +static npy_int64 asfreq_MtoA(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_MtoDT, asfreq_DTtoA); } -static npy_int64 asfreq_MtoQ(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_MtoDT, - asfreq_DTtoQ); +static npy_int64 asfreq_MtoQ(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_MtoDT, asfreq_DTtoQ); } -static npy_int64 asfreq_MtoW(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_MtoDT, - asfreq_DTtoW); +static npy_int64 asfreq_MtoW(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_MtoDT, asfreq_DTtoW); } -static npy_int64 asfreq_MtoB(npy_int64 ordinal, char relation, - asfreq_info *af_info) { +static npy_int64 asfreq_MtoB(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; - int roll_back; + int roll_back = af_info->is_end; dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_MtoDT(ordinal, relation, af_info) + ORD_OFFSET); + &dinfo, asfreq_MtoDT(ordinal, af_info) + ORD_OFFSET); - roll_back = (relation == 'S') ? 
0 : 1; return DtoB(&dinfo, roll_back); } @@ -482,130 +362,94 @@ static void QtoD_ym(npy_int64 ordinal, int *y, int *m, asfreq_info *af_info) { } } -static npy_int64 asfreq_QtoDT(npy_int64 ordinal, char relation, - asfreq_info *af_info) { +static npy_int64 asfreq_QtoDT(npy_int64 ordinal, asfreq_info *af_info) { npy_int64 absdate; int y, m; - if (relation == 'E') { - ordinal += 1; - } - + ordinal += af_info->is_end; QtoD_ym(ordinal, &y, &m, af_info); absdate = absdate_from_ymd(y, m, 1); - if (relation == 'E') { - absdate -= 1; - } - - return upsample_daytime(absdate - ORD_OFFSET, af_info, relation != 'S'); + absdate -= af_info->is_end; + return upsample_daytime(absdate - ORD_OFFSET, af_info); } -static npy_int64 asfreq_QtoQ(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_QtoDT, - asfreq_DTtoQ); +static npy_int64 asfreq_QtoQ(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_QtoDT, asfreq_DTtoQ); } -static npy_int64 asfreq_QtoA(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_QtoDT, - asfreq_DTtoA); +static npy_int64 asfreq_QtoA(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_QtoDT, asfreq_DTtoA); } -static npy_int64 asfreq_QtoM(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_QtoDT, - asfreq_DTtoM); +static npy_int64 asfreq_QtoM(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_QtoDT, asfreq_DTtoM); } -static npy_int64 asfreq_QtoW(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_QtoDT, - asfreq_DTtoW); +static npy_int64 asfreq_QtoW(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_QtoDT, asfreq_DTtoW); } -static npy_int64 asfreq_QtoB(npy_int64 ordinal, char relation, - asfreq_info *af_info) { +static npy_int64 asfreq_QtoB(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; - int roll_back; + int roll_back = af_info->is_end; + dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_QtoDT(ordinal, relation, af_info) + ORD_OFFSET); + &dinfo, asfreq_QtoDT(ordinal, af_info) + ORD_OFFSET); - roll_back = (relation == 'S') ? 
0 : 1; return DtoB(&dinfo, roll_back); } //************ FROM ANNUAL *************** -static npy_int64 asfreq_AtoDT(npy_int64 year, char relation, - asfreq_info *af_info) { +static npy_int64 asfreq_AtoDT(npy_int64 ordinal, asfreq_info *af_info) { npy_int64 absdate; - int month = (af_info->from_a_year_end) % 12; // start from 1970 - year += BASE_YEAR; - - month += 1; + npy_int64 year = ordinal + BASE_YEAR; + int month = (af_info->from_a_year_end % 12) + 1; if (af_info->from_a_year_end != 12) { year -= 1; } - if (relation == 'E') { - year += 1; - } - + year += af_info->is_end; absdate = absdate_from_ymd(year, month, 1); - if (relation == 'E') { - absdate -= 1; - } - - return upsample_daytime(absdate - ORD_OFFSET, af_info, relation != 'S'); + absdate -= af_info->is_end; + return upsample_daytime(absdate - ORD_OFFSET, af_info); } -static npy_int64 asfreq_AtoA(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_AtoDT, - asfreq_DTtoA); +static npy_int64 asfreq_AtoA(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_AtoDT, asfreq_DTtoA); } -static npy_int64 asfreq_AtoQ(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_AtoDT, - asfreq_DTtoQ); +static npy_int64 asfreq_AtoQ(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_AtoDT, asfreq_DTtoQ); } -static npy_int64 asfreq_AtoM(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_AtoDT, - asfreq_DTtoM); +static npy_int64 asfreq_AtoM(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_AtoDT, asfreq_DTtoM); } -static npy_int64 asfreq_AtoW(npy_int64 ordinal, char relation, - asfreq_info *af_info) { - return transform_via_day(ordinal, relation, af_info, asfreq_AtoDT, - asfreq_DTtoW); +static npy_int64 asfreq_AtoW(npy_int64 ordinal, asfreq_info *af_info) { + return transform_via_day(ordinal, af_info, asfreq_AtoDT, asfreq_DTtoW); } -static npy_int64 asfreq_AtoB(npy_int64 ordinal, char relation, - asfreq_info *af_info) { +static npy_int64 asfreq_AtoB(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; - int roll_back; + int roll_back = af_info->is_end; dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_AtoDT(ordinal, relation, af_info) + ORD_OFFSET); + &dinfo, asfreq_AtoDT(ordinal, af_info) + ORD_OFFSET); - roll_back = (relation == 'S') ? 
0 : 1; return DtoB(&dinfo, roll_back); } -static npy_int64 nofunc(npy_int64 ordinal, char relation, - asfreq_info *af_info) { +static npy_int64 nofunc(npy_int64 ordinal, asfreq_info *af_info) { return INT_ERR_CODE; } -static npy_int64 no_op(npy_int64 ordinal, char relation, asfreq_info *af_info) { +static npy_int64 no_op(npy_int64 ordinal, asfreq_info *af_info) { return ordinal; } @@ -622,10 +466,17 @@ static int calc_a_year_end(int freq, int group) { static int calc_week_end(int freq, int group) { return freq - group; } -void get_asfreq_info(int fromFreq, int toFreq, asfreq_info *af_info) { +void get_asfreq_info(int fromFreq, int toFreq, char relation, + asfreq_info *af_info) { int fromGroup = get_freq_group(fromFreq); int toGroup = get_freq_group(toFreq); + if (relation == 'E') { + af_info->is_end = 1; + } else { + af_info->is_end = 0; + } + af_info->intraday_conversion_factor = get_daytime_conversion_factor( get_freq_group_index(max_value(fromGroup, FR_DAY)), get_freq_group_index(max_value(toGroup, FR_DAY))); @@ -895,9 +746,8 @@ npy_int64 asfreq(npy_int64 period_ordinal, int freq1, int freq2, func = get_asfreq_func(freq1, freq2); - get_asfreq_info(freq1, freq2, &finfo); - - val = (*func)(period_ordinal, relation, &finfo); + get_asfreq_info(freq1, freq2, relation, &finfo); + val = (*func)(period_ordinal, &finfo); return val; } @@ -1017,9 +867,9 @@ npy_int64 get_python_ordinal(npy_int64 period_ordinal, int freq) { if (freq == FR_DAY) return period_ordinal + ORD_OFFSET; toDaily = get_asfreq_func(freq, FR_DAY); - get_asfreq_info(freq, FR_DAY, &af_info); + get_asfreq_info(freq, FR_DAY, 'E', &af_info); - return toDaily(period_ordinal, 'E', &af_info) + ORD_OFFSET; + return toDaily(period_ordinal, &af_info) + ORD_OFFSET; } @@ -1027,19 +877,19 @@ int get_yq(npy_int64 ordinal, int freq, int *quarter, int *year) { asfreq_info af_info; int qtr_freq; npy_int64 daily_ord; - npy_int64 (*toDaily)(npy_int64, char, asfreq_info *) = NULL; + freq_conv_func toDaily = NULL; toDaily = get_asfreq_func(freq, FR_DAY); - get_asfreq_info(freq, FR_DAY, &af_info); + get_asfreq_info(freq, FR_DAY, 'E', &af_info); - daily_ord = toDaily(ordinal, 'E', &af_info); + daily_ord = toDaily(ordinal, &af_info); if (get_freq_group(freq) == FR_QTR) { qtr_freq = freq; } else { qtr_freq = FR_QTR; } - get_asfreq_info(FR_DAY, qtr_freq, &af_info); + get_asfreq_info(FR_DAY, qtr_freq, 'E', &af_info); DtoQ_yq(daily_ord, &af_info, year, quarter); return 0; @@ -1056,7 +906,7 @@ int _quarter_year(npy_int64 ordinal, int freq, int *year, int *quarter) { else qtr_freq = FR_QTR; - get_asfreq_info(FR_DAY, qtr_freq, &af_info); + get_asfreq_info(FR_DAY, qtr_freq, 'E', &af_info); DtoQ_yq(ordinal, &af_info, year, quarter); diff --git a/pandas/_libs/src/period_helper.h b/pandas/_libs/src/period_helper.h index 2c74659346b15..f14aec268a1fb 100644 --- a/pandas/_libs/src/period_helper.h +++ b/pandas/_libs/src/period_helper.h @@ -101,6 +101,10 @@ frequency conversion routines. 
#define INT_ERR_CODE INT32_MIN typedef struct asfreq_info { + int is_end; + // char relation == 'S' (for START) --> is_end = 0 + // char relation == 'E' (for END) --> is_end = 1 + int from_week_end; // day the week ends on in the "from" frequency int to_week_end; // day the week ends on in the "to" frequency @@ -124,7 +128,7 @@ typedef struct date_info { int year; } date_info; -typedef npy_int64 (*freq_conv_func)(npy_int64, char, asfreq_info *); +typedef npy_int64 (*freq_conv_func)(npy_int64, asfreq_info *af_info); /* * new pandas API helper functions here @@ -140,11 +144,10 @@ npy_int64 get_python_ordinal(npy_int64 period_ordinal, int freq); int get_date_info(npy_int64 ordinal, int freq, struct date_info *dinfo); freq_conv_func get_asfreq_func(int fromFreq, int toFreq); -void get_asfreq_info(int fromFreq, int toFreq, asfreq_info *af_info); +void get_asfreq_info(int fromFreq, int toFreq, char relation, + asfreq_info *af_info); int get_yq(npy_int64 ordinal, int freq, int *quarter, int *year); int _quarter_year(npy_int64 ordinal, int freq, int *year, int *quarter); -void initialize_daytime_conversion_factor_matrix(void); - #endif // PANDAS__LIBS_SRC_PERIOD_HELPER_H_ diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index ba17b3d345ac8..3c396a9ff4f3c 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -61,6 +61,8 @@ cdef extern from "period_helper.h": int year ctypedef struct asfreq_info: + int is_end + int from_week_end int to_week_end @@ -70,13 +72,13 @@ cdef extern from "period_helper.h": int from_q_year_end int to_q_year_end - ctypedef int64_t (*freq_conv_func)(int64_t, char, asfreq_info*) + ctypedef int64_t (*freq_conv_func)(int64_t, asfreq_info*) - void initialize_daytime_conversion_factor_matrix() int64_t asfreq(int64_t dtordinal, int freq1, int freq2, char relation) except INT32_MIN freq_conv_func get_asfreq_func(int fromFreq, int toFreq) - void get_asfreq_info(int fromFreq, int toFreq, asfreq_info *af_info) + void get_asfreq_info(int fromFreq, int toFreq, char relation, + asfreq_info *af_info) int64_t get_period_ordinal(int year, int month, int day, int hour, int minute, int second, @@ -90,14 +92,20 @@ cdef extern from "period_helper.h": int _quarter_year(int64_t ordinal, int freq, int *year, int *quarter) -initialize_daytime_conversion_factor_matrix() - - @cython.cdivision cdef char* c_strftime(date_info *dinfo, char *fmt): """ - function to generate a nice string representation of the period + Generate a nice string representation of the period object, originally from DateObject_strftime + + Parameters + ---------- + dinfo : date_info* + fmt : char* + + Returns + ------- + result : char* """ cdef: tm c_date @@ -224,26 +232,26 @@ def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end): n = len(arr) result = np.empty(n, dtype=np.int64) - func = get_asfreq_func(freq1, freq2) - get_asfreq_info(freq1, freq2, &finfo) - if end: relation = END else: relation = START + func = get_asfreq_func(freq1, freq2) + get_asfreq_info(freq1, freq2, relation, &finfo) + mask = arr == iNaT if mask.any(): # NaT process for i in range(n): val = arr[i] if val != iNaT: - val = func(val, relation, &finfo) + val = func(val, &finfo) if val == INT32_MIN: raise ValueError("Unable to convert to desired frequency.") result[i] = val else: for i in range(n): - val = func(arr[i], relation, &finfo) + val = func(arr[i], &finfo) if val == INT32_MIN: raise ValueError("Unable to convert to desired frequency.") result[i] = val From 
34b86fd75d3620723d0bd6ff2a16bef6adb8b079 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 8 Feb 2018 03:28:54 -0800 Subject: [PATCH 074/214] separate numeric tests so we can isolate division by zero (#19336) --- pandas/tests/series/test_operators.py | 196 ++++++++++++++------------ 1 file changed, 102 insertions(+), 94 deletions(-) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 05ccb25960b1f..554b3e15d8f10 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -596,77 +596,81 @@ def test_divide_decimal(self): assert_series_equal(expected, s) - def test_div(self): + @pytest.mark.parametrize( + 'dtype2', + [ + np.int64, np.int32, np.int16, np.int8, + np.float64, np.float32, np.float16, + np.uint64, np.uint32, + np.uint16, np.uint8 + ]) + @pytest.mark.parametrize('dtype1', [np.int64, np.float64, np.uint64]) + def test_ser_div_ser(self, dtype1, dtype2): + # no longer do integer div for any ops, but deal with the 0's + first = Series([3, 4, 5, 8], name='first').astype(dtype1) + second = Series([0, 0, 0, 3], name='second').astype(dtype2) + with np.errstate(all='ignore'): - # no longer do integer div for any ops, but deal with the 0's - p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) - result = p['first'] / p['second'] - expected = Series( - p['first'].values.astype(float) / p['second'].values, - dtype='float64') - expected.iloc[0:3] = np.inf - assert_series_equal(result, expected) + expected = Series(first.values.astype(np.float64) / second.values, + dtype='float64', name=None) + expected.iloc[0:3] = np.inf - result = p['first'] / 0 - expected = Series(np.inf, index=p.index, name='first') - assert_series_equal(result, expected) + result = first / second + assert_series_equal(result, expected) + assert not result.equals(second / first) - p = p.astype('float64') - result = p['first'] / p['second'] - expected = Series(p['first'].values / p['second'].values) - assert_series_equal(result, expected) + def test_div_equiv_binop(self): + # Test Series.div as well as Series.__div__ + # float/integer issue + # GH#7785 + first = pd.Series([1, 0], name='first') + second = pd.Series([-0.01, -0.02], name='second') + expected = Series([-0.01, -np.inf]) - p = DataFrame({'first': [3, 4, 5, 8], 'second': [1, 1, 1, 1]}) - result = p['first'] / p['second'] - assert_series_equal(result, p['first'].astype('float64'), - check_names=False) - assert result.name is None - assert not result.equals(p['second'] / p['first']) - - # inf signing - s = Series([np.nan, 1., -1.]) - result = s / 0 - expected = Series([np.nan, np.inf, -np.inf]) - assert_series_equal(result, expected) + result = second.div(first) + assert_series_equal(result, expected, check_names=False) - # float/integer issue - # GH 7785 - p = DataFrame({'first': (1, 0), 'second': (-0.01, -0.02)}) - expected = Series([-0.01, -np.inf]) + result = second / first + assert_series_equal(result, expected) - result = p['second'].div(p['first']) - assert_series_equal(result, expected, check_names=False) + def test_rdiv_zero_compat(self): + # GH#8674 + zero_array = np.array([0] * 5) + data = np.random.randn(5) + expected = pd.Series([0.] 
* 5) - result = p['second'] / p['first'] - assert_series_equal(result, expected) + result = zero_array / pd.Series(data) + assert_series_equal(result, expected) - # GH 9144 - s = Series([-1, 0, 1]) + result = pd.Series(zero_array) / data + assert_series_equal(result, expected) - result = 0 / s - expected = Series([0.0, nan, 0.0]) - assert_series_equal(result, expected) + result = pd.Series(zero_array) / pd.Series(data) + assert_series_equal(result, expected) - result = s / 0 - expected = Series([-inf, nan, inf]) - assert_series_equal(result, expected) + def test_div_zero_inf_signs(self): + # GH#9144, inf signing + ser = Series([-1, 0, 1], name='first') + expected = Series([-np.inf, np.nan, np.inf], name='first') - result = s // 0 - expected = Series([-inf, nan, inf]) - assert_series_equal(result, expected) + result = ser / 0 + assert_series_equal(result, expected) - # GH 8674 - zero_array = np.array([0] * 5) - data = np.random.randn(5) - expected = pd.Series([0.] * 5) - result = zero_array / pd.Series(data) - assert_series_equal(result, expected) + def test_rdiv_zero(self): + # GH#9144 + ser = Series([-1, 0, 1], name='first') + expected = Series([0.0, np.nan, 0.0], name='first') - result = pd.Series(zero_array) / data - assert_series_equal(result, expected) + result = 0 / ser + assert_series_equal(result, expected) - result = pd.Series(zero_array) / pd.Series(data) - assert_series_equal(result, expected) + def test_floordiv_div(self): + # GH#9144 + ser = Series([-1, 0, 1], name='first') + + result = ser // 0 + expected = Series([-inf, nan, inf], name='first') + assert_series_equal(result, expected) class TestTimedeltaSeriesArithmeticWithIntegers(object): @@ -1576,33 +1580,42 @@ def test_dt64_series_add_intlike(self, tz): class TestSeriesOperators(TestData): - def test_op_method(self): - def check(series, other, check_reverse=False): - simple_ops = ['add', 'sub', 'mul', 'floordiv', 'truediv', 'pow'] - if not compat.PY3: - simple_ops.append('div') - - for opname in simple_ops: - op = getattr(Series, opname) - - if op == 'div': - alt = operator.truediv - else: - alt = getattr(operator, opname) - - result = op(series, other) - expected = alt(series, other) - assert_almost_equal(result, expected) - if check_reverse: - rop = getattr(Series, "r" + opname) - result = rop(series, other) - expected = alt(other, series) - assert_almost_equal(result, expected) + @pytest.mark.parametrize( + 'ts', + [ + (lambda x: x, lambda x: x * 2, False), + (lambda x: x, lambda x: x[::2], False), + (lambda x: x, lambda x: 5, True), + (lambda x: tm.makeFloatSeries(), + lambda x: tm.makeFloatSeries(), + True) + ]) + @pytest.mark.parametrize('opname', ['add', 'sub', 'mul', 'floordiv', + 'truediv', 'div', 'pow']) + def test_op_method(self, opname, ts): + # check that Series.{opname} behaves like Series.__{opname}__, + series = ts[0](self.ts) + other = ts[1](self.ts) + check_reverse = ts[2] + + if opname == 'div' and compat.PY3: + pytest.skip('div test only for Py3') + + op = getattr(Series, opname) + + if op == 'div': + alt = operator.truediv + else: + alt = getattr(operator, opname) - check(self.ts, self.ts * 2) - check(self.ts, self.ts[::2]) - check(self.ts, 5, check_reverse=True) - check(tm.makeFloatSeries(), tm.makeFloatSeries(), check_reverse=True) + result = op(series, other) + expected = alt(series, other) + assert_almost_equal(result, expected) + if check_reverse: + rop = getattr(Series, "r" + opname) + result = rop(series, other) + expected = alt(other, series) + assert_almost_equal(result, expected) def 
test_neg(self): assert_series_equal(-self.series, -1 * self.series) @@ -1971,20 +1984,15 @@ def test_operators_corner(self): index=self.ts.index[:-5], name='ts') tm.assert_series_equal(added[:-5], expected) - def test_operators_reverse_object(self): + @pytest.mark.parametrize('op', [operator.add, operator.sub, operator.mul, + operator.truediv, operator.floordiv]) + def test_operators_reverse_object(self, op): # GH 56 arr = Series(np.random.randn(10), index=np.arange(10), dtype=object) - def _check_op(arr, op): - result = op(1., arr) - expected = op(1., arr.astype(float)) - assert_series_equal(result.astype(float), expected) - - _check_op(arr, operator.add) - _check_op(arr, operator.sub) - _check_op(arr, operator.mul) - _check_op(arr, operator.truediv) - _check_op(arr, operator.floordiv) + result = op(1., arr) + expected = op(1., arr.astype(float)) + assert_series_equal(result.astype(float), expected) def test_arith_ops_df_compat(self): # GH 1134 From b83512773ab0dd3908cf2ef5ccaddcd8e0337c64 Mon Sep 17 00:00:00 2001 From: Dillon Niederhut Date: Thu, 8 Feb 2018 05:32:04 -0600 Subject: [PATCH 075/214] Bug: adds support for unary plus (#19297) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/generic.py | 19 +++++++- pandas/tests/computation/test_eval.py | 60 ++++++++------------------ pandas/tests/frame/test_arithmetic.py | 4 +- pandas/tests/frame/test_operators.py | 43 ++++++++++++++++-- pandas/tests/series/test_arithmetic.py | 4 +- 6 files changed, 80 insertions(+), 51 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 6c4fce35529ad..5e94b9c15fa57 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -253,6 +253,7 @@ Current Behavior: Other Enhancements ^^^^^^^^^^^^^^^^^^ +- Unary ``+`` now permitted for ``Series`` and ``DataFrame`` as numeric operator (:issue:`16073`) - Better support for :func:`Dataframe.style.to_excel` output with the ``xlsxwriter`` engine. (:issue:`16149`) - :func:`pandas.tseries.frequencies.to_offset` now accepts leading '+' signs e.g. '+1h'. 
(:issue:`18171`) - :func:`MultiIndex.unique` now supports the ``level=`` argument, to get unique values from a specific index level (:issue:`17896`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index cb4bbb7b27c42..35f866c9e7d58 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -25,6 +25,7 @@ is_list_like, is_dict_like, is_re_compilable, + is_period_arraylike, pandas_dtype) from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask from pandas.core.dtypes.inference import is_hashable @@ -1027,10 +1028,24 @@ def _indexed_same(self, other): def __neg__(self): values = com._values_from_object(self) - if values.dtype == np.bool_: + if is_bool_dtype(values): arr = operator.inv(values) - else: + elif (is_numeric_dtype(values) or is_timedelta64_dtype(values)): arr = operator.neg(values) + else: + raise TypeError("Unary negative expects numeric dtype, not {}" + .format(values.dtype)) + return self.__array_wrap__(arr) + + def __pos__(self): + values = com._values_from_object(self) + if (is_bool_dtype(values) or is_period_arraylike(values)): + arr = values + elif (is_numeric_dtype(values) or is_timedelta64_dtype(values)): + arr = operator.pos(values) + else: + raise TypeError("Unary plus expects numeric dtype, not {}" + .format(values.dtype)) return self.__array_wrap__(arr) def __invert__(self): diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 9c3572f9ffe72..07ba0b681418e 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -542,66 +542,42 @@ def test_frame_pos(self): # float lhs = DataFrame(randn(5, 2)) - if self.engine == 'python': - with pytest.raises(TypeError): - result = pd.eval(expr, engine=self.engine, parser=self.parser) - else: - expect = lhs - result = pd.eval(expr, engine=self.engine, parser=self.parser) - assert_frame_equal(expect, result) + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) # int lhs = DataFrame(randint(5, size=(5, 2))) - if self.engine == 'python': - with pytest.raises(TypeError): - result = pd.eval(expr, engine=self.engine, parser=self.parser) - else: - expect = lhs - result = pd.eval(expr, engine=self.engine, parser=self.parser) - assert_frame_equal(expect, result) + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) # bool doesn't work with numexpr but works elsewhere lhs = DataFrame(rand(5, 2) > 0.5) - if self.engine == 'python': - with pytest.raises(TypeError): - result = pd.eval(expr, engine=self.engine, parser=self.parser) - else: - expect = lhs - result = pd.eval(expr, engine=self.engine, parser=self.parser) - assert_frame_equal(expect, result) + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) def test_series_pos(self): expr = self.ex('+') # float lhs = Series(randn(5)) - if self.engine == 'python': - with pytest.raises(TypeError): - result = pd.eval(expr, engine=self.engine, parser=self.parser) - else: - expect = lhs - result = pd.eval(expr, engine=self.engine, parser=self.parser) - assert_series_equal(expect, result) + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) # int lhs = Series(randint(5, size=5)) - if self.engine == 'python': - with pytest.raises(TypeError): - result = pd.eval(expr, engine=self.engine, parser=self.parser) - else: - expect = lhs - result = 
pd.eval(expr, engine=self.engine, parser=self.parser) - assert_series_equal(expect, result) + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) # bool doesn't work with numexpr but works elsewhere lhs = Series(rand(5) > 0.5) - if self.engine == 'python': - with pytest.raises(TypeError): - result = pd.eval(expr, engine=self.engine, parser=self.parser) - else: - expect = lhs - result = pd.eval(expr, engine=self.engine, parser=self.parser) - assert_series_equal(expect, result) + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) def test_scalar_unary(self): with pytest.raises(TypeError): diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 1bb8e8edffc6e..a3a799aed1c55 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -245,7 +245,7 @@ def test_ops_frame_period(self): exp = pd.DataFrame({'A': np.array([2, 1], dtype=object), 'B': np.array([14, 13], dtype=object)}) tm.assert_frame_equal(p - df, exp) - tm.assert_frame_equal(df - p, -exp) + tm.assert_frame_equal(df - p, -1 * exp) df2 = pd.DataFrame({'A': [pd.Period('2015-05', freq='M'), pd.Period('2015-06', freq='M')], @@ -257,4 +257,4 @@ def test_ops_frame_period(self): exp = pd.DataFrame({'A': np.array([4, 4], dtype=object), 'B': np.array([16, 16], dtype=object)}) tm.assert_frame_equal(df2 - df, exp) - tm.assert_frame_equal(df - df2, -exp) + tm.assert_frame_equal(df - df2, -1 * exp) diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 26974b6398694..5df50f3d7835b 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -271,13 +271,50 @@ def test_logical_with_nas(self): expected = Series([True, True]) assert_series_equal(result, expected) - def test_neg(self): - # what to do? 
- assert_frame_equal(-self.frame, -1 * self.frame) + @pytest.mark.parametrize('df,expected', [ + (pd.DataFrame({'a': [-1, 1]}), pd.DataFrame({'a': [1, -1]})), + (pd.DataFrame({'a': [False, True]}), + pd.DataFrame({'a': [True, False]})), + (pd.DataFrame({'a': pd.Series(pd.to_timedelta([-1, 1]))}), + pd.DataFrame({'a': pd.Series(pd.to_timedelta([1, -1]))})) + ]) + def test_neg_numeric(self, df, expected): + assert_frame_equal(-df, expected) + assert_series_equal(-df['a'], expected['a']) + + @pytest.mark.parametrize('df', [ + pd.DataFrame({'a': ['a', 'b']}), + pd.DataFrame({'a': pd.to_datetime(['2017-01-22', '1970-01-01'])}), + ]) + def test_neg_raises(self, df): + with pytest.raises(TypeError): + (- df) + with pytest.raises(TypeError): + (- df['a']) def test_invert(self): assert_frame_equal(-(self.frame < 0), ~(self.frame < 0)) + @pytest.mark.parametrize('df', [ + pd.DataFrame({'a': [-1, 1]}), + pd.DataFrame({'a': [False, True]}), + pd.DataFrame({'a': pd.Series(pd.to_timedelta([-1, 1]))}), + ]) + def test_pos_numeric(self, df): + # GH 16073 + assert_frame_equal(+df, df) + assert_series_equal(+df['a'], df['a']) + + @pytest.mark.parametrize('df', [ + pd.DataFrame({'a': ['a', 'b']}), + pd.DataFrame({'a': pd.to_datetime(['2017-01-22', '1970-01-01'])}), + ]) + def test_pos_raises(self, df): + with pytest.raises(TypeError): + (+ df) + with pytest.raises(TypeError): + (+ df['a']) + def test_arith_flex_frame(self): ops = ['add', 'sub', 'mul', 'div', 'truediv', 'pow', 'floordiv', 'mod'] if not compat.PY3: diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 1d9fa9dc15531..94da97ef45301 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -315,7 +315,7 @@ def test_ops_series_period(self): # dtype will be object because of original dtype expected = pd.Series([9, 8], name='xxx', dtype=object) tm.assert_series_equal(per - ser, expected) - tm.assert_series_equal(ser - per, -expected) + tm.assert_series_equal(ser - per, -1 * expected) s2 = pd.Series([pd.Period('2015-01-05', freq='D'), pd.Period('2015-01-04', freq='D')], name='xxx') @@ -323,7 +323,7 @@ def test_ops_series_period(self): expected = pd.Series([4, 2], name='xxx', dtype=object) tm.assert_series_equal(s2 - ser, expected) - tm.assert_series_equal(ser - s2, -expected) + tm.assert_series_equal(ser - s2, -1 * expected) class TestTimestampSeriesArithmetic(object): From f30345f13974ef325118d499cf8c1033443fe6c9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Feb 2018 06:25:32 -0600 Subject: [PATCH 076/214] Ignore warnings when reading pickle files (#19580) --- pandas/io/pickle.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index fa953f7d876cc..756096dd0c9ce 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -1,4 +1,5 @@ """ pickle compat """ +import warnings import numpy as np from numpy.lib.format import read_array, write_array @@ -96,7 +97,9 @@ def try_read(path, encoding=None): # cpickle # GH 6899 try: - return read_wrapper(lambda f: pkl.load(f)) + with warnings.catch_warnings(record=True): + # We want to silencce any warnings about, e.g. moved modules. 
+ return read_wrapper(lambda f: pkl.load(f)) except Exception: # reg/patched pickle try: From 7dcc86443fd9b1aa94b6f7e4e33b6fbd0210b8db Mon Sep 17 00:00:00 2001 From: samghelms Date: Fri, 9 Feb 2018 07:29:02 -0500 Subject: [PATCH 077/214] ENH: added an optional css id to `` tags created by `frame.to_html()` (#19594) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/frame.py | 10 ++++++++-- pandas/io/formats/format.py | 25 +++++++++++++++++++------ pandas/tests/io/formats/test_format.py | 4 ++-- pandas/tests/io/formats/test_to_html.py | 7 +++++++ 5 files changed, 37 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 5e94b9c15fa57..083242cd69b74 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -717,6 +717,7 @@ I/O ^^^ - :func:`read_html` now rewinds seekable IO objects after parse failure, before attempting to parse with a new parser. If a parser errors and the object is non-seekable, an informative error is raised suggesting the use of a different parser (:issue:`17975`) +- :meth:`DataFrame.to_html` now has an option to add an id to the leading `
<table>` tag (:issue:`8496`)
 - Bug in :func:`read_msgpack` with a non existent file is passed in Python 2 (:issue:`15296`)
 - Bug in :func:`read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`)
 - Bug in :func:`read_csv` where missing values were not being handled properly when ``keep_default_na=False`` with dictionary ``na_values`` (:issue:`19227`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 28923f0fbf240..6d8dcb8a1ca89 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1727,7 +1727,7 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True,
                 sparsify=None, index_names=True, justify=None, bold_rows=True,
                 classes=None, escape=True, max_rows=None, max_cols=None,
                 show_dimensions=False, notebook=False, decimal='.',
-                border=None):
+                border=None, table_id=None):
         """
         Render a DataFrame as an HTML table.
 
@@ -1755,6 +1755,12 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True,
             `<table>` tag. Default ``pd.options.html.border``.
 
             .. versionadded:: 0.19.0
+
+        table_id : str, optional
+            A css id is included in the opening `<table>
` tag if specified.
+
+            .. versionadded:: 0.23.0
+
         """
 
         if (justify is not None and
@@ -1772,7 +1778,7 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True,
                                            max_rows=max_rows,
                                            max_cols=max_cols,
                                            show_dimensions=show_dimensions,
-                                           decimal=decimal)
+                                           decimal=decimal, table_id=table_id)
 
         # TODO: a generic formatter wld b in DataFrameFormatter
         formatter.to_html(classes=classes, notebook=notebook, border=border)
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index 269c81b380b5e..621641747f376 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -77,7 +77,11 @@
     index_names : bool, optional
         Prints the names of the indexes, default True
     line_width : int, optional
-        Width to wrap a line in characters, default no wrap"""
+        Width to wrap a line in characters, default no wrap
+    table_id : str, optional
+        id for the <table>
element create by to_html + + .. versionadded:: 0.23.0""" _VALID_JUSTIFY_PARAMETERS = ("left", "right", "center", "justify", "justify-all", "start", "end", "inherit", @@ -387,7 +391,8 @@ def __init__(self, frame, buf=None, columns=None, col_space=None, header=True, index=True, na_rep='NaN', formatters=None, justify=None, float_format=None, sparsify=None, index_names=True, line_width=None, max_rows=None, - max_cols=None, show_dimensions=False, decimal='.', **kwds): + max_cols=None, show_dimensions=False, decimal='.', + table_id=None, **kwds): self.frame = frame if buf is not None: self.buf = _expand_user(_stringify_path(buf)) @@ -413,6 +418,7 @@ def __init__(self, frame, buf=None, columns=None, col_space=None, self.max_rows_displayed = min(max_rows or len(self.frame), len(self.frame)) self.show_dimensions = show_dimensions + self.table_id = table_id if justify is None: self.justify = get_option("display.colheader_justify") @@ -740,7 +746,8 @@ def to_html(self, classes=None, notebook=False, border=None): max_rows=self.max_rows, max_cols=self.max_cols, notebook=notebook, - border=border) + border=border, + table_id=self.table_id) if hasattr(self.buf, 'write'): html_renderer.write_result(self.buf) elif isinstance(self.buf, compat.string_types): @@ -1082,7 +1089,7 @@ class HTMLFormatter(TableFormatter): indent_delta = 2 def __init__(self, formatter, classes=None, max_rows=None, max_cols=None, - notebook=False, border=None): + notebook=False, border=None, table_id=None): self.fmt = formatter self.classes = classes @@ -1101,6 +1108,7 @@ def __init__(self, formatter, classes=None, max_rows=None, max_cols=None, if border is None: border = get_option('display.html.border') self.border = border + self.table_id = table_id def write(self, s, indent=0): rs = pprint_thing(s) @@ -1197,6 +1205,7 @@ def write_style(self): def write_result(self, buf): indent = 0 + id_section = "" frame = self.frame _classes = ['dataframe'] # Default class. @@ -1220,8 +1229,12 @@ def write_result(self, buf): self.write(''.format(style=div_style)) self.write_style() - self.write('
<table border="{border}" class="{cls}">'
-                   .format(border=self.border, cls=' '.join(_classes)), indent)
+
+        if self.table_id is not None:
+            id_section = ' id="{table_id}"'.format(table_id=self.table_id)
+        self.write('<table border="{border}" class="{cls}"{id_section}>
' + .format(border=self.border, cls=' '.join(_classes), + id_section=id_section), indent) indent += self.indent_delta indent = self._write_header(indent) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index e0ce27de5c31f..dddba5b425c3b 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -1492,7 +1492,7 @@ def test_repr_html_float(self): 'B': np.arange(41, 41 + h)}).set_index('idx') reg_repr = df._repr_html_() assert '..' not in reg_repr - assert str(40 + h) in reg_repr + assert ''.format(val=str(40 + h)) in reg_repr h = max_rows + 1 df = DataFrame({'idx': np.linspace(-10, 10, h), @@ -1500,7 +1500,7 @@ def test_repr_html_float(self): 'B': np.arange(41, 41 + h)}).set_index('idx') long_repr = df._repr_html_() assert '..' in long_repr - assert '31' not in long_repr + assert ''.format(val='31') not in long_repr assert u('{h} rows ').format(h=h) in long_repr assert u('2 columns') in long_repr diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 9e063c2d176e1..f69cac62513d4 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -1864,3 +1864,10 @@ def test_to_html_with_index_names_false(self): name='myindexname')) result = df.to_html(index_names=False) assert 'myindexname' not in result + + def test_to_html_with_id(self): + # gh-8496 + df = pd.DataFrame({"A": [1, 2]}, index=pd.Index(['a', 'b'], + name='myindexname')) + result = df.to_html(index_names=False, table_id="TEST_ID") + assert ' id="TEST_ID"' in result From a214915e241ea15f3d072d54930d0e0c8f42ee10 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Feb 2018 10:11:17 -0600 Subject: [PATCH 078/214] CI: Fixed NumPy pinning in conda-build (#19575) * CI: Fixed NumPy pinning in conda-build * Unpin NumPy Quite install * Pin numpy * Unpin everywhere else * Build vs. 
1.11 * remove one more pin * Remove one more pin * bump pyarrow --- ci/install_travis.sh | 6 +++--- ci/requirements-3.5_CONDA_BUILD_TEST.build | 2 +- ci/requirements-3.5_CONDA_BUILD_TEST.run | 2 +- ci/requirements-3.5_CONDA_BUILD_TEST.sh | 2 +- ci/requirements-3.6.build | 2 +- conda.recipe/meta.yaml | 4 ++-- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ci/install_travis.sh b/ci/install_travis.sh index 4ec5b0a9d8820..6e270519e60c3 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -110,7 +110,7 @@ if [ -e ${REQ} ]; then fi time conda install -n pandas pytest>=3.1.0 -time pip install pytest-xdist moto +time pip install -q pytest-xdist moto if [ "$LINT" ]; then conda install flake8=3.4.1 @@ -181,10 +181,10 @@ elif [ "$CONDA_BUILD_TEST" ]; then # build & install testing echo "[building conda recipe]" - time conda build ./conda.recipe --numpy 1.13 --python 3.5 -q --no-test + time conda build ./conda.recipe --python 3.5 -q --no-test || exit 1 echo "[installing]" - conda install pandas --use-local + conda install pandas --use-local || exit 1 else diff --git a/ci/requirements-3.5_CONDA_BUILD_TEST.build b/ci/requirements-3.5_CONDA_BUILD_TEST.build index 6648e3778777c..f7befe3b31865 100644 --- a/ci/requirements-3.5_CONDA_BUILD_TEST.build +++ b/ci/requirements-3.5_CONDA_BUILD_TEST.build @@ -2,5 +2,5 @@ python=3.5* python-dateutil pytz nomkl -numpy=1.13* +numpy cython diff --git a/ci/requirements-3.5_CONDA_BUILD_TEST.run b/ci/requirements-3.5_CONDA_BUILD_TEST.run index 19d9a91e86585..669cf437f2164 100644 --- a/ci/requirements-3.5_CONDA_BUILD_TEST.run +++ b/ci/requirements-3.5_CONDA_BUILD_TEST.run @@ -1,5 +1,5 @@ pytz -numpy=1.13* +numpy openpyxl xlsxwriter xlrd diff --git a/ci/requirements-3.5_CONDA_BUILD_TEST.sh b/ci/requirements-3.5_CONDA_BUILD_TEST.sh index 09d6775cfc894..093fdbcf21d78 100644 --- a/ci/requirements-3.5_CONDA_BUILD_TEST.sh +++ b/ci/requirements-3.5_CONDA_BUILD_TEST.sh @@ -8,4 +8,4 @@ echo "install 35 CONDA_BUILD_TEST" conda remove -n pandas python-dateutil --force pip install python-dateutil -conda install -n pandas -c conda-forge feather-format pyarrow=0.5.0 +conda install -n pandas -c conda-forge feather-format pyarrow=0.7.1 diff --git a/ci/requirements-3.6.build b/ci/requirements-3.6.build index 94e1152450d87..1c4b46aea3865 100644 --- a/ci/requirements-3.6.build +++ b/ci/requirements-3.6.build @@ -2,5 +2,5 @@ python=3.6* python-dateutil pytz nomkl -numpy=1.13.* +numpy cython diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index 87a79f7e5a987..86bed996c8aab 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -14,14 +14,14 @@ requirements: build: - python - cython - - {{ pin_compatible('numpy', upper_bound='1.14') }} + - numpy 1.11.* - setuptools >=3.3 - python-dateutil >=2.5.0 - pytz run: - python - - {{ pin_compatible('numpy', upper_bound='1.14') }} + - numpy >=1.11.* - python-dateutil >=2.5.0 - pytz From 6485a36483884fb817800a8380a4a4197d6df4ad Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Feb 2018 14:53:41 -0600 Subject: [PATCH 079/214] API: Default ExtensionArray.astype (#19604) * API: Default ExtensionArray.astype (cherry picked from commit 943a915562b72bed147c857de927afa0daf31c1a) * Py2 compat * Moved * Moved dtypes --- pandas/core/arrays/base.py | 21 +++++++++ pandas/tests/dtypes/test_dtypes.py | 32 +------------ pandas/tests/extension/__init__.py | 0 pandas/tests/extension/test_common.py | 67 +++++++++++++++++++++++++++ 4 files changed, 89 insertions(+), 31 deletions(-) create mode 100644 
pandas/tests/extension/__init__.py create mode 100644 pandas/tests/extension/test_common.py diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 1556b653819a6..553e1e0ac2066 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1,4 +1,6 @@ """An interface for extending pandas with custom arrays.""" +import numpy as np + from pandas.errors import AbstractMethodError _not_implemented_message = "{} does not implement {}." @@ -138,6 +140,25 @@ def nbytes(self): # ------------------------------------------------------------------------ # Additional Methods # ------------------------------------------------------------------------ + def astype(self, dtype, copy=True): + """Cast to a NumPy array with 'dtype'. + + Parameters + ---------- + dtype : str or dtype + Typecode or data-type to which the array is cast. + copy : bool, default True + Whether to copy the data, even if not necessary. If False, + a copy is made only if the old dtype does not match the + new dtype. + + Returns + ------- + array : ndarray + NumPy ndarray with 'dtype' for its dtype. + """ + return np.array(self, dtype=dtype, copy=copy) + def isna(self): # type: () -> np.ndarray """Boolean NumPy array indicating if each value is missing. diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index eca4dd4cf2106..d800a7b92b559 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -10,14 +10,12 @@ Series, Categorical, CategoricalIndex, IntervalIndex, date_range) from pandas.compat import string_types -from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.dtypes import ( DatetimeTZDtype, PeriodDtype, - IntervalDtype, CategoricalDtype, ExtensionDtype) + IntervalDtype, CategoricalDtype) from pandas.core.dtypes.common import ( is_categorical_dtype, is_categorical, is_datetime64tz_dtype, is_datetimetz, - is_extension_array_dtype, is_period_dtype, is_period, is_dtype_equal, is_datetime64_ns_dtype, is_datetime64_dtype, is_interval_dtype, @@ -744,31 +742,3 @@ def test_categorical_categories(self): tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) c1 = CategoricalDtype(CategoricalIndex(['a', 'b'])) tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) - - -class DummyArray(ExtensionArray): - pass - - -class DummyDtype(ExtensionDtype): - pass - - -class TestExtensionArrayDtype(object): - - @pytest.mark.parametrize('values', [ - pd.Categorical([]), - pd.Categorical([]).dtype, - pd.Series(pd.Categorical([])), - DummyDtype(), - DummyArray(), - ]) - def test_is_extension_array_dtype(self, values): - assert is_extension_array_dtype(values) - - @pytest.mark.parametrize('values', [ - np.array([]), - pd.Series(np.array([])), - ]) - def test_is_not_extension_array_dtype(self, values): - assert not is_extension_array_dtype(values) diff --git a/pandas/tests/extension/__init__.py b/pandas/tests/extension/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py new file mode 100644 index 0000000000000..1f4582f687415 --- /dev/null +++ b/pandas/tests/extension/test_common.py @@ -0,0 +1,67 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas.util.testing as tm +from pandas.core.arrays import ExtensionArray +from pandas.core.dtypes.common import is_extension_array_dtype +from pandas.core.dtypes.dtypes import ExtensionDtype + + +class DummyDtype(ExtensionDtype): + pass + + +class 
DummyArray(ExtensionArray): + + def __init__(self, data): + self.data = data + + def __array__(self, dtype): + return self.data + + @property + def dtype(self): + return self.data.dtype + + +class TestExtensionArrayDtype(object): + + @pytest.mark.parametrize('values', [ + pd.Categorical([]), + pd.Categorical([]).dtype, + pd.Series(pd.Categorical([])), + DummyDtype(), + DummyArray(np.array([1, 2])), + ]) + def test_is_extension_array_dtype(self, values): + assert is_extension_array_dtype(values) + + @pytest.mark.parametrize('values', [ + np.array([]), + pd.Series(np.array([])), + ]) + def test_is_not_extension_array_dtype(self, values): + assert not is_extension_array_dtype(values) + + +def test_astype(): + + arr = DummyArray(np.array([1, 2, 3])) + expected = np.array([1, 2, 3], dtype=object) + + result = arr.astype(object) + tm.assert_numpy_array_equal(result, expected) + + result = arr.astype('object') + tm.assert_numpy_array_equal(result, expected) + + +def test_astype_no_copy(): + arr = DummyArray(np.array([1, 2, 3], dtype=np.int64)) + result = arr.astype(arr.dtype, copy=False) + + assert arr.data is result + + result = arr.astype(arr.dtype) + assert arr.data is not result From c1068d9d242c22cb2199156f6fb82eb5759178ae Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sat, 10 Feb 2018 08:05:51 -0800 Subject: [PATCH 080/214] PERF: Cythonize Groupby Rank (#19481) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/_libs/algos.pxd | 8 ++ pandas/_libs/algos.pyx | 8 -- pandas/_libs/groupby.pyx | 5 +- pandas/_libs/groupby_helper.pxi.in | 165 ++++++++++++++++++++++++++ pandas/core/groupby.py | 76 +++++++++--- pandas/tests/groupby/test_groupby.py | 166 +++++++++++++++++++++++++++ 7 files changed, 406 insertions(+), 23 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 083242cd69b74..cf5a44442045b 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -581,6 +581,7 @@ Performance Improvements - Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`) - Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`) - Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`) +- Improved performance of :func:`DataFrameGroupBy.rank` (:issue:`15779`) .. 
_whatsnew_0230.docs: diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd index 6d80e6f0073eb..a535872ff7279 100644 --- a/pandas/_libs/algos.pxd +++ b/pandas/_libs/algos.pxd @@ -11,3 +11,11 @@ cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil: a[0] = b[0] b[0] = t return 0 + +cdef enum TiebreakEnumType: + TIEBREAK_AVERAGE + TIEBREAK_MIN, + TIEBREAK_MAX + TIEBREAK_FIRST + TIEBREAK_FIRST_DESCENDING + TIEBREAK_DENSE diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 5d17488963b1c..a418e54e4da9b 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -31,14 +31,6 @@ cdef double nan = NaN cdef int64_t iNaT = get_nat() -cdef: - int TIEBREAK_AVERAGE = 0 - int TIEBREAK_MIN = 1 - int TIEBREAK_MAX = 2 - int TIEBREAK_FIRST = 3 - int TIEBREAK_FIRST_DESCENDING = 4 - int TIEBREAK_DENSE = 5 - tiebreakers = { 'average': TIEBREAK_AVERAGE, 'min': TIEBREAK_MIN, diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 55de700c9af52..d75c3a71896e3 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -16,8 +16,9 @@ from numpy cimport (ndarray, from libc.stdlib cimport malloc, free from util cimport numeric, get_nat -from algos cimport swap -from algos import take_2d_axis1_float64_float64, groupsort_indexer +from algos cimport (swap, TiebreakEnumType, TIEBREAK_AVERAGE, TIEBREAK_MIN, + TIEBREAK_MAX, TIEBREAK_FIRST, TIEBREAK_DENSE) +from algos import take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers cdef int64_t iNaT = get_nat() diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index a751fadaf48cf..b24444c422efa 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -444,8 +444,173 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, else: out[i, j] = resx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, + ndarray[{{c_type}}, ndim=2] values, + ndarray[int64_t] labels, + bint is_datetimelike, object ties_method, + bint ascending, bint pct, object na_option): + """Provides the rank of values within each group + + Parameters + ---------- + out : array of float64_t values which this method will write its results to + values : array of {{c_type}} values to be ranked + labels : array containing unique label for each group, with its ordering + matching up to the corresponding record in `values` + is_datetimelike : bool + unused in this method but provided for call compatability with other + Cython transformations + ties_method : {'keep', 'top', 'bottom'} + * keep: leave NA values where they are + * top: smallest rank if ascending + * bottom: smallest rank if descending + ascending : boolean + False for ranks by high (1) to low (N) + pct : boolean + Compute percentage rank of data within each group + + Notes + ----- + This method modifies the `out` parameter rather than returning an object + """ + cdef: + TiebreakEnumType tiebreak + Py_ssize_t i, j, N, K, val_start=0, grp_start=0, dups=0, sum_ranks=0 + Py_ssize_t grp_vals_seen=1, grp_na_count=0 + ndarray[int64_t] _as + ndarray[float64_t, ndim=2] grp_sizes + ndarray[{{c_type}}] masked_vals + ndarray[uint8_t] mask + bint keep_na + {{c_type}} nan_fill_val + + tiebreak = tiebreakers[ties_method] + keep_na = na_option == 'keep' + N, K = ( values).shape + grp_sizes = np.ones_like(out) + + # Copy values into new array in order to fill missing data + # with mask, without obfuscating location of missing data + # in values array + masked_vals = 
np.array(values[:, 0], copy=True) + {{if name=='int64'}} + mask = (masked_vals == {{nan_val}}).astype(np.uint8) + {{else}} + mask = np.isnan(masked_vals).astype(np.uint8) + {{endif}} + + if ascending ^ (na_option == 'top'): + {{if name == 'int64'}} + nan_fill_val = np.iinfo(np.int64).max + {{else}} + nan_fill_val = np.inf + {{endif}} + order = (masked_vals, mask, labels) + else: + {{if name == 'int64'}} + nan_fill_val = np.iinfo(np.int64).min + {{else}} + nan_fill_val = -np.inf + {{endif}} + order = (masked_vals, ~mask, labels) + np.putmask(masked_vals, mask, nan_fill_val) + + # lexsort using labels, then mask, then actual values + # each label corresponds to a different group value, + # the mask helps you differentiate missing values before + # performing sort on the actual values + _as = np.lexsort(order) + + if not ascending: + _as = _as[::-1] + + with nogil: + # Loop over the length of the value array + # each incremental i value can be looked up in the _as array + # that we sorted previously, which gives us the location of + # that sorted value for retrieval back from the original + # values / masked_vals arrays + for i in range(N): + # dups and sum_ranks will be incremented each loop where + # the value / group remains the same, and should be reset + # when either of those change + # Used to calculate tiebreakers + dups += 1 + sum_ranks += i - grp_start + 1 + + # if keep_na, check for missing values and assign back + # to the result where appropriate + if keep_na and masked_vals[_as[i]] == nan_fill_val: + grp_na_count += 1 + out[_as[i], 0] = nan + else: + # this implementation is inefficient because it will + # continue overwriting previously encountered dups + # i.e. if 5 duplicated values are encountered it will + # write to the result as follows (assumes avg tiebreaker): + # 1 + # .5 .5 + # .33 .33 .33 + # .25 .25 .25 .25 + # .2 .2 .2 .2 .2 + # + # could potentially be optimized to only write to the + # result once the last duplicate value is encountered + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = i - grp_start + 1 + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + if ascending: + out[_as[j], 0] = j + 1 - grp_start + else: + out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start + elif tiebreak == TIEBREAK_DENSE: + for j in range(i - dups + 1, i + 1): + out[_as[j], 0] = grp_vals_seen + + # look forward to the next value (using the sorting in _as) + # if the value does not equal the current value then we need to + # reset the dups and sum_ranks, knowing that a new value is coming + # up. the conditional also needs to handle nan equality and the + # end of iteration + if (i == N - 1 or ( + (masked_vals[_as[i]] != masked_vals[_as[i+1]]) and not + (mask[_as[i]] and mask[_as[i+1]]))): + dups = sum_ranks = 0 + val_start = i + grp_vals_seen += 1 + + # Similar to the previous conditional, check now if we are moving + # to a new group. If so, keep track of the index where the new + # group occurs, so the tiebreaker calculations can decrement that + # from their position. fill in the size of each group encountered + # (used by pct calculations later). 
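For orientation, the kernel above backs the user-facing ``DataFrameGroupBy.rank`` added by this commit. A minimal usage sketch with hypothetical data (assumes a pandas build that includes this patch, i.e. 0.23.0 or later):

    import pandas as pd

    df = pd.DataFrame({'key': ['a', 'a', 'a', 'b', 'b'],
                       'val': [2, 2, 8, 6, 4]})
    # Ranks are computed within each group; ties use method='average' by default.
    df.groupby('key')['val'].rank(method='average')
    # expected: 1.5, 1.5, 3.0 for the 'a' group and 2.0, 1.0 for the 'b' group
    # Passing pct=True divides each rank by its group size, which is what the
    # grp_sizes bookkeeping in this kernel supports.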
also be sure to reset any of + # the items helping to calculate dups + if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: + for j in range(grp_start, i + 1): + grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count + dups = sum_ranks = 0 + grp_na_count = 0 + val_start = i + 1 + grp_start = i + 1 + grp_vals_seen = 1 + + if pct: + for i in range(N): + out[i, 0] = out[i, 0] / grp_sizes[i, 0] {{endfor}} + #---------------------------------------------------------------------- # group_min, group_max #---------------------------------------------------------------------- diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 01241db7c0c42..0363bcd02aa16 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -994,7 +994,7 @@ def _transform_should_cast(self, func_nm): return (self.size().fillna(0) > 0).any() and (func_nm not in _cython_cast_blacklist) - def _cython_transform(self, how, numeric_only=True): + def _cython_transform(self, how, numeric_only=True, **kwargs): output = collections.OrderedDict() for name, obj in self._iterate_slices(): is_numeric = is_numeric_dtype(obj.dtype) @@ -1002,12 +1002,16 @@ def _cython_transform(self, how, numeric_only=True): continue try: - result, names = self.grouper.transform(obj.values, how) + result, names = self.grouper.transform(obj.values, how, + **kwargs) except NotImplementedError: continue except AssertionError as e: raise GroupByError(str(e)) - output[name] = self._try_cast(result, obj) + if self._transform_should_cast(how): + output[name] = self._try_cast(result, obj) + else: + output[name] = result if len(output) == 0: raise DataError('No numeric types to aggregate') @@ -1768,6 +1772,37 @@ def cumcount(self, ascending=True): cumcounts = self._cumcount_array(ascending=ascending) return Series(cumcounts, index) + @Substitution(name='groupby') + @Appender(_doc_template) + def rank(self, method='average', ascending=True, na_option='keep', + pct=False, axis=0): + """Provides the rank of values within each group + + Parameters + ---------- + method : {'average', 'min', 'max', 'first', 'dense'}, efault 'average' + * average: average rank of group + * min: lowest rank in group + * max: highest rank in group + * first: ranks assigned in order they appear in the array + * dense: like 'min', but rank always increases by 1 between groups + method : {'keep', 'top', 'bottom'}, default 'keep' + * keep: leave NA values where they are + * top: smallest rank if ascending + * bottom: smallest rank if descending + ascending : boolean, default True + False for ranks by high (1) to low (N) + pct : boolean, default False + Compute percentage rank of data within each group + + Returns + ----- + DataFrame with ranking of values within each group + """ + return self._cython_transform('rank', numeric_only=False, + ties_method=method, ascending=ascending, + na_option=na_option, pct=pct, axis=axis) + @Substitution(name='groupby') @Appender(_doc_template) def cumprod(self, axis=0, *args, **kwargs): @@ -2183,6 +2218,16 @@ def get_group_levels(self): 'cumsum': 'group_cumsum', 'cummin': 'group_cummin', 'cummax': 'group_cummax', + 'rank': { + 'name': 'group_rank', + 'f': lambda func, a, b, c, d, **kwargs: func( + a, b, c, d, + kwargs.get('ties_method', 'average'), + kwargs.get('ascending', True), + kwargs.get('pct', False), + kwargs.get('na_option', 'keep') + ) + } } } @@ -2242,7 +2287,8 @@ def wrapper(*args, **kwargs): (how, dtype_str)) return func - def _cython_operation(self, kind, values, how, axis, min_count=-1): + def _cython_operation(self, kind, 
values, how, axis, min_count=-1, + **kwargs): assert kind in ['transform', 'aggregate'] # can we do this operation with our cython functions @@ -2314,10 +2360,13 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1): else: raise - if is_numeric: - out_dtype = '%s%d' % (values.dtype.kind, values.dtype.itemsize) + if how == 'rank': + out_dtype = 'float' else: - out_dtype = 'object' + if is_numeric: + out_dtype = '%s%d' % (values.dtype.kind, values.dtype.itemsize) + else: + out_dtype = 'object' labels, _, _ = self.group_info @@ -2334,7 +2383,8 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1): # TODO: min_count result = self._transform( - result, values, labels, func, is_numeric, is_datetimelike) + result, values, labels, func, is_numeric, is_datetimelike, + **kwargs) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT @@ -2373,8 +2423,8 @@ def aggregate(self, values, how, axis=0, min_count=-1): return self._cython_operation('aggregate', values, how, axis, min_count=min_count) - def transform(self, values, how, axis=0): - return self._cython_operation('transform', values, how, axis) + def transform(self, values, how, axis=0, **kwargs): + return self._cython_operation('transform', values, how, axis, **kwargs) def _aggregate(self, result, counts, values, comp_ids, agg_func, is_numeric, is_datetimelike, min_count=-1): @@ -2394,7 +2444,7 @@ def _aggregate(self, result, counts, values, comp_ids, agg_func, return result def _transform(self, result, values, comp_ids, transform_func, - is_numeric, is_datetimelike): + is_numeric, is_datetimelike, **kwargs): comp_ids, _, ngroups = self.group_info if values.ndim > 3: @@ -2406,9 +2456,9 @@ def _transform(self, result, values, comp_ids, transform_func, chunk = chunk.squeeze() transform_func(result[:, :, i], values, - comp_ids, is_datetimelike) + comp_ids, is_datetimelike, **kwargs) else: - transform_func(result, values, comp_ids, is_datetimelike) + transform_func(result, values, comp_ids, is_datetimelike, **kwargs) return result diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 5172efe25d697..2db772ac54369 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1895,6 +1895,172 @@ def test_rank_apply(self): expected = expected.reindex(result.index) assert_series_equal(result, expected) + @pytest.mark.parametrize("grps", [ + ['qux'], ['qux', 'quux']]) + @pytest.mark.parametrize("vals", [ + [2, 2, 8, 2, 6], + [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), + pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), + pd.Timestamp('2018-01-06')]]) + @pytest.mark.parametrize("ties_method,ascending,pct,exp", [ + ('average', True, False, [2., 2., 5., 2., 4.]), + ('average', True, True, [0.4, 0.4, 1.0, 0.4, 0.8]), + ('average', False, False, [4., 4., 1., 4., 2.]), + ('average', False, True, [.8, .8, .2, .8, .4]), + ('min', True, False, [1., 1., 5., 1., 4.]), + ('min', True, True, [0.2, 0.2, 1.0, 0.2, 0.8]), + ('min', False, False, [3., 3., 1., 3., 2.]), + ('min', False, True, [.6, .6, .2, .6, .4]), + ('max', True, False, [3., 3., 5., 3., 4.]), + ('max', True, True, [0.6, 0.6, 1.0, 0.6, 0.8]), + ('max', False, False, [5., 5., 1., 5., 2.]), + ('max', False, True, [1., 1., .2, 1., .4]), + ('first', True, False, [1., 2., 5., 3., 4.]), + ('first', True, True, [0.2, 0.4, 1.0, 0.6, 0.8]), + ('first', False, False, [3., 4., 1., 5., 2.]), + ('first', False, True, [.6, .8, .2, 1., .4]), + ('dense', True, False, [1., 1., 
3., 1., 2.]), + ('dense', True, True, [0.2, 0.2, 0.6, 0.2, 0.4]), + ('dense', False, False, [3., 3., 1., 3., 2.]), + ('dense', False, True, [.6, .6, .2, .6, .4]), + ]) + def test_rank_args(self, grps, vals, ties_method, ascending, pct, exp): + key = np.repeat(grps, len(vals)) + vals = vals * len(grps) + df = DataFrame({'key': key, 'val': vals}) + result = df.groupby('key').rank(method=ties_method, + ascending=ascending, pct=pct) + + exp_df = DataFrame(exp * len(grps), columns=['val']) + assert_frame_equal(result, exp_df) + + @pytest.mark.parametrize("grps", [ + ['qux'], ['qux', 'quux']]) + @pytest.mark.parametrize("vals", [ + [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], # floats + [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan, + pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), + pd.Timestamp('2018-01-06'), np.nan, np.nan] + ]) + @pytest.mark.parametrize("ties_method,ascending,na_option,pct,exp", [ + ('average', True, 'keep', False, + [2., 2., np.nan, 5., 2., 4., np.nan, np.nan]), + ('average', True, 'keep', True, + [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan]), + ('average', False, 'keep', False, + [4., 4., np.nan, 1., 4., 2., np.nan, np.nan]), + ('average', False, 'keep', True, + [.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan]), + ('min', True, 'keep', False, + [1., 1., np.nan, 5., 1., 4., np.nan, np.nan]), + ('min', True, 'keep', True, + [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]), + ('min', False, 'keep', False, + [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]), + ('min', False, 'keep', True, + [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), + ('max', True, 'keep', False, + [3., 3., np.nan, 5., 3., 4., np.nan, np.nan]), + ('max', True, 'keep', True, + [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), + ('max', False, 'keep', False, + [5., 5., np.nan, 1., 5., 2., np.nan, np.nan]), + ('max', False, 'keep', True, + [1., 1., np.nan, 0.2, 1., 0.4, np.nan, np.nan]), + ('first', True, 'keep', False, + [1., 2., np.nan, 5., 3., 4., np.nan, np.nan]), + ('first', True, 'keep', True, + [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), + ('first', False, 'keep', False, + [3., 4., np.nan, 1., 5., 2., np.nan, np.nan]), + ('first', False, 'keep', True, + [.6, 0.8, np.nan, 0.2, 1., 0.4, np.nan, np.nan]), + ('dense', True, 'keep', False, + [1., 1., np.nan, 3., 1., 2., np.nan, np.nan]), + ('dense', True, 'keep', True, + [0.2, 0.2, np.nan, 0.6, 0.2, 0.4, np.nan, np.nan]), + ('dense', False, 'keep', False, + [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]), + ('dense', False, 'keep', True, + [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), + ('average', True, 'no_na', False, [2., 2., 7., 5., 2., 4., 7., 7.]), + ('average', True, 'no_na', True, + [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875]), + ('average', False, 'no_na', False, [4., 4., 7., 1., 4., 2., 7., 7.]), + ('average', False, 'no_na', True, + [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875]), + ('min', True, 'no_na', False, [1., 1., 6., 5., 1., 4., 6., 6.]), + ('min', True, 'no_na', True, + [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75]), + ('min', False, 'no_na', False, [3., 3., 6., 1., 3., 2., 6., 6.]), + ('min', False, 'no_na', True, + [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75]), + ('max', True, 'no_na', False, [3., 3., 8., 5., 3., 4., 8., 8.]), + ('max', True, 'no_na', True, + [0.375, 0.375, 1., 0.625, 0.375, 0.5, 1., 1.]), + ('max', False, 'no_na', False, [5., 5., 8., 1., 5., 2., 8., 8.]), + ('max', False, 'no_na', True, + [0.625, 0.625, 1., 0.125, 0.625, 0.25, 1., 1.]), + ('first', 
True, 'no_na', False, [1., 2., 6., 5., 3., 4., 7., 8.]), + ('first', True, 'no_na', True, + [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.]), + ('first', False, 'no_na', False, [3., 4., 6., 1., 5., 2., 7., 8.]), + ('first', False, 'no_na', True, + [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.]), + ('dense', True, 'no_na', False, [1., 1., 4., 3., 1., 2., 4., 4.]), + ('dense', True, 'no_na', True, + [0.125, 0.125, 0.5, 0.375, 0.125, 0.25, 0.5, 0.5]), + ('dense', False, 'no_na', False, [3., 3., 4., 1., 3., 2., 4., 4.]), + ('dense', False, 'no_na', True, + [0.375, 0.375, 0.5, 0.125, 0.375, 0.25, 0.5, 0.5]) + ]) + def test_rank_args_missing(self, grps, vals, ties_method, ascending, + na_option, pct, exp): + key = np.repeat(grps, len(vals)) + vals = vals * len(grps) + df = DataFrame({'key': key, 'val': vals}) + result = df.groupby('key').rank(method=ties_method, + ascending=ascending, + na_option=na_option, pct=pct) + + exp_df = DataFrame(exp * len(grps), columns=['val']) + assert_frame_equal(result, exp_df) + + @pytest.mark.parametrize("pct,exp", [ + (False, [3., 3., 3., 3., 3.]), + (True, [.6, .6, .6, .6, .6])]) + def test_rank_resets_each_group(self, pct, exp): + df = DataFrame( + {'key': ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'], + 'val': [1] * 10} + ) + result = df.groupby('key').rank(pct=pct) + exp_df = DataFrame(exp * 2, columns=['val']) + assert_frame_equal(result, exp_df) + + def test_rank_avg_even_vals(self): + df = DataFrame({'key': ['a'] * 4, 'val': [1] * 4}) + result = df.groupby('key').rank() + exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=['val']) + assert_frame_equal(result, exp_df) + + @pytest.mark.parametrize("ties_method", [ + 'average', 'min', 'max', 'first', 'dense']) + @pytest.mark.parametrize("ascending", [True, False]) + @pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) + @pytest.mark.parametrize("pct", [True, False]) + @pytest.mark.parametrize("vals", [ + ['bar', 'bar', 'foo', 'bar', 'baz'], + ['bar', np.nan, 'foo', np.nan, 'baz'] + ]) + def test_rank_object_raises(self, ties_method, ascending, na_option, + pct, vals): + df = DataFrame({'key': ['foo'] * 5, 'val': vals}) + with tm.assert_raises_regex(TypeError, "not callable"): + df.groupby('key').rank(method=ties_method, + ascending=ascending, + na_option=na_option, pct=pct) + def test_dont_clobber_name_column(self): df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'], 'name': ['foo', 'bar', 'baz'] * 2}) From d4730e65fd2dd6235158930f756e1f1afd298488 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sat, 10 Feb 2018 08:08:58 -0800 Subject: [PATCH 081/214] Consolidate nth / last object Groupby Implementations (#19610) --- pandas/_libs/groupby.pyx | 99 ---------------------------- pandas/_libs/groupby_helper.pxi.in | 32 +++++---- pandas/tests/groupby/test_groupby.py | 56 ++++++++-------- 3 files changed, 47 insertions(+), 140 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index d75c3a71896e3..866683ce378ab 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -26,105 +26,6 @@ cdef double NaN = np.NaN cdef double nan = NaN -# TODO: aggregate multiple columns in single pass -# ---------------------------------------------------------------------- -# first, nth, last - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_nth_object(ndarray[object, ndim=2] out, - ndarray[int64_t] counts, - ndarray[object, ndim=2] values, - ndarray[int64_t] labels, - int64_t rank, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - 
cdef: - Py_ssize_t i, j, N, K, lab - object val - float64_t count - ndarray[int64_t, ndim=2] nobs - ndarray[object, ndim=2] resx - - assert min_count == -1, "'min_count' only used in add and prod" - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty(( out).shape, dtype=object) - - N, K = ( values).shape - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = resx[i, j] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_last_object(ndarray[object, ndim=2] out, - ndarray[int64_t] counts, - ndarray[object, ndim=2] values, - ndarray[int64_t] labels, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab - object val - float64_t count - ndarray[object, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - assert min_count == -1, "'min_count' only used in add and prod" - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty(( out).shape, dtype=object) - - N, K = ( values).shape - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = resx[i, j] - - cdef inline float64_t median_linear(float64_t* a, int n) nogil: cdef int i, j, na_count = 0 cdef float64_t result diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index b24444c422efa..58a944a8241dd 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -325,7 +325,8 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, # name, c_type, dest_type2, nan_val dtypes = [('float64', 'float64_t', 'float64_t', 'NAN'), ('float32', 'float32_t', 'float32_t', 'NAN'), - ('int64', 'int64_t', 'int64_t', 'iNaT')] + ('int64', 'int64_t', 'int64_t', 'iNaT'), + ('object', 'object', 'object', 'NAN')] def get_dispatch(dtypes): @@ -350,7 +351,7 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{dest_type2}} val, count + {{dest_type2}} val ndarray[{{dest_type2}}, ndim=2] resx ndarray[int64_t, ndim=2] nobs @@ -360,11 +361,19 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, raise AssertionError("len(index) != len(labels)") nobs = np.zeros(( out).shape, dtype=np.int64) + {{if name=='object'}} + resx = np.empty(( out).shape, dtype=object) + {{else}} resx = np.empty_like(out) + {{endif}} N, K = ( values).shape + {{if name == "object"}} + if True: # make templating happy + {{else}} with nogil: + {{endif}} for i in range(N): lab = labels[i] if lab < 0: @@ -375,11 +384,7 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # not nan - {{if name == 'int64'}} - if val != {{nan_val}}: - {{else}} if val == val and val != {{nan_val}}: - {{endif}} nobs[lab, j] += 1 resx[lab, j] = val @@ -390,7 +395,6 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, else: out[i, j] = resx[i, j] - @cython.wraparound(False) @cython.boundscheck(False) def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, @@ -403,7 +407,7 @@ def 
group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{dest_type2}} val, count + {{dest_type2}} val ndarray[{{dest_type2}}, ndim=2] resx ndarray[int64_t, ndim=2] nobs @@ -413,11 +417,19 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, raise AssertionError("len(index) != len(labels)") nobs = np.zeros(( out).shape, dtype=np.int64) + {{if name=='object'}} + resx = np.empty(( out).shape, dtype=object) + {{else}} resx = np.empty_like(out) + {{endif}} N, K = ( values).shape + {{if name == "object"}} + if True: # make templating happy + {{else}} with nogil: + {{endif}} for i in range(N): lab = labels[i] if lab < 0: @@ -428,11 +440,7 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # not nan - {{if name == 'int64'}} - if val != {{nan_val}}: - {{else}} if val == val and val != {{nan_val}}: - {{endif}} nobs[lab, j] += 1 if nobs[lab, j] == rank: resx[lab, j] = val diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 2db772ac54369..6eacd45deb7bc 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2252,7 +2252,19 @@ def test_median_empty_bins(self): expected = df.groupby(bins).agg(lambda x: x.median()) assert_frame_equal(result, expected) - def test_groupby_non_arithmetic_agg_types(self): + @pytest.mark.parametrize("dtype", [ + 'int8', 'int16', 'int32', 'int64', 'float32', 'float64']) + @pytest.mark.parametrize("method,data", [ + ('first', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}), + ('last', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}), + ('min', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}), + ('max', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}), + ('nth', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}], + 'args': [1]}), + ('count', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}], + 'out_type': 'int64'}) + ]) + def test_groupby_non_arithmetic_agg_types(self, dtype, method, data): # GH9311, GH6620 df = pd.DataFrame( [{'a': 1, 'b': 1}, @@ -2260,39 +2272,25 @@ def test_groupby_non_arithmetic_agg_types(self): {'a': 2, 'b': 3}, {'a': 2, 'b': 4}]) - dtypes = ['int8', 'int16', 'int32', 'int64', 'float32', 'float64'] - - grp_exp = {'first': {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}, - 'last': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}, - 'min': {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}, - 'max': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}, - 'nth': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}], - 'args': [1]}, - 'count': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}], - 'out_type': 'int64'}} + df['b'] = df.b.astype(dtype) - for dtype in dtypes: - df_in = df.copy() - df_in['b'] = df_in.b.astype(dtype) + if 'args' not in data: + data['args'] = [] - for method, data in compat.iteritems(grp_exp): - if 'args' not in data: - data['args'] = [] - - if 'out_type' in data: - out_type = data['out_type'] - else: - out_type = dtype + if 'out_type' in data: + out_type = data['out_type'] + else: + out_type = dtype - exp = data['df'] - df_out = pd.DataFrame(exp) + exp = data['df'] + df_out = pd.DataFrame(exp) - df_out['b'] = df_out.b.astype(out_type) - df_out.set_index('a', inplace=True) + df_out['b'] = df_out.b.astype(out_type) + df_out.set_index('a', inplace=True) - grpd = df_in.groupby('a') - t = getattr(grpd, method)(*data['args']) - assert_frame_equal(t, df_out) + grpd = df.groupby('a') + t = getattr(grpd, method)(*data['args']) + assert_frame_equal(t, df_out) def test_groupby_non_arithmetic_agg_intlike_precision(self): # 
GH9311, GH6620 From 5c76f33a106d071e3b5620f908cfc760d8e3d6b2 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 10 Feb 2018 11:09:58 -0500 Subject: [PATCH 082/214] Revert "Consolidate nth / last object Groupby Implementations (#19610)" This reverts commit d4730e65fd2dd6235158930f756e1f1afd298488. --- pandas/_libs/groupby.pyx | 99 ++++++++++++++++++++++++++++ pandas/_libs/groupby_helper.pxi.in | 32 ++++----- pandas/tests/groupby/test_groupby.py | 56 ++++++++-------- 3 files changed, 140 insertions(+), 47 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 866683ce378ab..d75c3a71896e3 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -26,6 +26,105 @@ cdef double NaN = np.NaN cdef double nan = NaN +# TODO: aggregate multiple columns in single pass +# ---------------------------------------------------------------------- +# first, nth, last + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_nth_object(ndarray[object, ndim=2] out, + ndarray[int64_t] counts, + ndarray[object, ndim=2] values, + ndarray[int64_t] labels, + int64_t rank, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab + object val + float64_t count + ndarray[int64_t, ndim=2] nobs + ndarray[object, ndim=2] resx + + assert min_count == -1, "'min_count' only used in add and prod" + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty(( out).shape, dtype=object) + + N, K = ( values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_last_object(ndarray[object, ndim=2] out, + ndarray[int64_t] counts, + ndarray[object, ndim=2] values, + ndarray[int64_t] labels, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab + object val + float64_t count + ndarray[object, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + assert min_count == -1, "'min_count' only used in add and prod" + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty(( out).shape, dtype=object) + + N, K = ( values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] + + cdef inline float64_t median_linear(float64_t* a, int n) nogil: cdef int i, j, na_count = 0 cdef float64_t result diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 58a944a8241dd..b24444c422efa 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -325,8 +325,7 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, # name, c_type, dest_type2, nan_val dtypes = [('float64', 'float64_t', 'float64_t', 'NAN'), ('float32', 'float32_t', 'float32_t', 'NAN'), - ('int64', 'int64_t', 'int64_t', 'iNaT'), - ('object', 'object', 'object', 'NAN')] + ('int64', 'int64_t', 'int64_t', 'iNaT')] def get_dispatch(dtypes): @@ -351,7 +350,7 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, 
ndim=2] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{dest_type2}} val + {{dest_type2}} val, count ndarray[{{dest_type2}}, ndim=2] resx ndarray[int64_t, ndim=2] nobs @@ -361,19 +360,11 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, raise AssertionError("len(index) != len(labels)") nobs = np.zeros(( out).shape, dtype=np.int64) - {{if name=='object'}} - resx = np.empty(( out).shape, dtype=object) - {{else}} resx = np.empty_like(out) - {{endif}} N, K = ( values).shape - {{if name == "object"}} - if True: # make templating happy - {{else}} with nogil: - {{endif}} for i in range(N): lab = labels[i] if lab < 0: @@ -384,7 +375,11 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # not nan + {{if name == 'int64'}} + if val != {{nan_val}}: + {{else}} if val == val and val != {{nan_val}}: + {{endif}} nobs[lab, j] += 1 resx[lab, j] = val @@ -395,6 +390,7 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, else: out[i, j] = resx[i, j] + @cython.wraparound(False) @cython.boundscheck(False) def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, @@ -407,7 +403,7 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{dest_type2}} val + {{dest_type2}} val, count ndarray[{{dest_type2}}, ndim=2] resx ndarray[int64_t, ndim=2] nobs @@ -417,19 +413,11 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, raise AssertionError("len(index) != len(labels)") nobs = np.zeros(( out).shape, dtype=np.int64) - {{if name=='object'}} - resx = np.empty(( out).shape, dtype=object) - {{else}} resx = np.empty_like(out) - {{endif}} N, K = ( values).shape - {{if name == "object"}} - if True: # make templating happy - {{else}} with nogil: - {{endif}} for i in range(N): lab = labels[i] if lab < 0: @@ -440,7 +428,11 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # not nan + {{if name == 'int64'}} + if val != {{nan_val}}: + {{else}} if val == val and val != {{nan_val}}: + {{endif}} nobs[lab, j] += 1 if nobs[lab, j] == rank: resx[lab, j] = val diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 6eacd45deb7bc..2db772ac54369 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2252,19 +2252,7 @@ def test_median_empty_bins(self): expected = df.groupby(bins).agg(lambda x: x.median()) assert_frame_equal(result, expected) - @pytest.mark.parametrize("dtype", [ - 'int8', 'int16', 'int32', 'int64', 'float32', 'float64']) - @pytest.mark.parametrize("method,data", [ - ('first', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}), - ('last', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}), - ('min', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}), - ('max', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}), - ('nth', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}], - 'args': [1]}), - ('count', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}], - 'out_type': 'int64'}) - ]) - def test_groupby_non_arithmetic_agg_types(self, dtype, method, data): + def test_groupby_non_arithmetic_agg_types(self): # GH9311, GH6620 df = pd.DataFrame( [{'a': 1, 'b': 1}, @@ -2272,25 +2260,39 @@ def test_groupby_non_arithmetic_agg_types(self, dtype, method, data): {'a': 2, 'b': 3}, {'a': 2, 'b': 4}]) - df['b'] = df.b.astype(dtype) + dtypes = ['int8', 'int16', 'int32', 'int64', 'float32', 'float64'] - if 'args' not in data: - data['args'] = [] + grp_exp = {'first': {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 
3}]}, + 'last': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}, + 'min': {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}, + 'max': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}, + 'nth': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}], + 'args': [1]}, + 'count': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}], + 'out_type': 'int64'}} - if 'out_type' in data: - out_type = data['out_type'] - else: - out_type = dtype + for dtype in dtypes: + df_in = df.copy() + df_in['b'] = df_in.b.astype(dtype) + + for method, data in compat.iteritems(grp_exp): + if 'args' not in data: + data['args'] = [] + + if 'out_type' in data: + out_type = data['out_type'] + else: + out_type = dtype - exp = data['df'] - df_out = pd.DataFrame(exp) + exp = data['df'] + df_out = pd.DataFrame(exp) - df_out['b'] = df_out.b.astype(out_type) - df_out.set_index('a', inplace=True) + df_out['b'] = df_out.b.astype(out_type) + df_out.set_index('a', inplace=True) - grpd = df.groupby('a') - t = getattr(grpd, method)(*data['args']) - assert_frame_equal(t, df_out) + grpd = df_in.groupby('a') + t = getattr(grpd, method)(*data['args']) + assert_frame_equal(t, df_out) def test_groupby_non_arithmetic_agg_intlike_precision(self): # GH9311, GH6620 From bae38fc6240c28e9ce26feea04d55c6912d2a09d Mon Sep 17 00:00:00 2001 From: Jan Koch Date: Sat, 10 Feb 2018 17:20:17 +0100 Subject: [PATCH 083/214] ENH: df.assign accepting dependent **kwargs (#14207) (#18852) --- doc/source/dsintro.rst | 85 +++++++++++++++++------ doc/source/whatsnew/v0.23.0.txt | 40 +++++++++++ pandas/core/frame.py | 49 ++++++++----- pandas/tests/frame/test_mutate_columns.py | 26 ++++++- 4 files changed, 163 insertions(+), 37 deletions(-) diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index d7650b6b0938f..78e2fdb46f659 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -95,7 +95,7 @@ constructed from the sorted keys of the dict, if possible. NaN (not a number) is the standard missing data marker used in pandas. -**From scalar value** +**From scalar value** If ``data`` is a scalar value, an index must be provided. The value will be repeated to match the length of **index**. @@ -154,7 +154,7 @@ See also the :ref:`section on attribute access`. Vectorized operations and label alignment with Series ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -When working with raw NumPy arrays, looping through value-by-value is usually +When working with raw NumPy arrays, looping through value-by-value is usually not necessary. The same is true when working with Series in pandas. Series can also be passed into most NumPy methods expecting an ndarray. @@ -324,7 +324,7 @@ From a list of dicts From a dict of tuples ~~~~~~~~~~~~~~~~~~~~~ -You can automatically create a multi-indexed frame by passing a tuples +You can automatically create a multi-indexed frame by passing a tuples dictionary. .. ipython:: python @@ -347,7 +347,7 @@ column name provided). **Missing Data** Much more will be said on this topic in the :ref:`Missing data ` -section. To construct a DataFrame with missing data, we use ``np.nan`` to +section. To construct a DataFrame with missing data, we use ``np.nan`` to represent missing values. Alternatively, you may pass a ``numpy.MaskedArray`` as the data argument to the DataFrame constructor, and its masked entries will be considered missing. @@ -370,7 +370,7 @@ set to ``'index'`` in order to use the dict keys as row labels. ``DataFrame.from_records`` takes a list of tuples or an ndarray with structured dtype. 
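To picture the structured-dtype case before the fuller example that follows, a small sketch with a hypothetical record array:

    import numpy as np
    import pandas as pd

    data = np.array([(1, 2.0, b'Hello'), (2, 3.0, b'World')],
                    dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'S10')])
    # The 'C' field of the structured dtype is used as the resulting index.
    pd.DataFrame.from_records(data, index='C')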
It works analogously to the normal ``DataFrame`` constructor, except that -the resulting DataFrame index may be a specific field of the structured +the resulting DataFrame index may be a specific field of the structured dtype. For example: .. ipython:: python @@ -506,25 +506,70 @@ to be inserted (for example, a ``Series`` or NumPy array), or a function of one argument to be called on the ``DataFrame``. A *copy* of the original DataFrame is returned, with the new values inserted. +.. versionmodified:: 0.23.0 + +Starting with Python 3.6 the order of ``**kwargs`` is preserved. This allows +for *dependent* assignment, where an expression later in ``**kwargs`` can refer +to a column created earlier in the same :meth:`~DataFrame.assign`. + +.. ipython:: python + + dfa = pd.DataFrame({"A": [1, 2, 3], + "B": [4, 5, 6]}) + dfa.assign(C=lambda x: x['A'] + x['B'], + D=lambda x: x['A'] + x['C']) + +In the second expression, ``x['C']`` will refer to the newly created column, +that's equal to ``dfa['A'] + dfa['B']``. + +To write code compatible with all versions of Python, split the assignment in two. + +.. ipython:: python + + dependent = pd.DataFrame({"A": [1, 1, 1]}) + (dependent.assign(A=lambda x: x['A'] + 1) + .assign(B=lambda x: x['A'] + 2)) + .. warning:: - Since the function signature of ``assign`` is ``**kwargs``, a dictionary, - the order of the new columns in the resulting DataFrame cannot be guaranteed - to match the order you pass in. To make things predictable, items are inserted - alphabetically (by key) at the end of the DataFrame. + Dependent assignment maybe subtly change the behavior of your code between + Python 3.6 and older versions of Python. + + If you wish write code that supports versions of python before and after 3.6, + you'll need to take care when passing ``assign`` expressions that + + * Updating an existing column + * Refering to the newly updated column in the same ``assign`` + + For example, we'll update column "A" and then refer to it when creating "B". + + .. code-block:: python + + >>> dependent = pd.DataFrame({"A": [1, 1, 1]}) + >>> dependent.assign(A=lambda x: x["A"] + 1, + B=lambda x: x["A"] + 2) + + For Python 3.5 and earlier the expression creating ``B`` refers to the + "old" value of ``A``, ``[1, 1, 1]``. The output is then + + .. code-block:: python + + A B + 0 2 3 + 1 2 3 + 2 2 3 + + For Python 3.6 and later, the expression creating ``A`` refers to the + "new" value of ``A``, ``[2, 2, 2]``, which results in + + .. code-block:: python - All expressions are computed first, and then assigned. So you can't refer - to another column being assigned in the same call to ``assign``. For example: + A B + 0 2 4 + 1 2 4 + 2 2 4 - .. ipython:: - :verbatim: - In [1]: # Don't do this, bad reference to `C` - df.assign(C = lambda x: x['A'] + x['B'], - D = lambda x: x['A'] + x['C']) - In [2]: # Instead, break it into two assigns - (df.assign(C = lambda x: x['A'] + x['B']) - .assign(D = lambda x: x['A'] + x['C'])) Indexing / Selection ~~~~~~~~~~~~~~~~~~~~ @@ -914,7 +959,7 @@ For example, using the earlier example data, we could do: Squeezing ~~~~~~~~~ -Another way to change the dimensionality of an object is to ``squeeze`` a 1-len +Another way to change the dimensionality of an object is to ``squeeze`` a 1-len object, similar to ``wp['Item1']``. .. 
ipython:: python diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index cf5a44442045b..db5c79dcb3c42 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -248,6 +248,46 @@ Current Behavior: pd.RangeIndex(1, 5) / 0 +.. _whatsnew_0230.enhancements.assign_dependent: + +``.assign()`` accepts dependent arguments +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :func:`DataFrame.assign` now accepts dependent keyword arguments for python version later than 3.6 (see also `PEP 468 +`_). Later keyword arguments may now refer to earlier ones if the argument is a callable. See the +:ref:`documentation here ` (:issue:`14207`) + +.. ipython:: python + + df = pd.DataFrame({'A': [1, 2, 3]}) + df + df.assign(B=df.A, C=lambda x:x['A']+ x['B']) + +.. warning:: + + This may subtly change the behavior of your code when you're + using ``.assign()`` to update an existing column. Previously, callables + referring to other variables being updated would get the "old" values + + Previous Behaviour: + + .. code-block:: ipython + + In [2]: df = pd.DataFrame({"A": [1, 2, 3]}) + + In [3]: df.assign(A=lambda df: df.A + 1, C=lambda df: df.A * -1) + Out[3]: + A C + 0 2 -1 + 1 3 -2 + 2 4 -3 + + New Behaviour: + + .. ipython:: python + + df.assign(A=df.A+1, C= lambda df: df.A* -1) + .. _whatsnew_0230.enhancements.other: Other Enhancements diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6d8dcb8a1ca89..c99c59db1d8cb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2687,12 +2687,17 @@ def assign(self, **kwargs): Notes ----- - For python 3.6 and above, the columns are inserted in the order of - \*\*kwargs. For python 3.5 and earlier, since \*\*kwargs is unordered, - the columns are inserted in alphabetical order at the end of your - DataFrame. Assigning multiple columns within the same ``assign`` - is possible, but you cannot reference other columns created within - the same ``assign`` call. + Assigning multiple columns within the same ``assign`` is possible. + For Python 3.6 and above, later items in '\*\*kwargs' may refer to + newly created or modified columns in 'df'; items are computed and + assigned into 'df' in order. For Python 3.5 and below, the order of + keyword arguments is not specified, you cannot refer to newly created + or modified columns. All items are computed first, and then assigned + in alphabetical order. + + .. versionmodified :: 0.23.0 + + Keyword argument order is maintained for Python 3.6 and later. Examples -------- @@ -2728,22 +2733,34 @@ def assign(self, **kwargs): 7 8 -1.495604 2.079442 8 9 0.549296 2.197225 9 10 -0.758542 2.302585 + + Where the keyword arguments depend on each other + + >>> df = pd.DataFrame({'A': [1, 2, 3]}) + + >>> df.assign(B=df.A, C=lambda x:x['A']+ x['B']) + A B C + 0 1 1 2 + 1 2 2 4 + 2 3 3 6 """ data = self.copy() - # do all calculations first... - results = OrderedDict() - for k, v in kwargs.items(): - results[k] = com._apply_if_callable(v, data) - - # preserve order for 3.6 and later, but sort by key for 3.5 and earlier + # >= 3.6 preserve order of kwargs if PY36: - results = results.items() + for k, v in kwargs.items(): + data[k] = com._apply_if_callable(v, data) else: + # <= 3.5: do all calculations first... + results = OrderedDict() + for k, v in kwargs.items(): + results[k] = com._apply_if_callable(v, data) + + # <= 3.5 and earlier results = sorted(results.items()) - # ... and then assign - for k, v in results: - data[k] = v + # ... 
and then assign + for k, v in results: + data[k] = v return data def _sanitize_column(self, key, value, broadcast=True): diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 9acdf2f17d86a..8236a41d00243 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -89,11 +89,35 @@ def test_assign_bad(self): df.assign(lambda x: x.A) with pytest.raises(AttributeError): df.assign(C=df.A, D=df.A + df.C) + + @pytest.mark.skipif(PY36, reason="""Issue #14207: valid for python + 3.6 and above""") + def test_assign_dependent_old_python(self): + df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + + # Key C does not exist at defition time of df with pytest.raises(KeyError): - df.assign(C=lambda df: df.A, D=lambda df: df['A'] + df['C']) + df.assign(C=lambda df: df.A, + D=lambda df: df['A'] + df['C']) with pytest.raises(KeyError): df.assign(C=df.A, D=lambda x: x['A'] + x['C']) + @pytest.mark.skipif(not PY36, reason="""Issue #14207: not valid for + python 3.5 and below""") + def test_assign_dependent(self): + df = DataFrame({'A': [1, 2], 'B': [3, 4]}) + + result = df.assign(C=df.A, D=lambda x: x['A'] + x['C']) + expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], + columns=list('ABCD')) + assert_frame_equal(result, expected) + + result = df.assign(C=lambda df: df.A, + D=lambda df: df['A'] + df['C']) + expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], + columns=list('ABCD')) + assert_frame_equal(result, expected) + def test_insert_error_msmgs(self): # GH 7432 From e2ea151dee0079930365865fb3c974c9bfa3c60f Mon Sep 17 00:00:00 2001 From: elrubio <1485187+elrubio@users.noreply.github.com> Date: Sat, 10 Feb 2018 17:53:38 +0100 Subject: [PATCH 084/214] Fix left join turning into outer join (#19624) --- doc/source/whatsnew/v0.23.0.txt | 2 +- pandas/core/frame.py | 13 ++++++------- pandas/tests/frame/test_join.py | 17 +++++++++++++++++ 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index db5c79dcb3c42..03e8bce7e5102 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -813,7 +813,7 @@ Reshaping - Bug in timezone comparisons, manifesting as a conversion of the index to UTC in ``.concat()`` (:issue:`18523`) - Bug in :func:`concat` when concatting sparse and dense series it returns only a ``SparseDataFrame``. Should be a ``DataFrame``. 
(:issue:`18914`, :issue:`18686`, and :issue:`16874`) - Improved error message for :func:`DataFrame.merge` when there is no common merge key (:issue:`19427`) -- +- Bug in :func:`DataFrame.join` which does an *outer* instead of a *left* join when being called with multiple DataFrames and some have non-unique indices (:issue:`19624`) Other ^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c99c59db1d8cb..23579d84a3964 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5345,18 +5345,17 @@ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', raise ValueError('Joining multiple DataFrames only supported' ' for joining on index') - # join indexes only using concat - if how == 'left': - how = 'outer' - join_axes = [self.index] - else: - join_axes = None - frames = [self] + list(other) can_concat = all(df.index.is_unique for df in frames) + # join indexes only using concat if can_concat: + if how == 'left': + how = 'outer' + join_axes = [self.index] + else: + join_axes = None return concat(frames, axis=1, join=how, join_axes=join_axes, verify_integrity=True) diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py index afecba2026dd7..ccdba6df2521a 100644 --- a/pandas/tests/frame/test_join.py +++ b/pandas/tests/frame/test_join.py @@ -165,3 +165,20 @@ def test_join_period_index(frame_with_period_index): index=frame_with_period_index.index) tm.assert_frame_equal(joined, expected) + + +def test_join_left_sequence_non_unique_index(): + # https://github.com/pandas-dev/pandas/issues/19607 + df1 = DataFrame({'a': [0, 10, 20]}, index=[1, 2, 3]) + df2 = DataFrame({'b': [100, 200, 300]}, index=[4, 3, 2]) + df3 = DataFrame({'c': [400, 500, 600]}, index=[2, 2, 4]) + + joined = df1.join([df2, df3], how='left') + + expected = DataFrame({ + 'a': [0, 10, 10, 20], + 'b': [np.nan, 300, 300, 200], + 'c': [np.nan, 400, 500, np.nan] + }, index=[1, 2, 2, 3]) + + tm.assert_frame_equal(joined, expected) From 5e7fabc2cae7ead896656459aa81a46e94e0db82 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 10 Feb 2018 08:59:40 -0800 Subject: [PATCH 085/214] function for frequently repeated tzconversion code (#19625) --- pandas/_libs/tslibs/conversion.pxd | 4 +- pandas/_libs/tslibs/conversion.pyx | 118 ++++++++++++++++++----------- pandas/_libs/tslibs/period.pyx | 9 +-- pandas/_libs/tslibs/resolution.pyx | 9 +-- 4 files changed, 85 insertions(+), 55 deletions(-) diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index 6e7df10e7c424..0d5e9e3fc5152 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # cython: profile=False -from cpython.datetime cimport datetime +from cpython.datetime cimport datetime, tzinfo from numpy cimport int64_t, int32_t @@ -30,3 +30,5 @@ cdef int64_t get_datetime64_nanos(object val) except? -1 cpdef int64_t pydt_to_i8(object pydt) except? 
-1 cdef maybe_datetimelike_to_i8(object val) + +cdef int64_t tz_convert_utc_to_tzlocal(int64_t utc_val, tzinfo tz) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 4f1a053da6f1d..cfbcb922cb47d 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -499,7 +499,7 @@ cdef inline void _localize_tso(_TSObject obj, object tz): """ cdef: ndarray[int64_t] trans, deltas - int64_t delta + int64_t delta, local_val Py_ssize_t posn datetime dt @@ -510,11 +510,8 @@ cdef inline void _localize_tso(_TSObject obj, object tz): elif obj.value == NPY_NAT: pass elif is_tzlocal(tz): - dt64_to_dtstruct(obj.value, &obj.dts) - dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day, obj.dts.hour, - obj.dts.min, obj.dts.sec, obj.dts.us, tz) - delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 - dt64_to_dtstruct(obj.value + delta, &obj.dts) + local_val = tz_convert_utc_to_tzlocal(obj.value, tz) + dt64_to_dtstruct(local_val, &obj.dts) else: # Adjust datetime64 timestamp, recompute datetimestruct trans, deltas, typ = get_dst_info(tz) @@ -556,6 +553,66 @@ cdef inline datetime _localize_pydatetime(datetime dt, tzinfo tz): # ---------------------------------------------------------------------- # Timezone Conversion +cdef inline int64_t tz_convert_tzlocal_to_utc(int64_t val, tzinfo tz): + """ + Parameters + ---------- + val : int64_t + tz : tzinfo + + Returns + ------- + utc_date : int64_t + + See Also + -------- + tz_convert_utc_to_tzlocal + """ + cdef: + pandas_datetimestruct dts + int64_t utc_date, delta + datetime dt + + dt64_to_dtstruct(val, &dts) + dt = datetime(dts.year, dts.month, dts.day, dts.hour, + dts.min, dts.sec, dts.us, tz) + delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 + utc_date = val - delta + return utc_date + + +cdef inline int64_t tz_convert_utc_to_tzlocal(int64_t utc_val, tzinfo tz): + """ + Parameters + ---------- + utc_val : int64_t + tz : tzinfo + + Returns + ------- + local_val : int64_t + + See Also + -------- + tz_convert_tzlocal_to_utc + + Notes + ----- + The key difference between this and tz_convert_tzlocal_to_utc is a + an addition flipped to a subtraction in the last line. 
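These two helpers handle the ``tzlocal`` branch of the conversions above; conceptually they map a wall time to the UTC-based int64 that pandas stores, and back. At the user level the same round trip looks like this rough sketch (hypothetical timestamp, using a pytz zone):

    import pandas as pd

    ts = pd.Timestamp('2018-02-10 15:00').tz_localize('US/Eastern')
    ts.tz_convert('UTC')   # Timestamp('2018-02-10 20:00:00+0000', tz='UTC')
    # The underlying int64 (ts.value) is always the UTC epoch-nanosecond value,
    # which is what these conversion routines operate on.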
+ """ + cdef: + pandas_datetimestruct dts + int64_t local_val, delta + datetime dt + + dt64_to_dtstruct(utc_val, &dts) + dt = datetime(dts.year, dts.month, dts.day, dts.hour, + dts.min, dts.sec, dts.us, tz) + delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 + local_val = utc_val + delta + return local_val + cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2): """ @@ -590,11 +647,7 @@ cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2): # Convert to UTC if is_tzlocal(tz1): - dt64_to_dtstruct(val, &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz1) - delta = int(get_utcoffset(tz1, dt).total_seconds()) * 1000000000 - utc_date = val - delta + utc_date = tz_convert_tzlocal_to_utc(val, tz1) elif get_timezone(tz1) != 'UTC': trans, deltas, typ = get_dst_info(tz1) pos = trans.searchsorted(val, side='right') - 1 @@ -608,11 +661,7 @@ cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2): if get_timezone(tz2) == 'UTC': return utc_date elif is_tzlocal(tz2): - dt64_to_dtstruct(utc_date, &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz2) - delta = int(get_utcoffset(tz2, dt).total_seconds()) * 1000000000 - return utc_date + delta + return tz_convert_utc_to_tzlocal(utc_date, tz2) # Convert UTC to other timezone trans, deltas, typ = get_dst_info(tz2) @@ -662,12 +711,7 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): if v == NPY_NAT: utc_dates[i] = NPY_NAT else: - dt64_to_dtstruct(v, &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz1) - delta = (int(get_utcoffset(tz1, dt).total_seconds()) * - 1000000000) - utc_dates[i] = v - delta + utc_dates[i] = tz_convert_tzlocal_to_utc(v, tz1) else: trans, deltas, typ = get_dst_info(tz1) @@ -702,12 +746,7 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): if v == NPY_NAT: result[i] = NPY_NAT else: - dt64_to_dtstruct(v, &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz2) - delta = (int(get_utcoffset(tz2, dt).total_seconds()) * - 1000000000) - result[i] = v + delta + result[i] = tz_convert_utc_to_tzlocal(v, tz2) return result # Convert UTC to other timezone @@ -777,11 +816,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, if is_tzlocal(tz): for i in range(n): v = vals[i] - dt64_to_dtstruct(v, &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz) - delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 - result[i] = v - delta + result[i] = tz_convert_tzlocal_to_utc(v, tz) return result if is_string_object(ambiguous): @@ -1024,11 +1059,8 @@ cdef ndarray[int64_t] _normalize_local(ndarray[int64_t] stamps, object tz): if stamps[i] == NPY_NAT: result[i] = NPY_NAT continue - dt64_to_dtstruct(stamps[i], &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz) - delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 - dt64_to_dtstruct(stamps[i] + delta, &dts) + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + dt64_to_dtstruct(local_val, &dts) result[i] = _normalized_stamp(&dts) else: # Adjust datetime64 timestamp, recompute datetimestruct @@ -1097,7 +1129,7 @@ def is_date_array_normalized(ndarray[int64_t] stamps, tz=None): Py_ssize_t i, n = len(stamps) ndarray[int64_t] trans, deltas pandas_datetimestruct dts - datetime dt + int64_t local_val if tz is None or is_utc(tz): for i in 
range(n): @@ -1106,11 +1138,9 @@ def is_date_array_normalized(ndarray[int64_t] stamps, tz=None): return False elif is_tzlocal(tz): for i in range(n): - dt64_to_dtstruct(stamps[i], &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, - dts.sec, dts.us, tz) - dt = dt + tz.utcoffset(dt) - if (dt.hour + dt.minute + dt.second + dt.microsecond) > 0: + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + dt64_to_dtstruct(local_val, &dts) + if (dts.hour + dts.min + dts.sec + dts.us) > 0: return False else: trans, deltas, typ = get_dst_info(tz) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 3c396a9ff4f3c..dc5d058f41d11 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -39,6 +39,7 @@ cimport ccalendar from ccalendar cimport dayofweek, get_day_of_year from ccalendar import MONTH_NUMBERS from ccalendar cimport is_leapyear +from conversion cimport tz_convert_utc_to_tzlocal from frequencies cimport (get_freq_code, get_base_alias, get_to_timestamp_base, get_freq_str, get_rule_month) @@ -591,6 +592,7 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, ndarray[int64_t] result = np.empty(n, dtype=np.int64) ndarray[int64_t] trans, deltas, pos pandas_datetimestruct dts + int64_t local_val if is_utc(tz): for i in range(n): @@ -607,11 +609,8 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, if stamps[i] == NPY_NAT: result[i] = NPY_NAT continue - dt64_to_dtstruct(stamps[i], &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz) - delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 - dt64_to_dtstruct(stamps[i] + delta, &dts) + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + dt64_to_dtstruct(local_val, &dts) result[i] = get_period_ordinal(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, dts.ps, freq) diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx index b166babe5992c..d0a9501afe566 100644 --- a/pandas/_libs/tslibs/resolution.pyx +++ b/pandas/_libs/tslibs/resolution.pyx @@ -23,6 +23,7 @@ from timezones cimport (is_utc, is_tzlocal, maybe_get_tz, get_dst_info, get_utcoffset) from fields import build_field_sarray from conversion import tz_convert +from conversion cimport tz_convert_utc_to_tzlocal from ccalendar import MONTH_ALIASES, int_to_weekday from pandas._libs.properties import cache_readonly @@ -78,6 +79,7 @@ cdef _reso_local(ndarray[int64_t] stamps, object tz): int reso = RESO_DAY, curr_reso ndarray[int64_t] trans, deltas, pos pandas_datetimestruct dts + int64_t local_val if is_utc(tz): for i in range(n): @@ -91,11 +93,8 @@ cdef _reso_local(ndarray[int64_t] stamps, object tz): for i in range(n): if stamps[i] == NPY_NAT: continue - dt64_to_dtstruct(stamps[i], &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz) - delta = int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 - dt64_to_dtstruct(stamps[i] + delta, &dts) + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + dt64_to_dtstruct(local_val, &dts) curr_reso = _reso_stamp(&dts) if curr_reso < reso: reso = curr_reso From fe972fb1310b5545e123a54b90282c8d81f59d40 Mon Sep 17 00:00:00 2001 From: jschendel Date: Sat, 10 Feb 2018 10:02:28 -0700 Subject: [PATCH 086/214] API: Allow ordered=None in CategoricalDtype (#18889) --- doc/source/whatsnew/v0.23.0.txt | 23 ++++ pandas/core/arrays/categorical.py | 12 +- pandas/core/dtypes/dtypes.py | 54 ++++++--- 
pandas/core/indexes/category.py | 2 +-
pandas/tests/dtypes/test_dtypes.py | 179 ++++++++++++++++-------------
5 files changed, 168 insertions(+), 102 deletions(-)
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index 03e8bce7e5102..6f48d9a6c63c9 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -500,6 +500,29 @@ To restore previous behavior, simply set ``expand`` to ``False``:
     extracted
     type(extracted)
+.. _whatsnew_0230.api_breaking.cdt_ordered:
+
+Default value for the ``ordered`` parameter of ``CategoricalDtype``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The default value of the ``ordered`` parameter for :class:`~pandas.api.types.CategoricalDtype` has changed from ``False`` to ``None`` to allow updating of ``categories`` without impacting ``ordered``. Behavior should remain consistent for downstream objects, such as :class:`Categorical` (:issue:`18790`)
+
+In previous versions, the default value for the ``ordered`` parameter was ``False``. This could potentially lead to the ``ordered`` parameter unintentionally being changed from ``True`` to ``False`` when users attempt to update ``categories`` if ``ordered`` is not explicitly specified, as it would silently default to ``False``. The new behavior for ``ordered=None`` is to retain the existing value of ``ordered``.
+
+New Behavior:
+
+.. ipython:: python
+
+    from pandas.api.types import CategoricalDtype
+    cat = pd.Categorical(list('abcaba'), ordered=True, categories=list('cba'))
+    cat
+    cdt = CategoricalDtype(categories=list('cbad'))
+    cat.astype(cdt)
+
+Notice in the example above that the converted ``Categorical`` has retained ``ordered=True``. Had the default value for ``ordered`` remained as ``False``, the converted ``Categorical`` would have become unordered, despite ``ordered=False`` never being explicitly specified. To change the value of ``ordered``, explicitly pass it to the new dtype, e.g. ``CategoricalDtype(categories=list('cbad'), ordered=False)``.
+
+Note that the unintentional conversion of ``ordered`` discussed above did not arise in previous versions due to separate bugs that prevented ``astype`` from doing any type of category to category conversion (:issue:`10696`, :issue:`18593`). These bugs have been fixed in this release, and motivated changing the default value of ``ordered``.
+
 .. _whatsnew_0230.api:
 Other API Changes
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 62c6a6b16cbe9..93250bdbb5054 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -243,7 +243,7 @@ class Categorical(ExtensionArray, PandasObject):
     # For comparisons, so that numpy uses our implementation if the compare
     # ops, which raise
     __array_priority__ = 1000
-    _dtype = CategoricalDtype()
+    _dtype = CategoricalDtype(ordered=False)
     _deprecations = frozenset(['labels'])
     _typ = 'categorical'
@@ -294,7 +294,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
         if fastpath:
             self._codes = coerce_indexer_dtype(values, categories)
-            self._dtype = dtype
+            self._dtype = self._dtype.update_dtype(dtype)
             return
         # null_mask indicates missing values we want to exclude from inference.
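The whatsnew hunk above already shows the new semantics from the interactive prompt. As a compact restatement of the same contract, here is a minimal sketch against the public API only; it assumes a pandas build that already includes this change (0.23.0 or later), and names such as ``converted`` are purely illustrative::

    import pandas as pd
    from pandas.api.types import CategoricalDtype

    # Start from an ordered Categorical (ordering taken from categories=list('cba')).
    cat = pd.Categorical(list('abcaba'), ordered=True, categories=list('cba'))

    # A dtype that only changes the categories; ordered is left at the new
    # default of None, which means "keep whatever the existing dtype has".
    cdt = CategoricalDtype(categories=list('cbad'))
    converted = cat.astype(cdt)
    assert converted.ordered is True  # ordered=True was retained

    # Dropping the ordering now has to be explicit.
    unordered = cat.astype(CategoricalDtype(categories=list('cbad'), ordered=False))
    assert unordered.ordered is False

Under the old ``ordered=False`` default the first ``astype`` would have silently produced an unordered result, which is exactly what the ``ordered=None`` default avoids.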
@@ -358,7 +358,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, full_codes[~null_mask] = codes codes = full_codes - self._dtype = dtype + self._dtype = self._dtype.update_dtype(dtype) self._codes = coerce_indexer_dtype(codes, dtype.categories) @property @@ -438,7 +438,7 @@ def astype(self, dtype, copy=True): """ if is_categorical_dtype(dtype): # GH 10696/18593 - dtype = self.dtype._update_dtype(dtype) + dtype = self.dtype.update_dtype(dtype) self = self.copy() if copy else self if dtype == self.dtype: return self @@ -560,7 +560,7 @@ def from_codes(cls, codes, categories, ordered=False): raise ValueError( "codes need to be convertible to an arrays of integers") - categories = CategoricalDtype._validate_categories(categories) + categories = CategoricalDtype.validate_categories(categories) if len(codes) and (codes.max() >= len(categories) or codes.min() < -1): raise ValueError("codes need to be between -1 and " @@ -1165,7 +1165,7 @@ def __setstate__(self, state): # Provide compatibility with pre-0.15.0 Categoricals. if '_categories' not in state and '_levels' in state: - state['_categories'] = self.dtype._validate_categories(state.pop( + state['_categories'] = self.dtype.validate_categories(state.pop( '_levels')) if '_codes' not in state and 'labels' in state: state['_codes'] = coerce_indexer_dtype( diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index d8d3a96992757..99e4033f104db 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -159,11 +159,11 @@ class CategoricalDtype(PandasExtensionDtype): _metadata = ['categories', 'ordered'] _cache = {} - def __init__(self, categories=None, ordered=False): + def __init__(self, categories=None, ordered=None): self._finalize(categories, ordered, fastpath=False) @classmethod - def _from_fastpath(cls, categories=None, ordered=False): + def _from_fastpath(cls, categories=None, ordered=None): self = cls.__new__(cls) self._finalize(categories, ordered, fastpath=True) return self @@ -180,14 +180,12 @@ def _from_categorical_dtype(cls, dtype, categories=None, ordered=None): def _finalize(self, categories, ordered, fastpath=False): - if ordered is None: - ordered = False - else: - self._validate_ordered(ordered) + if ordered is not None: + self.validate_ordered(ordered) if categories is not None: - categories = self._validate_categories(categories, - fastpath=fastpath) + categories = self.validate_categories(categories, + fastpath=fastpath) self._categories = categories self._ordered = ordered @@ -208,6 +206,17 @@ def __hash__(self): return int(self._hash_categories(self.categories, self.ordered)) def __eq__(self, other): + """ + Rules for CDT equality: + 1) Any CDT is equal to the string 'category' + 2) Any CDT is equal to a CDT with categories=None regardless of ordered + 3) A CDT with ordered=True is only equal to another CDT with + ordered=True and identical categories in the same order + 4) A CDT with ordered={False, None} is only equal to another CDT with + ordered={False, None} and identical categories, but same order is + not required. There is no distinction between False/None. + 5) Any other comparison returns False + """ if isinstance(other, compat.string_types): return other == self.name @@ -220,12 +229,16 @@ def __eq__(self, other): # CDT(., .) = CDT(None, False) and *all* # CDT(., .) = CDT(None, True). 
return True - elif self.ordered: - return other.ordered and self.categories.equals(other.categories) - elif other.ordered: - return False + elif self.ordered or other.ordered: + # At least one has ordered=True; equal if both have ordered=True + # and the same values for categories in the same order. + return ((self.ordered == other.ordered) and + self.categories.equals(other.categories)) else: - # both unordered; this could probably be optimized / cached + # Neither has ordered=True; equal if both have the same categories, + # but same order is not necessary. There is no distinction between + # ordered=False and ordered=None: CDT(., False) and CDT(., None) + # will be equal if they have the same categories. return hash(self) == hash(other) def __repr__(self): @@ -288,7 +301,7 @@ def construct_from_string(cls, string): raise TypeError("cannot construct a CategoricalDtype") @staticmethod - def _validate_ordered(ordered): + def validate_ordered(ordered): """ Validates that we have a valid ordered parameter. If it is not a boolean, a TypeError will be raised. @@ -308,7 +321,7 @@ def _validate_ordered(ordered): raise TypeError("'ordered' must either be 'True' or 'False'") @staticmethod - def _validate_categories(categories, fastpath=False): + def validate_categories(categories, fastpath=False): """ Validates that we have good categories @@ -340,7 +353,7 @@ def _validate_categories(categories, fastpath=False): return categories - def _update_dtype(self, dtype): + def update_dtype(self, dtype): """ Returns a CategoricalDtype with categories and ordered taken from dtype if specified, otherwise falling back to self if unspecified @@ -361,11 +374,16 @@ def _update_dtype(self, dtype): 'got {dtype!r}').format(dtype=dtype) raise ValueError(msg) - # dtype is CDT: keep current categories if None (ordered can't be None) + # dtype is CDT: keep current categories/ordered if None new_categories = dtype.categories if new_categories is None: new_categories = self.categories - return CategoricalDtype(new_categories, dtype.ordered) + + new_ordered = dtype.ordered + if new_ordered is None: + new_ordered = self.ordered + + return CategoricalDtype(new_categories, new_ordered) @property def categories(self): diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index b36bc1df23247..60f5552576ea1 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -344,7 +344,7 @@ def astype(self, dtype, copy=True): return IntervalIndex(np.array(self)) elif is_categorical_dtype(dtype): # GH 18630 - dtype = self.dtype._update_dtype(dtype) + dtype = self.dtype.update_dtype(dtype) if dtype == self.dtype: return self.copy() if copy else self diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index d800a7b92b559..cc833af03ae66 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -24,6 +24,11 @@ import pandas.util.testing as tm +@pytest.fixture(params=[True, False, None]) +def ordered(request): + return request.param + + class Base(object): def setup_method(self, method): @@ -124,41 +129,6 @@ def test_tuple_categories(self): result = CategoricalDtype(categories) assert all(result.categories == categories) - @pytest.mark.parametrize('dtype', [ - CategoricalDtype(list('abc'), False), - CategoricalDtype(list('abc'), True)]) - @pytest.mark.parametrize('new_dtype', [ - 'category', - CategoricalDtype(None, False), - CategoricalDtype(None, True), - CategoricalDtype(list('abc'), False), - CategoricalDtype(list('abc'), 
True), - CategoricalDtype(list('cba'), False), - CategoricalDtype(list('cba'), True), - CategoricalDtype(list('wxyz'), False), - CategoricalDtype(list('wxyz'), True)]) - def test_update_dtype(self, dtype, new_dtype): - if isinstance(new_dtype, string_types) and new_dtype == 'category': - expected_categories = dtype.categories - expected_ordered = dtype.ordered - else: - expected_categories = new_dtype.categories - if expected_categories is None: - expected_categories = dtype.categories - expected_ordered = new_dtype.ordered - - result = dtype._update_dtype(new_dtype) - tm.assert_index_equal(result.categories, expected_categories) - assert result.ordered is expected_ordered - - @pytest.mark.parametrize('bad_dtype', [ - 'foo', object, np.int64, PeriodDtype('Q')]) - def test_update_dtype_errors(self, bad_dtype): - dtype = CategoricalDtype(list('abc'), False) - msg = 'a CategoricalDtype must be passed to perform an update, ' - with tm.assert_raises_regex(ValueError, msg): - dtype._update_dtype(bad_dtype) - class TestDatetimeTZDtype(Base): @@ -609,17 +579,12 @@ def test_caching(self): class TestCategoricalDtypeParametrized(object): - @pytest.mark.parametrize('categories, ordered', [ - (['a', 'b', 'c', 'd'], False), - (['a', 'b', 'c', 'd'], True), - (np.arange(1000), False), - (np.arange(1000), True), - (['a', 'b', 10, 2, 1.3, True], False), - ([True, False], True), - ([True, False], False), - (pd.date_range('2017', periods=4), True), - (pd.date_range('2017', periods=4), False), - ]) + @pytest.mark.parametrize('categories', [ + list('abcd'), + np.arange(1000), + ['a', 'b', 10, 2, 1.3, True], + [True, False], + pd.date_range('2017', periods=4)]) def test_basic(self, categories, ordered): c1 = CategoricalDtype(categories, ordered=ordered) tm.assert_index_equal(c1.categories, pd.Index(categories)) @@ -627,21 +592,24 @@ def test_basic(self, categories, ordered): def test_order_matters(self): categories = ['a', 'b'] - c1 = CategoricalDtype(categories, ordered=False) - c2 = CategoricalDtype(categories, ordered=True) + c1 = CategoricalDtype(categories, ordered=True) + c2 = CategoricalDtype(categories, ordered=False) + c3 = CategoricalDtype(categories, ordered=None) assert c1 is not c2 + assert c1 is not c3 - def test_unordered_same(self): - c1 = CategoricalDtype(['a', 'b']) - c2 = CategoricalDtype(['b', 'a']) + @pytest.mark.parametrize('ordered', [False, None]) + def test_unordered_same(self, ordered): + c1 = CategoricalDtype(['a', 'b'], ordered=ordered) + c2 = CategoricalDtype(['b', 'a'], ordered=ordered) assert hash(c1) == hash(c2) def test_categories(self): result = CategoricalDtype(['a', 'b', 'c']) tm.assert_index_equal(result.categories, pd.Index(['a', 'b', 'c'])) - assert result.ordered is False + assert result.ordered is None - def test_equal_but_different(self): + def test_equal_but_different(self, ordered): c1 = CategoricalDtype([1, 2, 3]) c2 = CategoricalDtype([1., 2., 3.]) assert c1 is not c2 @@ -652,9 +620,11 @@ def test_equal_but_different(self): ([1, 2, 3], [3, 2, 1]), ]) def test_order_hashes_different(self, v1, v2): - c1 = CategoricalDtype(v1) + c1 = CategoricalDtype(v1, ordered=False) c2 = CategoricalDtype(v2, ordered=True) + c3 = CategoricalDtype(v1, ordered=None) assert c1 is not c2 + assert c1 is not c3 def test_nan_invalid(self): with pytest.raises(ValueError): @@ -669,26 +639,46 @@ def test_same_categories_different_order(self): c2 = CategoricalDtype(['b', 'a'], ordered=True) assert c1 is not c2 - @pytest.mark.parametrize('ordered, other, expected', [ - (True, 
CategoricalDtype(['a', 'b'], True), True), - (False, CategoricalDtype(['a', 'b'], False), True), - (True, CategoricalDtype(['a', 'b'], False), False), - (False, CategoricalDtype(['a', 'b'], True), False), - (True, CategoricalDtype([1, 2], False), False), - (False, CategoricalDtype([1, 2], True), False), - (False, CategoricalDtype(None, True), True), - (True, CategoricalDtype(None, True), True), - (False, CategoricalDtype(None, False), True), - (True, CategoricalDtype(None, False), True), - (True, 'category', True), - (False, 'category', True), - (True, 'not a category', False), - (False, 'not a category', False), - ]) - def test_categorical_equality(self, ordered, other, expected): - c1 = CategoricalDtype(['a', 'b'], ordered) + @pytest.mark.parametrize('ordered1', [True, False, None]) + @pytest.mark.parametrize('ordered2', [True, False, None]) + def test_categorical_equality(self, ordered1, ordered2): + # same categories, same order + # any combination of None/False are equal + # True/True is the only combination with True that are equal + c1 = CategoricalDtype(list('abc'), ordered1) + c2 = CategoricalDtype(list('abc'), ordered2) + result = c1 == c2 + expected = bool(ordered1) is bool(ordered2) + assert result is expected + + # same categories, different order + # any combination of None/False are equal (order doesn't matter) + # any combination with True are not equal (different order of cats) + c1 = CategoricalDtype(list('abc'), ordered1) + c2 = CategoricalDtype(list('cab'), ordered2) + result = c1 == c2 + expected = (bool(ordered1) is False) and (bool(ordered2) is False) + assert result is expected + + # different categories + c2 = CategoricalDtype([1, 2, 3], ordered2) + assert c1 != c2 + + # none categories + c1 = CategoricalDtype(list('abc'), ordered1) + c2 = CategoricalDtype(None, ordered2) + c3 = CategoricalDtype(None, ordered1) + assert c1 == c2 + assert c2 == c1 + assert c2 == c3 + + @pytest.mark.parametrize('categories', [list('abc'), None]) + @pytest.mark.parametrize('other', ['category', 'not a category']) + def test_categorical_equality_strings(self, categories, ordered, other): + c1 = CategoricalDtype(categories, ordered) result = c1 == other - assert result == expected + expected = other == 'category' + assert result is expected def test_invalid_raises(self): with tm.assert_raises_regex(TypeError, 'ordered'): @@ -729,12 +719,12 @@ def test_from_categorical_dtype_both(self): c1, categories=[1, 2], ordered=False) assert result == CategoricalDtype([1, 2], ordered=False) - def test_str_vs_repr(self): - c1 = CategoricalDtype(['a', 'b']) + def test_str_vs_repr(self, ordered): + c1 = CategoricalDtype(['a', 'b'], ordered=ordered) assert str(c1) == 'category' # Py2 will have unicode prefixes - pat = r"CategoricalDtype\(categories=\[.*\], ordered=False\)" - assert re.match(pat, repr(c1)) + pat = r"CategoricalDtype\(categories=\[.*\], ordered={ordered}\)" + assert re.match(pat.format(ordered=ordered), repr(c1)) def test_categorical_categories(self): # GH17884 @@ -742,3 +732,38 @@ def test_categorical_categories(self): tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) c1 = CategoricalDtype(CategoricalIndex(['a', 'b'])) tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) + + @pytest.mark.parametrize('new_categories', [ + list('abc'), list('cba'), list('wxyz'), None]) + @pytest.mark.parametrize('new_ordered', [True, False, None]) + def test_update_dtype(self, ordered, new_categories, new_ordered): + dtype = CategoricalDtype(list('abc'), ordered) + new_dtype = 
CategoricalDtype(new_categories, new_ordered) + + expected_categories = new_dtype.categories + if expected_categories is None: + expected_categories = dtype.categories + + expected_ordered = new_dtype.ordered + if expected_ordered is None: + expected_ordered = dtype.ordered + + result = dtype.update_dtype(new_dtype) + tm.assert_index_equal(result.categories, expected_categories) + assert result.ordered is expected_ordered + + def test_update_dtype_string(self, ordered): + dtype = CategoricalDtype(list('abc'), ordered) + expected_categories = dtype.categories + expected_ordered = dtype.ordered + result = dtype.update_dtype('category') + tm.assert_index_equal(result.categories, expected_categories) + assert result.ordered is expected_ordered + + @pytest.mark.parametrize('bad_dtype', [ + 'foo', object, np.int64, PeriodDtype('Q')]) + def test_update_dtype_errors(self, bad_dtype): + dtype = CategoricalDtype(list('abc'), False) + msg = 'a CategoricalDtype must be passed to perform an update, ' + with tm.assert_raises_regex(ValueError, msg): + dtype.update_dtype(bad_dtype) From d7797b44b12f80a6f6e447b3523b820fadd85b7b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 10 Feb 2018 09:08:40 -0800 Subject: [PATCH 087/214] order of exceptions in array_to_datetime (#19621) --- pandas/_libs/tslib.pyx | 84 ++++++++++---------- pandas/tests/indexes/datetimes/test_tools.py | 9 ++- 2 files changed, 52 insertions(+), 41 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 877d7deff6ff4..a035bab2a7049 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -7,7 +7,7 @@ import numpy as np cnp.import_array() -from cpython cimport PyFloat_Check +from cpython cimport PyFloat_Check, PyUnicode_Check from util cimport (is_integer_object, is_float_object, is_string_object, is_datetime64_object) @@ -56,6 +56,8 @@ from tslibs.timestamps cimport (create_timestamp_from_ts, _NS_UPPER_BOUND, _NS_LOWER_BOUND) from tslibs.timestamps import Timestamp +cdef bint PY2 = str == bytes + cdef inline object create_datetime_from_ts( int64_t value, pandas_datetimestruct dts, @@ -549,10 +551,10 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', raise elif PyDate_Check(val): + seen_datetime = 1 iresult[i] = pydate_to_dt64(val, &dts) try: check_dts_bounds(&dts) - seen_datetime = 1 except ValueError: if is_coerce: iresult[i] = NPY_NAT @@ -560,12 +562,12 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', raise elif is_datetime64_object(val): + seen_datetime = 1 if get_datetime64_value(val) == NPY_NAT: iresult[i] = NPY_NAT else: try: iresult[i] = get_datetime64_nanos(val) - seen_datetime = 1 except ValueError: if is_coerce: iresult[i] = NPY_NAT @@ -574,19 +576,18 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', elif is_integer_object(val) or is_float_object(val): # these must be ns unit by-definition + seen_integer = 1 if val != val or val == NPY_NAT: iresult[i] = NPY_NAT elif is_raise or is_ignore: iresult[i] = val - seen_integer = 1 else: # coerce # we now need to parse this as if unit='ns' # we can ONLY accept integers at this point # if we have previously (or in future accept # datetimes/strings, then we must coerce) - seen_integer = 1 try: iresult[i] = cast_from_unit(val, 'ns') except: @@ -594,46 +595,25 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', elif is_string_object(val): # string + seen_string = 1 if len(val) == 0 or val in nat_strings: iresult[i] = NPY_NAT continue - - seen_string = 1 + if PyUnicode_Check(val) and 
PY2: + val = val.encode('utf-8') try: _string_to_dts(val, &dts, &out_local, &out_tzoffset) - value = dtstruct_to_dt64(&dts) - if out_local == 1: - tz = pytz.FixedOffset(out_tzoffset) - value = tz_convert_single(value, tz, 'UTC') - iresult[i] = value - check_dts_bounds(&dts) - except OutOfBoundsDatetime: - # GH#19382 for just-barely-OutOfBounds falling back to - # dateutil parser will return incorrect result because - # it will ignore nanoseconds - if require_iso8601: - if _parse_today_now(val, &iresult[i]): - continue - elif is_coerce: - iresult[i] = NPY_NAT - continue - elif is_raise: - raise ValueError("time data {val} doesn't match " - "format specified" - .format(val=val)) - return values - elif is_coerce: - iresult[i] = NPY_NAT - continue - raise except ValueError: - # if requiring iso8601 strings, skip trying other formats - if require_iso8601: - if _parse_today_now(val, &iresult[i]): - continue - elif is_coerce: + # A ValueError at this point is a _parsing_ error + # specifically _not_ OutOfBoundsDatetime + if _parse_today_now(val, &iresult[i]): + continue + elif require_iso8601: + # if requiring iso8601 strings, skip trying + # other formats + if is_coerce: iresult[i] = NPY_NAT continue elif is_raise: @@ -646,8 +626,6 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', py_dt = parse_datetime_string(val, dayfirst=dayfirst, yearfirst=yearfirst) except Exception: - if _parse_today_now(val, &iresult[i]): - continue if is_coerce: iresult[i] = NPY_NAT continue @@ -656,16 +634,42 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', try: _ts = convert_datetime_to_tsobject(py_dt, None) iresult[i] = _ts.value - except ValueError: + except OutOfBoundsDatetime: if is_coerce: iresult[i] = NPY_NAT continue raise except: + # TODO: What exception are we concerned with here? 
if is_coerce: iresult[i] = NPY_NAT continue raise + else: + # No error raised by string_to_dts, pick back up + # where we left off + value = dtstruct_to_dt64(&dts) + if out_local == 1: + tz = pytz.FixedOffset(out_tzoffset) + value = tz_convert_single(value, tz, 'UTC') + iresult[i] = value + try: + check_dts_bounds(&dts) + except OutOfBoundsDatetime: + # GH#19382 for just-barely-OutOfBounds falling back to + # dateutil parser will return incorrect result because + # it will ignore nanoseconds + if is_coerce: + iresult[i] = NPY_NAT + continue + elif require_iso8601: + if is_raise: + raise ValueError("time data {val} doesn't " + "match format specified" + .format(val=val)) + return values + raise + else: if is_coerce: iresult[i] = NPY_NAT diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index f8b1f68ba33ce..b95ae07052ecb 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -18,7 +18,7 @@ from pandas.core.tools import datetimes as tools from pandas.errors import OutOfBoundsDatetime -from pandas.compat import lmap +from pandas.compat import lmap, PY3 from pandas.compat.numpy import np_array_datetime64_compat from pandas.core.dtypes.common import is_datetime64_ns_dtype from pandas.util import testing as tm @@ -238,6 +238,13 @@ def test_to_datetime_today(self): assert pdtoday.tzinfo is None assert pdtoday2.tzinfo is None + def test_to_datetime_today_now_unicode_bytes(self): + to_datetime([u'now']) + to_datetime([u'today']) + if not PY3: + to_datetime(['now']) + to_datetime(['today']) + @pytest.mark.parametrize('cache', [True, False]) def test_to_datetime_dt64s(self, cache): in_bound_dts = [ From 13bd008edbbe0780600072d404fa989accb5e762 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sat, 10 Feb 2018 10:48:15 -0800 Subject: [PATCH 088/214] Consolidated Groupby nth / last object Templates (#19635) --- pandas/_libs/groupby.pyx | 99 ---------------------------- pandas/_libs/groupby_helper.pxi.in | 36 ++++++---- pandas/tests/groupby/test_groupby.py | 56 ++++++++-------- 3 files changed, 50 insertions(+), 141 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index d75c3a71896e3..866683ce378ab 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -26,105 +26,6 @@ cdef double NaN = np.NaN cdef double nan = NaN -# TODO: aggregate multiple columns in single pass -# ---------------------------------------------------------------------- -# first, nth, last - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_nth_object(ndarray[object, ndim=2] out, - ndarray[int64_t] counts, - ndarray[object, ndim=2] values, - ndarray[int64_t] labels, - int64_t rank, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab - object val - float64_t count - ndarray[int64_t, ndim=2] nobs - ndarray[object, ndim=2] resx - - assert min_count == -1, "'min_count' only used in add and prod" - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty(( out).shape, dtype=object) - - N, K = ( values).shape - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = resx[i, j] - - -@cython.boundscheck(False) 
-@cython.wraparound(False) -def group_last_object(ndarray[object, ndim=2] out, - ndarray[int64_t] counts, - ndarray[object, ndim=2] values, - ndarray[int64_t] labels, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab - object val - float64_t count - ndarray[object, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - assert min_count == -1, "'min_count' only used in add and prod" - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty(( out).shape, dtype=object) - - N, K = ( values).shape - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = resx[i, j] - - cdef inline float64_t median_linear(float64_t* a, int n) nogil: cdef int i, j, na_count = 0 cdef float64_t result diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index b24444c422efa..48dac7bf10362 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -317,7 +317,7 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, {{endfor}} #---------------------------------------------------------------------- -# group_nth, group_last +# group_nth, group_last, group_rank #---------------------------------------------------------------------- {{py: @@ -325,7 +325,8 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, # name, c_type, dest_type2, nan_val dtypes = [('float64', 'float64_t', 'float64_t', 'NAN'), ('float32', 'float32_t', 'float32_t', 'NAN'), - ('int64', 'int64_t', 'int64_t', 'iNaT')] + ('int64', 'int64_t', 'int64_t', 'iNaT'), + ('object', 'object', 'object', 'NAN')] def get_dispatch(dtypes): @@ -350,7 +351,7 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{dest_type2}} val, count + {{dest_type2}} val ndarray[{{dest_type2}}, ndim=2] resx ndarray[int64_t, ndim=2] nobs @@ -360,11 +361,19 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, raise AssertionError("len(index) != len(labels)") nobs = np.zeros(( out).shape, dtype=np.int64) + {{if name=='object'}} + resx = np.empty(( out).shape, dtype=object) + {{else}} resx = np.empty_like(out) + {{endif}} N, K = ( values).shape + {{if name == "object"}} + if True: # make templating happy + {{else}} with nogil: + {{endif}} for i in range(N): lab = labels[i] if lab < 0: @@ -375,11 +384,7 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # not nan - {{if name == 'int64'}} - if val != {{nan_val}}: - {{else}} if val == val and val != {{nan_val}}: - {{endif}} nobs[lab, j] += 1 resx[lab, j] = val @@ -390,7 +395,6 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, else: out[i, j] = resx[i, j] - @cython.wraparound(False) @cython.boundscheck(False) def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, @@ -403,7 +407,7 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{dest_type2}} val, count + {{dest_type2}} val ndarray[{{dest_type2}}, ndim=2] resx ndarray[int64_t, ndim=2] nobs @@ -413,11 +417,19 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, raise AssertionError("len(index) != len(labels)") nobs = np.zeros(( out).shape, dtype=np.int64) + {{if name=='object'}} + 
resx = np.empty(( out).shape, dtype=object) + {{else}} resx = np.empty_like(out) + {{endif}} N, K = ( values).shape + {{if name == "object"}} + if True: # make templating happy + {{else}} with nogil: + {{endif}} for i in range(N): lab = labels[i] if lab < 0: @@ -428,11 +440,7 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # not nan - {{if name == 'int64'}} - if val != {{nan_val}}: - {{else}} if val == val and val != {{nan_val}}: - {{endif}} nobs[lab, j] += 1 if nobs[lab, j] == rank: resx[lab, j] = val @@ -445,6 +453,7 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, out[i, j] = resx[i, j] +{{if name != 'object'}} @cython.boundscheck(False) @cython.wraparound(False) def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, @@ -608,6 +617,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, if pct: for i in range(N): out[i, 0] = out[i, 0] / grp_sizes[i, 0] +{{endif}} {{endfor}} diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 2db772ac54369..6eacd45deb7bc 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2252,7 +2252,19 @@ def test_median_empty_bins(self): expected = df.groupby(bins).agg(lambda x: x.median()) assert_frame_equal(result, expected) - def test_groupby_non_arithmetic_agg_types(self): + @pytest.mark.parametrize("dtype", [ + 'int8', 'int16', 'int32', 'int64', 'float32', 'float64']) + @pytest.mark.parametrize("method,data", [ + ('first', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}), + ('last', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}), + ('min', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}), + ('max', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}), + ('nth', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}], + 'args': [1]}), + ('count', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}], + 'out_type': 'int64'}) + ]) + def test_groupby_non_arithmetic_agg_types(self, dtype, method, data): # GH9311, GH6620 df = pd.DataFrame( [{'a': 1, 'b': 1}, @@ -2260,39 +2272,25 @@ def test_groupby_non_arithmetic_agg_types(self): {'a': 2, 'b': 3}, {'a': 2, 'b': 4}]) - dtypes = ['int8', 'int16', 'int32', 'int64', 'float32', 'float64'] - - grp_exp = {'first': {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}, - 'last': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}, - 'min': {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}, - 'max': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}, - 'nth': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}], - 'args': [1]}, - 'count': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}], - 'out_type': 'int64'}} + df['b'] = df.b.astype(dtype) - for dtype in dtypes: - df_in = df.copy() - df_in['b'] = df_in.b.astype(dtype) + if 'args' not in data: + data['args'] = [] - for method, data in compat.iteritems(grp_exp): - if 'args' not in data: - data['args'] = [] - - if 'out_type' in data: - out_type = data['out_type'] - else: - out_type = dtype + if 'out_type' in data: + out_type = data['out_type'] + else: + out_type = dtype - exp = data['df'] - df_out = pd.DataFrame(exp) + exp = data['df'] + df_out = pd.DataFrame(exp) - df_out['b'] = df_out.b.astype(out_type) - df_out.set_index('a', inplace=True) + df_out['b'] = df_out.b.astype(out_type) + df_out.set_index('a', inplace=True) - grpd = df_in.groupby('a') - t = getattr(grpd, method)(*data['args']) - assert_frame_equal(t, df_out) + grpd = df.groupby('a') + t = getattr(grpd, method)(*data['args']) + assert_frame_equal(t, df_out) def test_groupby_non_arithmetic_agg_intlike_precision(self): # GH9311, GH6620 From 
507a2a24c6b8cfa8484c6b98ece1603c28a85519 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 10 Feb 2018 13:12:37 -0800 Subject: [PATCH 089/214] Continue porting period_helper to cython (#19608) --- pandas/_libs/src/period_helper.c | 286 ++----------------------- pandas/_libs/src/period_helper.h | 12 +- pandas/_libs/tslibs/period.pyx | 356 ++++++++++++++++++++++++++++--- 3 files changed, 349 insertions(+), 305 deletions(-) diff --git a/pandas/_libs/src/period_helper.c b/pandas/_libs/src/period_helper.c index f0e24fec685d0..7c4de8e42e73b 100644 --- a/pandas/_libs/src/period_helper.c +++ b/pandas/_libs/src/period_helper.c @@ -45,7 +45,7 @@ static int monthToQuarter(int month) { return ((month - 1) / 3) + 1; } /* Find the absdate (days elapsed since datetime(1, 1, 1) * for the given year/month/day. * Assumes GREGORIAN_CALENDAR */ -static npy_int64 dInfoCalc_SetFromDateAndTime(int year, int month, int day) { +npy_int64 absdate_from_ymd(int year, int month, int day) { /* Calculate the absolute date */ pandas_datetimestruct dts; npy_int64 unix_date; @@ -68,8 +68,6 @@ static int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, dinfo->year = dts.year; dinfo->month = dts.month; dinfo->day = dts.day; - - dinfo->absdate = absdate; return 0; } @@ -100,8 +98,7 @@ PANDAS_INLINE int get_freq_group(int freq) { return (freq / 1000) * 1000; } PANDAS_INLINE int get_freq_group_index(int freq) { return freq / 1000; } -PANDAS_INLINE npy_int64 get_daytime_conversion_factor(int from_index, - int to_index) { +npy_int64 get_daytime_conversion_factor(int from_index, int to_index) { int row = min_value(from_index, to_index); int col = max_value(from_index, to_index); // row or col < 6 means frequency strictly lower than Daily, which @@ -144,9 +141,9 @@ static npy_int64 DtoB_weekday(npy_int64 absdate) { return (((absdate) / 7) * 5) + (absdate) % 7 - BDAY_OFFSET; } -static npy_int64 DtoB(struct date_info *dinfo, int roll_back) { +static npy_int64 DtoB(struct date_info *dinfo, + int roll_back, npy_int64 absdate) { int day_of_week = dayofweek(dinfo->year, dinfo->month, dinfo->day); - npy_int64 absdate = dinfo->absdate; if (roll_back == 1) { if (day_of_week > 4) { @@ -162,9 +159,6 @@ static npy_int64 DtoB(struct date_info *dinfo, int roll_back) { return DtoB_weekday(absdate); } -static npy_int64 absdate_from_ymd(int y, int m, int d) { - return dInfoCalc_SetFromDateAndTime(y, m, d); -} //************ FROM DAILY *************** @@ -224,15 +218,16 @@ static npy_int64 asfreq_DTtoW(npy_int64 ordinal, asfreq_info *af_info) { static npy_int64 asfreq_DTtoB(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; + npy_int64 absdate; int roll_back; ordinal = downsample_daytime(ordinal, af_info); - - dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET); + absdate = ordinal + ORD_OFFSET; + dInfoCalc_SetFromAbsDate(&dinfo, absdate); // This usage defines roll_back the opposite way from the others roll_back = 1 - af_info->is_end; - return DtoB(&dinfo, roll_back); + return DtoB(&dinfo, roll_back, absdate); } // all intra day calculations are now done within one function @@ -298,11 +293,11 @@ static npy_int64 asfreq_WtoW(npy_int64 ordinal, asfreq_info *af_info) { static npy_int64 asfreq_WtoB(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; + npy_int64 absdate = asfreq_WtoDT(ordinal, af_info) + ORD_OFFSET; int roll_back = af_info->is_end; - dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_WtoDT(ordinal, af_info) + ORD_OFFSET); + dInfoCalc_SetFromAbsDate(&dinfo, absdate); - return DtoB(&dinfo, 
roll_back); + return DtoB(&dinfo, roll_back, absdate); } //************ FROM MONTHLY *************** @@ -338,12 +333,12 @@ static npy_int64 asfreq_MtoW(npy_int64 ordinal, asfreq_info *af_info) { static npy_int64 asfreq_MtoB(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; + npy_int64 absdate = asfreq_MtoDT(ordinal, af_info) + ORD_OFFSET; int roll_back = af_info->is_end; - dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_MtoDT(ordinal, af_info) + ORD_OFFSET); + dInfoCalc_SetFromAbsDate(&dinfo, absdate); - return DtoB(&dinfo, roll_back); + return DtoB(&dinfo, roll_back, absdate); } //************ FROM QUARTERLY *************** @@ -393,12 +388,12 @@ static npy_int64 asfreq_QtoW(npy_int64 ordinal, asfreq_info *af_info) { static npy_int64 asfreq_QtoB(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; + npy_int64 absdate = asfreq_QtoDT(ordinal, af_info) + ORD_OFFSET; int roll_back = af_info->is_end; - dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_QtoDT(ordinal, af_info) + ORD_OFFSET); + dInfoCalc_SetFromAbsDate(&dinfo, absdate); - return DtoB(&dinfo, roll_back); + return DtoB(&dinfo, roll_back, absdate); } //************ FROM ANNUAL *************** @@ -439,11 +434,11 @@ static npy_int64 asfreq_AtoW(npy_int64 ordinal, asfreq_info *af_info) { static npy_int64 asfreq_AtoB(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; + npy_int64 absdate = asfreq_AtoDT(ordinal, af_info) + ORD_OFFSET; int roll_back = af_info->is_end; - dInfoCalc_SetFromAbsDate( - &dinfo, asfreq_AtoDT(ordinal, af_info) + ORD_OFFSET); + dInfoCalc_SetFromAbsDate(&dinfo, absdate); - return DtoB(&dinfo, roll_back); + return DtoB(&dinfo, roll_back, absdate); } static npy_int64 nofunc(npy_int64 ordinal, asfreq_info *af_info) { @@ -675,65 +670,6 @@ freq_conv_func get_asfreq_func(int fromFreq, int toFreq) { } } -double get_abs_time(int freq, npy_int64 date_ordinal, npy_int64 ordinal) { - int freq_index, day_index, base_index; - npy_int64 per_day, start_ord; - double unit, result; - - if (freq <= FR_DAY) { - return 0; - } - - freq_index = get_freq_group_index(freq); - day_index = get_freq_group_index(FR_DAY); - base_index = get_freq_group_index(FR_SEC); - - per_day = get_daytime_conversion_factor(day_index, freq_index); - unit = get_daytime_conversion_factor(freq_index, base_index); - - if (base_index < freq_index) { - unit = 1 / unit; - } - - start_ord = date_ordinal * per_day; - result = (double)(unit * (ordinal - start_ord)); - return result; -} - -/* Sets the time part of the DateTime object. */ -static int dInfoCalc_SetFromAbsTime(struct date_info *dinfo, double abstime) { - int inttime; - int hour, minute; - double second; - - inttime = (int)abstime; - hour = inttime / 3600; - minute = (inttime % 3600) / 60; - second = abstime - (double)(hour * 3600 + minute * 60); - - dinfo->hour = hour; - dinfo->minute = minute; - dinfo->second = second; - return 0; -} - -/* Set the instance's value using the given date and time. - Assumes GREGORIAN_CALENDAR. 
*/ -static int dInfoCalc_SetFromAbsDateTime(struct date_info *dinfo, - npy_int64 absdate, double abstime) { - /* Bounds check */ - // The calling function is responsible for ensuring that - // abstime >= 0.0 && abstime <= 86400 - - /* Calculate the date */ - dInfoCalc_SetFromAbsDate(dinfo, absdate); - - /* Calculate the time */ - dInfoCalc_SetFromAbsTime(dinfo, abstime); - - return 0; -} - /* ------------------------------------------------------------------ * New pandas API-helper code, to expose to cython * ------------------------------------------------------------------*/ @@ -750,185 +686,3 @@ npy_int64 asfreq(npy_int64 period_ordinal, int freq1, int freq2, val = (*func)(period_ordinal, &finfo); return val; } - -/* generate an ordinal in period space */ -npy_int64 get_period_ordinal(int year, int month, int day, int hour, int minute, - int second, int microseconds, int picoseconds, - int freq) { - npy_int64 absdays, delta, seconds; - npy_int64 weeks, days; - npy_int64 ordinal, day_adj; - int freq_group, fmonth, mdiff; - freq_group = get_freq_group(freq); - - if (freq == FR_SEC || freq == FR_MS || freq == FR_US || freq == FR_NS) { - absdays = absdate_from_ymd(year, month, day); - delta = (absdays - ORD_OFFSET); - seconds = - (npy_int64)(delta * 86400 + hour * 3600 + minute * 60 + second); - - switch (freq) { - case FR_MS: - return seconds * 1000 + microseconds / 1000; - - case FR_US: - return seconds * 1000000 + microseconds; - - case FR_NS: - return seconds * 1000000000 + microseconds * 1000 + - picoseconds / 1000; - } - - return seconds; - } - - if (freq == FR_MIN) { - absdays = absdate_from_ymd(year, month, day); - delta = (absdays - ORD_OFFSET); - return (npy_int64)(delta * 1440 + hour * 60 + minute); - } - - if (freq == FR_HR) { - absdays = absdate_from_ymd(year, month, day); - delta = (absdays - ORD_OFFSET); - return (npy_int64)(delta * 24 + hour); - } - - if (freq == FR_DAY) { - return (npy_int64)(absdate_from_ymd(year, month, day) - ORD_OFFSET); - } - - if (freq == FR_UND) { - return (npy_int64)(absdate_from_ymd(year, month, day) - ORD_OFFSET); - } - - if (freq == FR_BUS) { - days = absdate_from_ymd(year, month, day); - // calculate the current week assuming sunday as last day of a week - weeks = (days - BASE_WEEK_TO_DAY_OFFSET) / DAYS_PER_WEEK; - // calculate the current weekday (in range 1 .. 7) - delta = (days - BASE_WEEK_TO_DAY_OFFSET) % DAYS_PER_WEEK + 1; - // return the number of business days in full weeks plus the business - // days in the last - possible partial - week - return (npy_int64)(weeks * BUSINESS_DAYS_PER_WEEK) + - (delta <= BUSINESS_DAYS_PER_WEEK ? 
delta - : BUSINESS_DAYS_PER_WEEK + 1) - - BDAY_OFFSET; - } - - if (freq_group == FR_WK) { - ordinal = (npy_int64)absdate_from_ymd(year, month, day); - day_adj = freq - FR_WK; - return (ordinal - (1 + day_adj)) / 7 + 1 - WEEK_OFFSET; - } - - if (freq == FR_MTH) { - return (year - BASE_YEAR) * 12 + month - 1; - } - - if (freq_group == FR_QTR) { - fmonth = freq - FR_QTR; - if (fmonth == 0) fmonth = 12; - - mdiff = month - fmonth; - if (mdiff < 0) mdiff += 12; - if (month >= fmonth) mdiff += 12; - - return (year - BASE_YEAR) * 4 + (mdiff - 1) / 3; - } - - if (freq_group == FR_ANN) { - fmonth = freq - FR_ANN; - if (fmonth == 0) fmonth = 12; - if (month <= fmonth) { - return year - BASE_YEAR; - } else { - return year - BASE_YEAR + 1; - } - } - - Py_Error(PyExc_RuntimeError, "Unable to generate frequency ordinal"); - -onError: - return INT_ERR_CODE; -} - -/* - Returns the proleptic Gregorian ordinal of the date, as an integer. - This corresponds to the number of days since Jan., 1st, 1AD. - When the instance has a frequency less than daily, the proleptic date - is calculated for the last day of the period. - */ - -npy_int64 get_python_ordinal(npy_int64 period_ordinal, int freq) { - asfreq_info af_info; - freq_conv_func toDaily = NULL; - - if (freq == FR_DAY) return period_ordinal + ORD_OFFSET; - - toDaily = get_asfreq_func(freq, FR_DAY); - get_asfreq_info(freq, FR_DAY, 'E', &af_info); - - return toDaily(period_ordinal, &af_info) + ORD_OFFSET; -} - - -int get_yq(npy_int64 ordinal, int freq, int *quarter, int *year) { - asfreq_info af_info; - int qtr_freq; - npy_int64 daily_ord; - freq_conv_func toDaily = NULL; - - toDaily = get_asfreq_func(freq, FR_DAY); - get_asfreq_info(freq, FR_DAY, 'E', &af_info); - - daily_ord = toDaily(ordinal, &af_info); - - if (get_freq_group(freq) == FR_QTR) { - qtr_freq = freq; - } else { - qtr_freq = FR_QTR; - } - get_asfreq_info(FR_DAY, qtr_freq, 'E', &af_info); - - DtoQ_yq(daily_ord, &af_info, year, quarter); - return 0; -} - -int _quarter_year(npy_int64 ordinal, int freq, int *year, int *quarter) { - asfreq_info af_info; - int qtr_freq; - - ordinal = get_python_ordinal(ordinal, freq) - ORD_OFFSET; - - if (get_freq_group(freq) == FR_QTR) - qtr_freq = freq; - else - qtr_freq = FR_QTR; - - get_asfreq_info(FR_DAY, qtr_freq, 'E', &af_info); - - DtoQ_yq(ordinal, &af_info, year, quarter); - - if ((qtr_freq % 1000) > 12) *year -= 1; - - return 0; -} - - -int get_date_info(npy_int64 ordinal, int freq, struct date_info *dinfo) { - npy_int64 absdate = get_python_ordinal(ordinal, freq); - double abstime = get_abs_time(freq, absdate - ORD_OFFSET, ordinal); - - while (abstime < 0) { - abstime += 86400; - absdate -= 1; - } - while (abstime >= 86400) { - abstime -= 86400; - absdate += 1; - } - - dInfoCalc_SetFromAbsDateTime(dinfo, absdate, abstime); - return 0; -} diff --git a/pandas/_libs/src/period_helper.h b/pandas/_libs/src/period_helper.h index f14aec268a1fb..1573b1eeec74b 100644 --- a/pandas/_libs/src/period_helper.h +++ b/pandas/_libs/src/period_helper.h @@ -118,8 +118,6 @@ typedef struct asfreq_info { } asfreq_info; typedef struct date_info { - npy_int64 absdate; - double second; int minute; int hour; @@ -136,18 +134,10 @@ typedef npy_int64 (*freq_conv_func)(npy_int64, asfreq_info *af_info); npy_int64 asfreq(npy_int64 period_ordinal, int freq1, int freq2, char relation); -npy_int64 get_period_ordinal(int year, int month, int day, int hour, int minute, - int second, int microseconds, int picoseconds, - int freq); - -npy_int64 get_python_ordinal(npy_int64 period_ordinal, int 
freq); - -int get_date_info(npy_int64 ordinal, int freq, struct date_info *dinfo); freq_conv_func get_asfreq_func(int fromFreq, int toFreq); void get_asfreq_info(int fromFreq, int toFreq, char relation, asfreq_info *af_info); -int get_yq(npy_int64 ordinal, int freq, int *quarter, int *year); -int _quarter_year(npy_int64 ordinal, int freq, int *year, int *quarter); +npy_int64 get_daytime_conversion_factor(int from_index, int to_index); #endif // PANDAS__LIBS_SRC_PERIOD_HELPER_H_ diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index dc5d058f41d11..c11a8b149bc13 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -13,7 +13,7 @@ import_array() from libc.stdlib cimport free, malloc from libc.time cimport strftime, tm -from libc.string cimport strlen +from libc.string cimport strlen, memset from pandas.compat import PY2 @@ -24,7 +24,15 @@ from cpython.datetime cimport PyDateTime_Check, PyDateTime_IMPORT PyDateTime_IMPORT from np_datetime cimport (pandas_datetimestruct, dtstruct_to_dt64, - dt64_to_dtstruct) + dt64_to_dtstruct, + PANDAS_FR_D, + pandas_datetime_to_datetimestruct, + PANDAS_DATETIMEUNIT) + +cdef extern from "../src/datetime/np_datetime.h": + int64_t pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr, + pandas_datetimestruct *d + ) nogil cimport util from util cimport is_period_object, is_string_object, INT32_MIN @@ -53,6 +61,24 @@ from pandas.tseries import frequencies cdef extern from "period_helper.h": + int FR_ANN + int FR_QTR + int FR_MTH + int FR_WK + int FR_DAY + int FR_HR + int FR_MIN + int FR_SEC + int FR_MS + int FR_US + int FR_NS + int FR_BUS + int FR_UND + + int ORD_OFFSET + int WEEK_OFFSET + int BDAY_OFFSET + ctypedef struct date_info: double second int minute @@ -73,24 +99,15 @@ cdef extern from "period_helper.h": int from_q_year_end int to_q_year_end - ctypedef int64_t (*freq_conv_func)(int64_t, asfreq_info*) + ctypedef int64_t (*freq_conv_func)(int64_t, asfreq_info*) nogil int64_t asfreq(int64_t dtordinal, int freq1, int freq2, char relation) except INT32_MIN - freq_conv_func get_asfreq_func(int fromFreq, int toFreq) + freq_conv_func get_asfreq_func(int fromFreq, int toFreq) nogil void get_asfreq_info(int fromFreq, int toFreq, char relation, - asfreq_info *af_info) - - int64_t get_period_ordinal(int year, int month, int day, - int hour, int minute, int second, - int microseconds, int picoseconds, - int freq) nogil except INT32_MIN - - int get_date_info(int64_t ordinal, int freq, - date_info *dinfo) nogil + asfreq_info *af_info) nogil - int get_yq(int64_t ordinal, int freq, int *quarter, int *year) - int _quarter_year(int64_t ordinal, int freq, int *year, int *quarter) + int64_t get_daytime_conversion_factor(int from_index, int to_index) nogil @cython.cdivision @@ -130,6 +147,285 @@ cdef char* c_strftime(date_info *dinfo, char *fmt): return result +# ---------------------------------------------------------------------- +# Conversion between date_info and pandas_datetimestruct + +cdef inline int get_freq_group(int freq) nogil: + return (freq // 1000) * 1000 + + +@cython.cdivision +cdef int64_t get_period_ordinal(int year, int month, int day, + int hour, int minute, int second, + int microseconds, int picoseconds, + int freq) nogil: + """generate an ordinal in period space""" + cdef: + int64_t absdays, unix_date, seconds, delta + int64_t weeks + int64_t day_adj + int freq_group, fmonth, mdiff + + freq_group = get_freq_group(freq) + + if freq_group == FR_ANN: + fmonth = freq - FR_ANN + if fmonth == 0: + 
fmonth = 12 + if month <= fmonth: + return year - 1970 + else: + return year - 1970 + 1 + + elif freq_group == FR_QTR: + fmonth = freq - FR_QTR + if fmonth == 0: + fmonth = 12 + + mdiff = month - fmonth + # TODO: Aren't the next two conditions equivalent to + # unconditional incrementing? + if mdiff < 0: + mdiff += 12 + if month >= fmonth: + mdiff += 12 + + return (year - 1970) * 4 + (mdiff - 1) / 3 + + elif freq == FR_MTH: + return (year - 1970) * 12 + month - 1 + + absdays = absdate_from_ymd(year, month, day) + unix_date = absdays - ORD_OFFSET + + if freq >= FR_SEC: + seconds = unix_date * 86400 + hour * 3600 + minute * 60 + second + + if freq == FR_MS: + return seconds * 1000 + microseconds / 1000 + + elif freq == FR_US: + return seconds * 1000000 + microseconds + + elif freq == FR_NS: + return (seconds * 1000000000 + + microseconds * 1000 + picoseconds / 1000) + + else: + return seconds + + elif freq == FR_MIN: + return unix_date * 1440 + hour * 60 + minute + + elif freq == FR_HR: + return unix_date * 24 + hour + + elif freq == FR_DAY: + return unix_date + + elif freq == FR_UND: + return unix_date + + elif freq == FR_BUS: + # calculate the current week assuming sunday as last day of a week + # Jan 1 0001 is a Monday, so subtract 1 to get to end-of-week + weeks = (unix_date + ORD_OFFSET - 1) / 7 + # calculate the current weekday (in range 1 .. 7) + delta = (unix_date + ORD_OFFSET - 1) % 7 + 1 + # return the number of business days in full weeks plus the business + # days in the last - possible partial - week + if delta <= 5: + return (weeks * 5) + delta - BDAY_OFFSET + else: + return (weeks * 5) + (5 + 1) - BDAY_OFFSET + + elif freq_group == FR_WK: + day_adj = freq - FR_WK + return (unix_date + ORD_OFFSET - (1 + day_adj)) / 7 + 1 - WEEK_OFFSET + + # raise ValueError + + +cdef int get_date_info(int64_t ordinal, int freq, date_info *dinfo) nogil: + cdef: + int64_t absdate + double abstime + + absdate = get_python_ordinal(ordinal, freq); + abstime = get_abs_time(freq, absdate - ORD_OFFSET, ordinal) + + while abstime < 0: + abstime += 86400 + absdate -= 1 + + while abstime >= 86400: + abstime -= 86400 + absdate += 1 + + dInfoCalc_SetFromAbsDateTime(dinfo, absdate, abstime) + return 0 + + +cdef int64_t get_python_ordinal(int64_t period_ordinal, int freq) nogil: + """ + Returns the proleptic Gregorian ordinal of the date, as an integer. + This corresponds to the number of days since Jan., 1st, 1AD. + When the instance has a frequency less than daily, the proleptic date + is calculated for the last day of the period. + """ + cdef: + asfreq_info af_info + freq_conv_func toDaily = NULL + + if freq == FR_DAY: + return period_ordinal + ORD_OFFSET + + toDaily = get_asfreq_func(freq, FR_DAY) + get_asfreq_info(freq, FR_DAY, 'E', &af_info) + return toDaily(period_ordinal, &af_info) + ORD_OFFSET + + +cdef int dInfoCalc_SetFromAbsDateTime(date_info *dinfo, + int64_t absdate, double abstime) nogil: + """ + Set the instance's value using the given date and time. + Assumes GREGORIAN_CALENDAR. 
+ """ + # Bounds check + # The calling function is responsible for ensuring that + # abstime >= 0.0 and abstime <= 86400 + + # Calculate the date + dInfoCalc_SetFromAbsDate(dinfo, absdate) + + # Calculate the time + dInfoCalc_SetFromAbsTime(dinfo, abstime) + return 0 + + +cdef int dInfoCalc_SetFromAbsDate(date_info *dinfo, int64_t absdate) nogil: + """ + Sets the date part of the date_info struct + Assumes GREGORIAN_CALENDAR + """ + cdef: + pandas_datetimestruct dts + + pandas_datetime_to_datetimestruct(absdate - ORD_OFFSET, PANDAS_FR_D, &dts) + dinfo.year = dts.year + dinfo.month = dts.month + dinfo.day = dts.day + return 0 + + +@cython.cdivision +cdef int dInfoCalc_SetFromAbsTime(date_info *dinfo, double abstime) nogil: + """ + Sets the time part of the DateTime object. + """ + cdef: + int inttime + int hour, minute + double second + + inttime = abstime + hour = inttime / 3600 + minute = (inttime % 3600) / 60 + second = abstime - (hour * 3600 + minute * 60) + + dinfo.hour = hour + dinfo.minute = minute + dinfo.second = second + return 0 + + +@cython.cdivision +cdef double get_abs_time(int freq, int64_t date_ordinal, + int64_t ordinal) nogil: + cdef: + int freq_index, day_index, base_index + int64_t per_day, start_ord + double unit, result + + if freq <= FR_DAY: + return 0 + + freq_index = freq // 1000 + day_index = FR_DAY // 1000 + base_index = FR_SEC // 1000 + + per_day = get_daytime_conversion_factor(day_index, freq_index) + unit = get_daytime_conversion_factor(freq_index, base_index) + + if base_index < freq_index: + unit = 1 / unit + + start_ord = date_ordinal * per_day + result = (unit * (ordinal - start_ord)) + return result + + +cdef int64_t absdate_from_ymd(int year, int month, int day) nogil: + """ + Find the absdate (days elapsed since datetime(1, 1, 1) + for the given year/month/day. + Assumes GREGORIAN_CALENDAR + """ + # /* Calculate the absolute date + cdef: + pandas_datetimestruct dts + int64_t unix_date + + memset(&dts, 0, sizeof(pandas_datetimestruct)) + dts.year = year + dts.month = month + dts.day = day + unix_date = pandas_datetimestruct_to_datetime(PANDAS_FR_D, &dts) + return ORD_OFFSET + unix_date + + +cdef int get_yq(int64_t ordinal, int freq, int *quarter, int *year): + cdef: + asfreq_info af_info + int qtr_freq + int64_t daily_ord + + daily_ord = get_python_ordinal(ordinal, freq) - ORD_OFFSET + + if get_freq_group(freq) == FR_QTR: + qtr_freq = freq + else: + qtr_freq = FR_QTR + + get_asfreq_info(FR_DAY, qtr_freq, 'E', &af_info) + + DtoQ_yq(daily_ord, &af_info, year, quarter) + return qtr_freq + + +cdef int64_t DtoQ_yq(int64_t ordinal, asfreq_info *af_info, + int *year, int *quarter): + cdef: + date_info dinfo + + dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET) + + if af_info.to_q_year_end != 12: + dinfo.month -= af_info.to_q_year_end + if dinfo.month <= 0: + dinfo.month += 12 + else: + dinfo.year += 1 + + year[0] = dinfo.year + quarter[0] = monthToQuarter(dinfo.month) + return 0 + + +cdef inline int monthToQuarter(int month): + return (month - 1) // 3 + 1 + + # ---------------------------------------------------------------------- # Period logic @@ -194,8 +490,7 @@ cdef char START = 'S' cdef char END = 'E' -cpdef int64_t period_asfreq(int64_t period_ordinal, int freq1, int freq2, - bint end): +cpdef int64_t period_asfreq(int64_t ordinal, int freq1, int freq2, bint end): """ Convert period ordinal from one frequency to another, and if upsampling, choose to use start ('S') or end ('E') of period. 
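For orientation, ``period_asfreq`` is the low-level counterpart of the public ``Period.asfreq`` and ``PeriodIndex.asfreq`` methods, where the ``end`` flag above surfaces as ``how='S'`` versus ``how='E'``. The following is a rough illustration of that start/end anchoring using only the public API; it is not part of the patch, just the behaviour the ported ordinal arithmetic has to preserve::

    import pandas as pd

    p = pd.Period('2018Q1', freq='Q')

    # Upsampling is ambiguous, so the anchor matters: map the quarter to
    # either its first or its last month.
    assert p.asfreq('M', how='S') == pd.Period('2018-01', freq='M')
    assert p.asfreq('M', how='E') == pd.Period('2018-03', freq='M')

    # Downsampling needs no anchor: every month of Q1 lands on the same quarter.
    assert pd.Period('2018-02', freq='M').asfreq('Q') == p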
@@ -203,13 +498,13 @@ cpdef int64_t period_asfreq(int64_t period_ordinal, int freq1, int freq2, cdef: int64_t retval - if period_ordinal == iNaT: + if ordinal == iNaT: return iNaT if end: - retval = asfreq(period_ordinal, freq1, freq2, END) + retval = asfreq(ordinal, freq1, freq2, END) else: - retval = asfreq(period_ordinal, freq1, freq2, START) + retval = asfreq(ordinal, freq1, freq2, START) if retval == INT32_MIN: raise ValueError('Frequency conversion failed') @@ -226,7 +521,7 @@ def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end): ndarray[int64_t] result Py_ssize_t i, n freq_conv_func func - asfreq_info finfo + asfreq_info af_info int64_t val char relation @@ -239,20 +534,20 @@ def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end): relation = START func = get_asfreq_func(freq1, freq2) - get_asfreq_info(freq1, freq2, relation, &finfo) + get_asfreq_info(freq1, freq2, relation, &af_info) mask = arr == iNaT if mask.any(): # NaT process for i in range(n): val = arr[i] if val != iNaT: - val = func(val, &finfo) + val = func(val, &af_info) if val == INT32_MIN: raise ValueError("Unable to convert to desired frequency.") result[i] = val else: for i in range(n): - val = func(arr[i], &finfo) + val = func(arr[i], &af_info) if val == INT32_MIN: raise ValueError("Unable to convert to desired frequency.") result[i] = val @@ -404,17 +699,22 @@ cdef int pyear(int64_t ordinal, int freq): return dinfo.year +@cython.cdivision cdef int pqyear(int64_t ordinal, int freq): cdef: - int year, quarter - _quarter_year(ordinal, freq, &year, &quarter) + int year, quarter, qtr_freq + qtr_freq = get_yq(ordinal, freq, &quarter, &year) + if (qtr_freq % 1000) > 12: + year -= 1 return year cdef int pquarter(int64_t ordinal, int freq): cdef: - int year, quarter - _quarter_year(ordinal, freq, &year, &quarter) + int year, quarter, qtr_freq + qtr_freq = get_yq(ordinal, freq, &quarter, &year) + if (qtr_freq % 1000) > 12: + year -= 1 return quarter From c0e75a59b8fd2870c55b8e15565d1f5f8be9ec00 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 10 Feb 2018 16:04:50 -0800 Subject: [PATCH 090/214] fix overflows in Timestamp.tz_localize near boundaries (#19626) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/_libs/tslibs/conversion.pxd | 2 - pandas/_libs/tslibs/conversion.pyx | 56 ++++++++++++++++--- .../tests/scalar/timestamp/test_timezones.py | 17 ++++++ 4 files changed, 67 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 6f48d9a6c63c9..6fdd551accbf1 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -727,6 +727,7 @@ Timezones - Bug in tz-aware :class:`DatetimeIndex` where addition/subtraction with a :class:`TimedeltaIndex` or array with ``dtype='timedelta64[ns]'`` was incorrect (:issue:`17558`) - Bug in :func:`DatetimeIndex.insert` where inserting ``NaT`` into a timezone-aware index incorrectly raised (:issue:`16357`) - Bug in the :class:`DataFrame` constructor, where tz-aware Datetimeindex and a given column name will result in an empty ``DataFrame`` (:issue:`19157`) +- Bug in :func:`Timestamp.tz_localize` where localizing a timestamp near the minimum or maximum valid values could overflow and return a timestamp with an incorrect nanosecond value (:issue:`12677`) Offsets ^^^^^^^ diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index 0d5e9e3fc5152..868c2641b34db 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ 
b/pandas/_libs/tslibs/conversion.pxd @@ -21,8 +21,6 @@ cdef convert_to_tsobject(object ts, object tz, object unit, cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, int32_t nanos=*) -cdef void _localize_tso(_TSObject obj, object tz) - cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2) cdef int64_t get_datetime64_nanos(object val) except? -1 diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index cfbcb922cb47d..beaca1a8483c7 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -309,12 +309,13 @@ cdef convert_to_tsobject(object ts, object tz, object unit, raise TypeError('Cannot convert input [{}] of type {} to ' 'Timestamp'.format(ts, type(ts))) - if obj.value != NPY_NAT: - check_dts_bounds(&obj.dts) - if tz is not None: - _localize_tso(obj, tz) + localize_tso(obj, tz) + if obj.value != NPY_NAT: + # check_overflows needs to run after localize_tso + check_dts_bounds(&obj.dts) + check_overflows(obj) return obj @@ -391,6 +392,7 @@ cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, obj.dts.ps = nanos * 1000 check_dts_bounds(&obj.dts) + check_overflows(obj) return obj @@ -454,6 +456,7 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, obj.value = tz_convert_single(obj.value, obj.tzinfo, 'UTC') if tz is None: check_dts_bounds(&obj.dts) + check_overflows(obj) return obj else: # Keep the converter same as PyDateTime's @@ -469,7 +472,7 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, else: ts = obj.value if tz is not None: - # shift for _localize_tso + # shift for localize_tso ts = tz_localize_to_utc(np.array([ts], dtype='i8'), tz, ambiguous='raise', errors='raise')[0] @@ -490,12 +493,51 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, return convert_to_tsobject(ts, tz, unit, dayfirst, yearfirst) +cdef inline check_overflows(_TSObject obj): + """ + Check that we haven't silently overflowed in timezone conversion + + Parameters + ---------- + obj : _TSObject + + Returns + ------- + None + + Raises + ------ + OutOfBoundsDatetime + """ + # GH#12677 + if obj.dts.year == 1677: + if not (obj.value < 0): + raise OutOfBoundsDatetime + elif obj.dts.year == 2262: + if not (obj.value > 0): + raise OutOfBoundsDatetime + + # ---------------------------------------------------------------------- # Localization -cdef inline void _localize_tso(_TSObject obj, object tz): +cdef inline void localize_tso(_TSObject obj, tzinfo tz): """ - Take a TSObject in UTC and localizes to timezone tz. + Given the UTC nanosecond timestamp in obj.value, find the wall-clock + representation of that timestamp in the given timezone. + + Parameters + ---------- + obj : _TSObject + tz : tzinfo + + Returns + ------- + None + + Notes + ----- + Sets obj.tzinfo inplace, alters obj.dts inplace. 
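The reasoning behind the year-based test in check_overflows is that any in-bounds value whose wall-clock year is 1677 must be negative (near the bottom of the int64 nanosecond range), and any value whose year is 2262 must be positive (near the top); a silent int64 wrap-around during timezone conversion breaks that sign/year pairing. For orientation only, with approximate values:

    import pandas as pd

    pd.Timestamp.min   # ~1677-09-21, .value is the most negative valid nanosecond count
    pd.Timestamp.max   # ~2262-04-11, .value is the most positive valid nanosecond count

    # Localizing Timestamp.min to a zone east of UTC pushes the underlying
    # UTC nanosecond count below the int64 minimum; without check_overflows
    # it would wrap to a large positive value while the wall-clock year
    # stayed 1677, which is exactly the mismatch being detected.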
""" cdef: ndarray[int64_t] trans, deltas diff --git a/pandas/tests/scalar/timestamp/test_timezones.py b/pandas/tests/scalar/timestamp/test_timezones.py index 7a5c6feb8b651..f43651dc6f0db 100644 --- a/pandas/tests/scalar/timestamp/test_timezones.py +++ b/pandas/tests/scalar/timestamp/test_timezones.py @@ -15,12 +15,29 @@ import pandas.util._test_decorators as td from pandas import Timestamp, NaT +from pandas.errors import OutOfBoundsDatetime class TestTimestampTZOperations(object): # -------------------------------------------------------------- # Timestamp.tz_localize + def test_tz_localize_pushes_out_of_bounds(self): + # GH#12677 + # tz_localize that pushes away from the boundary is OK + pac = Timestamp.min.tz_localize('US/Pacific') + assert pac.value > Timestamp.min.value + pac.tz_convert('Asia/Tokyo') # tz_convert doesn't change value + with pytest.raises(OutOfBoundsDatetime): + Timestamp.min.tz_localize('Asia/Tokyo') + + # tz_localize that pushes away from the boundary is OK + tokyo = Timestamp.max.tz_localize('Asia/Tokyo') + assert tokyo.value < Timestamp.max.value + tokyo.tz_convert('US/Pacific') # tz_convert doesn't change value + with pytest.raises(OutOfBoundsDatetime): + Timestamp.max.tz_localize('US/Pacific') + def test_tz_localize_ambiguous_bool(self): # make sure that we are correctly accepting bool values as ambiguous # GH#14402 From cd484cc525951320ee03c620f581c8bd9fa4000d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 10 Feb 2018 16:49:22 -0800 Subject: [PATCH 091/214] move shift_months test to test_arithmetic (#19636) --- .../tests/indexes/datetimes/test_arithmetic.py | 17 +++++++++++++++++ pandas/tests/indexes/datetimes/test_ops.py | 15 --------------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_arithmetic.py b/pandas/tests/indexes/datetimes/test_arithmetic.py index f6f8eccf4e30c..ddc97636ae0a8 100644 --- a/pandas/tests/indexes/datetimes/test_arithmetic.py +++ b/pandas/tests/indexes/datetimes/test_arithmetic.py @@ -15,6 +15,7 @@ DatetimeIndex, TimedeltaIndex, date_range) from pandas._libs import tslib +from pandas._libs.tslibs.offsets import shift_months @pytest.fixture(params=[None, 'UTC', 'Asia/Tokyo', @@ -933,3 +934,19 @@ def test_datetime64_with_DateOffset(klass, assert_func): Timestamp('2000-02-29', tz='US/Central')], name='a') assert_func(result, exp) assert_func(result2, exp) + + +@pytest.mark.parametrize('years', [-1, 0, 1]) +@pytest.mark.parametrize('months', [-2, 0, 2]) +def test_shift_months(years, months): + s = DatetimeIndex([Timestamp('2000-01-05 00:15:00'), + Timestamp('2000-01-31 00:23:00'), + Timestamp('2000-01-01'), + Timestamp('2000-02-29'), + Timestamp('2000-12-31')]) + actual = DatetimeIndex(shift_months(s.asi8, years * 12 + months)) + + raw = [x + pd.offsets.DateOffset(years=years, months=months) + for x in s] + expected = DatetimeIndex(raw) + tm.assert_index_equal(actual, expected) diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 4f386eb28cc0f..440478100ddd5 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -5,10 +5,8 @@ import numpy as np from datetime import datetime -from itertools import product import pandas as pd import pandas._libs.tslib as tslib -from pandas._libs.tslibs.offsets import shift_months import pandas.util.testing as tm from pandas import (DatetimeIndex, PeriodIndex, Series, Timestamp, date_range, _np_version_under1p10, Index, @@ -568,19 +566,6 @@ def 
test_equals(self): assert not idx.equals(pd.Series(idx3)) -@pytest.mark.parametrize('years,months', product([-1, 0, 1], [-2, 0, 2])) -def test_shift_months(years, months): - s = DatetimeIndex([Timestamp('2000-01-05 00:15:00'), - Timestamp('2000-01-31 00:23:00'), - Timestamp('2000-01-01'), - Timestamp('2000-02-29'), - Timestamp('2000-12-31')]) - actual = DatetimeIndex(shift_months(s.asi8, years * 12 + months)) - expected = DatetimeIndex([x + pd.offsets.DateOffset( - years=years, months=months) for x in s]) - tm.assert_index_equal(actual, expected) - - class TestBusinessDatetimeIndex(object): def setup_method(self, method): From 605a837ecdbbbdac1d666bde89457e19e5fdb983 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 11 Feb 2018 06:44:07 -0800 Subject: [PATCH 092/214] move libfreqs and liboffsets tests to test_tslibs, move parsing tests, with to_datetime test moved to test_tools (#19638) --- pandas/tests/indexes/datetimes/test_tools.py | 12 ++++++++++++ .../{tseries => tslibs}/test_libfrequencies.py | 0 .../{tseries/offsets => tslibs}/test_liboffsets.py | 0 pandas/tests/{scalar => tslibs}/test_parsing.py | 13 ------------- 4 files changed, 12 insertions(+), 13 deletions(-) rename pandas/tests/{tseries => tslibs}/test_libfrequencies.py (100%) rename pandas/tests/{tseries/offsets => tslibs}/test_liboffsets.py (100%) rename pandas/tests/{scalar => tslibs}/test_parsing.py (96%) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index b95ae07052ecb..35f34dc3a4974 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -187,6 +187,18 @@ def test_to_datetime_format_weeks(self, cache): class TestToDatetime(object): + def test_to_datetime_pydatetime(self): + actual = pd.to_datetime(datetime(2008, 1, 15)) + assert actual == datetime(2008, 1, 15) + + def test_to_datetime_YYYYMMDD(self): + actual = pd.to_datetime('20080115') + assert actual == datetime(2008, 1, 15) + + def test_to_datetime_unparseable_ignore(self): + # unparseable + s = 'Month 1, 1999' + assert pd.to_datetime(s, errors='ignore') == s @td.skip_if_windows # `tm.set_timezone` does not work in windows def test_to_datetime_now(self): diff --git a/pandas/tests/tseries/test_libfrequencies.py b/pandas/tests/tslibs/test_libfrequencies.py similarity index 100% rename from pandas/tests/tseries/test_libfrequencies.py rename to pandas/tests/tslibs/test_libfrequencies.py diff --git a/pandas/tests/tseries/offsets/test_liboffsets.py b/pandas/tests/tslibs/test_liboffsets.py similarity index 100% rename from pandas/tests/tseries/offsets/test_liboffsets.py rename to pandas/tests/tslibs/test_liboffsets.py diff --git a/pandas/tests/scalar/test_parsing.py b/pandas/tests/tslibs/test_parsing.py similarity index 96% rename from pandas/tests/scalar/test_parsing.py rename to pandas/tests/tslibs/test_parsing.py index bff0de649ac5e..34cce088a8b42 100644 --- a/pandas/tests/scalar/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -7,7 +7,6 @@ import pytest from dateutil.parser import parse -import pandas as pd import pandas.util._test_decorators as td from pandas.conftest import is_dateutil_le_261, is_dateutil_gt_261 from pandas import compat @@ -16,18 +15,6 @@ from pandas._libs.tslibs.parsing import parse_time_string -def test_to_datetime1(): - actual = pd.to_datetime(datetime(2008, 1, 15)) - assert actual == datetime(2008, 1, 15) - - actual = pd.to_datetime('20080115') - assert actual == datetime(2008, 1, 15) - - # unparseable - s = 'Month 
1, 1999' - assert pd.to_datetime(s, errors='ignore') == s - - class TestParseQuarters(object): def test_parse_time_string(self): From efce4928ece189ceed43729b826f89021ea7988f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 11 Feb 2018 06:48:36 -0800 Subject: [PATCH 093/214] Fix uncaught OutOfBounds in array_to_datetime (#19612) --- doc/source/whatsnew/v0.23.0.txt | 3 ++- pandas/_libs/tslib.pyx | 13 ++++++------- pandas/tests/indexes/datetimes/test_tools.py | 11 ++++++++++- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 6fdd551accbf1..acab9d0bbebf8 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -703,7 +703,7 @@ Datetimelike - Bug in :class:`Series` floor-division where operating on a scalar ``timedelta`` raises an exception (:issue:`18846`) - Bug in :class:`Series`` with ``dtype='timedelta64[ns]`` where addition or subtraction of ``TimedeltaIndex`` had results cast to ``dtype='int64'`` (:issue:`17250`) - Bug in :class:`TimedeltaIndex` where division by a ``Series`` would return a ``TimedeltaIndex`` instead of a ``Series`` (issue:`19042`) -- Bug in :class:`Series` with ``dtype='timedelta64[ns]`` where addition or subtraction of ``TimedeltaIndex`` could return a ``Series`` with an incorrect name (issue:`19043`) +- Bug in :class:`Series` with ``dtype='timedelta64[ns]`` where addition or subtraction of ``TimedeltaIndex`` could return a ``Series`` with an incorrect name (:issue:`19043`) - Bug in :class:`DatetimeIndex` where the repr was not showing high-precision time values at the end of a day (e.g., 23:59:59.999999999) (:issue:`19030`) - Bug where dividing a scalar timedelta-like object with :class:`TimedeltaIndex` performed the reciprocal operation (:issue:`19125`) - Bug in ``.astype()`` to non-ns timedelta units would hold the incorrect dtype (:issue:`19176`, :issue:`19223`, :issue:`12425`) @@ -713,6 +713,7 @@ Datetimelike - Bug in comparison of :class:`DatetimeIndex` against ``None`` or ``datetime.date`` objects raising ``TypeError`` for ``==`` and ``!=`` comparisons instead of all-``False`` and all-``True``, respectively (:issue:`19301`) - Bug in :class:`Timestamp` and :func:`to_datetime` where a string representing a barely out-of-bounds timestamp would be incorrectly rounded down instead of raising ``OutOfBoundsDatetime`` (:issue:`19382`) - Bug in :func:`Timestamp.floor` :func:`DatetimeIndex.floor` where time stamps far in the future and past were not rounded correctly (:issue:`19206`) +- Bug in :func:`to_datetime` where passing an out-of-bounds datetime with ``errors='coerce'`` and ``utc=True`` would raise ``OutOfBoundsDatetime`` instead of parsing to ``NaT`` (:issue:`19612`) - Timezones diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index a035bab2a7049..85e667521e5f2 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -524,11 +524,10 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', seen_datetime = 1 if val.tzinfo is not None: if utc_convert: - _ts = convert_datetime_to_tsobject(val, None) - iresult[i] = _ts.value try: - check_dts_bounds(&_ts.dts) - except ValueError: + _ts = convert_datetime_to_tsobject(val, None) + iresult[i] = _ts.value + except OutOfBoundsDatetime: if is_coerce: iresult[i] = NPY_NAT continue @@ -544,7 +543,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', iresult[i] += val.nanosecond try: check_dts_bounds(&dts) - except ValueError: + except OutOfBoundsDatetime: if 
is_coerce: iresult[i] = NPY_NAT continue @@ -555,7 +554,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', iresult[i] = pydate_to_dt64(val, &dts) try: check_dts_bounds(&dts) - except ValueError: + except OutOfBoundsDatetime: if is_coerce: iresult[i] = NPY_NAT continue @@ -568,7 +567,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', else: try: iresult[i] = get_datetime64_nanos(val) - except ValueError: + except OutOfBoundsDatetime: if is_coerce: iresult[i] = NPY_NAT continue diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 35f34dc3a4974..bd3fa5e73cd11 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -8,7 +8,7 @@ import dateutil import numpy as np from dateutil.parser import parse -from datetime import datetime, date, time +from datetime import datetime, date, time, timedelta from distutils.version import LooseVersion import pandas as pd @@ -1503,6 +1503,15 @@ def test_parsers_iso8601(self): class TestArrayToDatetime(object): + def test_coerce_out_of_bounds_utc(self): + # GH#19612 + ts = Timestamp('1900-01-01', tz='US/Pacific') + dt = ts.to_pydatetime() - timedelta(days=365 * 300) # ~1600AD + arr = np.array([dt]) + result = tslib.array_to_datetime(arr, utc=True, errors='coerce') + expected = np.array(['NaT'], dtype='datetime64[ns]') + tm.assert_numpy_array_equal(result, expected) + def test_parsing_valid_dates(self): arr = np.array(['01-01-2013', '01-02-2013'], dtype=object) tm.assert_numpy_array_equal( From b9b6408441b29c98bc8b44d7b6a9e05e0a94afcc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 11 Feb 2018 06:50:46 -0800 Subject: [PATCH 094/214] test_astype portion of #19627 (#19637) --- pandas/tests/indexes/datetimes/test_astype.py | 24 +++++++++++ pandas/tests/indexes/datetimes/test_ops.py | 43 ------------------- 2 files changed, 24 insertions(+), 43 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_astype.py b/pandas/tests/indexes/datetimes/test_astype.py index 4b989eb35e900..8acdd301f241a 100644 --- a/pandas/tests/indexes/datetimes/test_astype.py +++ b/pandas/tests/indexes/datetimes/test_astype.py @@ -138,6 +138,30 @@ def test_astype_object(self): tm.assert_index_equal(casted, Index(exp_values, dtype=np.object_)) assert casted.tolist() == exp_values + @pytest.mark.parametrize('tz', [None, 'Asia/Tokyo']) + def test_astype_object_tz(self, tz): + idx = pd.date_range(start='2013-01-01', periods=4, freq='M', + name='idx', tz=tz) + expected_list = [Timestamp('2013-01-31', tz=tz), + Timestamp('2013-02-28', tz=tz), + Timestamp('2013-03-31', tz=tz), + Timestamp('2013-04-30', tz=tz)] + expected = pd.Index(expected_list, dtype=object, name='idx') + result = idx.astype(object) + tm.assert_index_equal(result, expected) + assert idx.tolist() == expected_list + + def test_astype_object_with_nat(self): + idx = DatetimeIndex([datetime(2013, 1, 1), datetime(2013, 1, 2), + pd.NaT, datetime(2013, 1, 4)], name='idx') + expected_list = [Timestamp('2013-01-01'), + Timestamp('2013-01-02'), pd.NaT, + Timestamp('2013-01-04')] + expected = pd.Index(expected_list, dtype=object, name='idx') + result = idx.astype(object) + tm.assert_index_equal(result, expected) + assert idx.tolist() == expected_list + @pytest.mark.parametrize('dtype', [ float, 'timedelta64', 'timedelta64[ns]', 'datetime64', 'datetime64[D]']) diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 
440478100ddd5..bc43b427fe0aa 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -49,49 +49,6 @@ def test_ops_properties_basic(self): assert s.day == 10 pytest.raises(AttributeError, lambda: s.weekday) - def test_astype_object(self): - idx = pd.date_range(start='2013-01-01', periods=4, freq='M', - name='idx') - expected_list = [Timestamp('2013-01-31'), - Timestamp('2013-02-28'), - Timestamp('2013-03-31'), - Timestamp('2013-04-30')] - expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.astype(object) - assert isinstance(result, Index) - - assert result.dtype == object - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert idx.tolist() == expected_list - - idx = pd.date_range(start='2013-01-01', periods=4, freq='M', - name='idx', tz='Asia/Tokyo') - expected_list = [Timestamp('2013-01-31', tz='Asia/Tokyo'), - Timestamp('2013-02-28', tz='Asia/Tokyo'), - Timestamp('2013-03-31', tz='Asia/Tokyo'), - Timestamp('2013-04-30', tz='Asia/Tokyo')] - expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.astype(object) - assert isinstance(result, Index) - assert result.dtype == object - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert idx.tolist() == expected_list - - idx = DatetimeIndex([datetime(2013, 1, 1), datetime(2013, 1, 2), - pd.NaT, datetime(2013, 1, 4)], name='idx') - expected_list = [Timestamp('2013-01-01'), - Timestamp('2013-01-02'), pd.NaT, - Timestamp('2013-01-04')] - expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.astype(object) - assert isinstance(result, Index) - assert result.dtype == object - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert idx.tolist() == expected_list - def test_minmax(self): for tz in self.tz: # monotonic From 13eac4bbf0133530a867fd92f65b4433b28aaaef Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 11 Feb 2018 06:51:47 -0800 Subject: [PATCH 095/214] move timedelta test_astype test (#19639) --- .../tests/indexes/timedeltas/test_astype.py | 20 ++++++++++++++ pandas/tests/indexes/timedeltas/test_ops.py | 27 +------------------ 2 files changed, 21 insertions(+), 26 deletions(-) diff --git a/pandas/tests/indexes/timedeltas/test_astype.py b/pandas/tests/indexes/timedeltas/test_astype.py index 6c644d239069a..329f0c2467e8b 100644 --- a/pandas/tests/indexes/timedeltas/test_astype.py +++ b/pandas/tests/indexes/timedeltas/test_astype.py @@ -1,3 +1,5 @@ +from datetime import timedelta + import pytest import numpy as np @@ -8,6 +10,24 @@ class TestTimedeltaIndex(object): + def test_astype_object(self): + idx = timedelta_range(start='1 days', periods=4, freq='D', name='idx') + expected_list = [Timedelta('1 days'), Timedelta('2 days'), + Timedelta('3 days'), Timedelta('4 days')] + result = idx.astype(object) + expected = Index(expected_list, dtype=object, name='idx') + tm.assert_index_equal(result, expected) + assert idx.tolist() == expected_list + + def test_astype_object_with_nat(self): + idx = TimedeltaIndex([timedelta(days=1), timedelta(days=2), NaT, + timedelta(days=4)], name='idx') + expected_list = [Timedelta('1 days'), Timedelta('2 days'), NaT, + Timedelta('4 days')] + result = idx.astype(object) + expected = Index(expected_list, dtype=object, name='idx') + tm.assert_index_equal(result, expected) + assert idx.tolist() == expected_list def test_astype(self): # GH 13149, GH 13209 diff --git 
a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index 86d7dd4e1b117..d154aa2172ef7 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -8,7 +8,7 @@ from pandas import to_timedelta from pandas import (Series, Timedelta, Timestamp, TimedeltaIndex, timedelta_range, - _np_version_under1p10, Index) + _np_version_under1p10) from pandas._libs.tslib import iNaT from pandas.tests.test_base import Ops @@ -25,31 +25,6 @@ def test_ops_properties(self): self.check_ops_properties(TimedeltaIndex._field_ops, f) self.check_ops_properties(TimedeltaIndex._object_ops, f) - def test_astype_object(self): - idx = timedelta_range(start='1 days', periods=4, freq='D', name='idx') - expected_list = [Timedelta('1 days'), Timedelta('2 days'), - Timedelta('3 days'), Timedelta('4 days')] - expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.astype(object) - assert isinstance(result, Index) - - assert result.dtype == object - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert idx.tolist() == expected_list - - idx = TimedeltaIndex([timedelta(days=1), timedelta(days=2), pd.NaT, - timedelta(days=4)], name='idx') - expected_list = [Timedelta('1 days'), Timedelta('2 days'), pd.NaT, - Timedelta('4 days')] - expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.astype(object) - assert isinstance(result, Index) - assert result.dtype == object - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert idx.tolist() == expected_list - def test_minmax(self): # monotonic From 324379ce75269aa6bced90ecf3edb692539a2742 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 11 Feb 2018 06:56:28 -0800 Subject: [PATCH 096/214] Organize PeriodIndex tests (#19641) --- .../tests/indexes/period/test_arithmetic.py | 256 ++++++++++++++++++ pandas/tests/indexes/period/test_ops.py | 192 +------------ pandas/tests/indexes/period/test_period.py | 78 +----- .../indexes/period/test_scalar_compat.py | 17 ++ pandas/tests/indexes/period/test_tools.py | 79 ------ pandas/tests/tslibs/test_period_asfreq.py | 81 ++++++ 6 files changed, 358 insertions(+), 345 deletions(-) create mode 100644 pandas/tests/indexes/period/test_scalar_compat.py create mode 100644 pandas/tests/tslibs/test_period_asfreq.py diff --git a/pandas/tests/indexes/period/test_arithmetic.py b/pandas/tests/indexes/period/test_arithmetic.py index 356ea5fc656de..81171920f635f 100644 --- a/pandas/tests/indexes/period/test_arithmetic.py +++ b/pandas/tests/indexes/period/test_arithmetic.py @@ -11,6 +11,171 @@ import pandas.core.indexes.period as period +class TestPeriodIndexComparisons(object): + @pytest.mark.parametrize('freq', ['M', '2M', '3M']) + def test_pi_cmp_pi(self, freq): + base = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], + freq=freq) + per = Period('2011-02', freq=freq) + + exp = np.array([False, True, False, False]) + tm.assert_numpy_array_equal(base == per, exp) + tm.assert_numpy_array_equal(per == base, exp) + + exp = np.array([True, False, True, True]) + tm.assert_numpy_array_equal(base != per, exp) + tm.assert_numpy_array_equal(per != base, exp) + + exp = np.array([False, False, True, True]) + tm.assert_numpy_array_equal(base > per, exp) + tm.assert_numpy_array_equal(per < base, exp) + + exp = np.array([True, False, False, False]) + tm.assert_numpy_array_equal(base < per, exp) + tm.assert_numpy_array_equal(per > base, exp) + + exp = np.array([False, True, 
True, True]) + tm.assert_numpy_array_equal(base >= per, exp) + tm.assert_numpy_array_equal(per <= base, exp) + + exp = np.array([True, True, False, False]) + tm.assert_numpy_array_equal(base <= per, exp) + tm.assert_numpy_array_equal(per >= base, exp) + + idx = PeriodIndex(['2011-02', '2011-01', '2011-03', '2011-05'], + freq=freq) + + exp = np.array([False, False, True, False]) + tm.assert_numpy_array_equal(base == idx, exp) + + exp = np.array([True, True, False, True]) + tm.assert_numpy_array_equal(base != idx, exp) + + exp = np.array([False, True, False, False]) + tm.assert_numpy_array_equal(base > idx, exp) + + exp = np.array([True, False, False, True]) + tm.assert_numpy_array_equal(base < idx, exp) + + exp = np.array([False, True, True, False]) + tm.assert_numpy_array_equal(base >= idx, exp) + + exp = np.array([True, False, True, True]) + tm.assert_numpy_array_equal(base <= idx, exp) + + @pytest.mark.parametrize('freq', ['M', '2M', '3M']) + def test_pi_cmp_pi_mismatched_freq_raises(self, freq): + # different base freq + base = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], + freq=freq) + + msg = "Input has different freq=A-DEC from PeriodIndex" + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + base <= Period('2011', freq='A') + + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + Period('2011', freq='A') >= base + + idx = PeriodIndex(['2011', '2012', '2013', '2014'], freq='A') + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + base <= idx + + # Different frequency + msg = "Input has different freq=4M from PeriodIndex" + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + base <= Period('2011', freq='4M') + + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + Period('2011', freq='4M') >= base + + idx = PeriodIndex(['2011', '2012', '2013', '2014'], freq='4M') + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + base <= idx + + @pytest.mark.parametrize('freq', ['M', '2M', '3M']) + def test_pi_cmp_nat(self, freq): + idx1 = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-05'], freq=freq) + + result = idx1 > Period('2011-02', freq=freq) + exp = np.array([False, False, False, True]) + tm.assert_numpy_array_equal(result, exp) + result = Period('2011-02', freq=freq) < idx1 + tm.assert_numpy_array_equal(result, exp) + + result = idx1 == Period('NaT', freq=freq) + exp = np.array([False, False, False, False]) + tm.assert_numpy_array_equal(result, exp) + result = Period('NaT', freq=freq) == idx1 + tm.assert_numpy_array_equal(result, exp) + + result = idx1 != Period('NaT', freq=freq) + exp = np.array([True, True, True, True]) + tm.assert_numpy_array_equal(result, exp) + result = Period('NaT', freq=freq) != idx1 + tm.assert_numpy_array_equal(result, exp) + + idx2 = PeriodIndex(['2011-02', '2011-01', '2011-04', 'NaT'], freq=freq) + result = idx1 < idx2 + exp = np.array([True, False, False, False]) + tm.assert_numpy_array_equal(result, exp) + + result = idx1 == idx2 + exp = np.array([False, False, False, False]) + tm.assert_numpy_array_equal(result, exp) + + result = idx1 != idx2 + exp = np.array([True, True, True, True]) + tm.assert_numpy_array_equal(result, exp) + + result = idx1 == idx1 + exp = np.array([True, True, False, True]) + tm.assert_numpy_array_equal(result, exp) + + result = idx1 != idx1 + exp = np.array([False, False, True, False]) + tm.assert_numpy_array_equal(result, exp) + + @pytest.mark.parametrize('freq', ['M', '2M', '3M']) + def test_pi_cmp_nat_mismatched_freq_raises(self, freq): 
+ idx1 = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-05'], freq=freq) + + diff = PeriodIndex(['2011-02', '2011-01', '2011-04', 'NaT'], freq='4M') + msg = "Input has different freq=4M from PeriodIndex" + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + idx1 > diff + + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + idx1 == diff + + # TODO: De-duplicate with test_pi_cmp_nat + def test_comp_nat(self): + left = pd.PeriodIndex([pd.Period('2011-01-01'), pd.NaT, + pd.Period('2011-01-03')]) + right = pd.PeriodIndex([pd.NaT, pd.NaT, pd.Period('2011-01-03')]) + + for lhs, rhs in [(left, right), + (left.astype(object), right.astype(object))]: + result = lhs == rhs + expected = np.array([False, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = lhs != rhs + expected = np.array([True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(lhs == pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT == rhs, expected) + + expected = np.array([True, True, True]) + tm.assert_numpy_array_equal(lhs != pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT != lhs, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(lhs < pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT > lhs, expected) + + class TestPeriodIndexArithmetic(object): def test_pi_add_offset_array(self): # GH#18849 @@ -250,6 +415,97 @@ def test_sub_isub(self): rng -= 1 tm.assert_index_equal(rng, expected) + # --------------------------------------------------------------- + # PeriodIndex.shift is used by __add__ and __sub__ + + def test_pi_shift_ndarray(self): + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', + '2011-04'], freq='M', name='idx') + result = idx.shift(np.array([1, 2, 3, 4])) + expected = PeriodIndex(['2011-02', '2011-04', 'NaT', + '2011-08'], freq='M', name='idx') + tm.assert_index_equal(result, expected) + + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', + '2011-04'], freq='M', name='idx') + result = idx.shift(np.array([1, -2, 3, -4])) + expected = PeriodIndex(['2011-02', '2010-12', 'NaT', + '2010-12'], freq='M', name='idx') + tm.assert_index_equal(result, expected) + + def test_shift(self): + pi1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='A', start='1/1/2002', end='12/1/2010') + + tm.assert_index_equal(pi1.shift(0), pi1) + + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(1), pi2) + + pi1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='A', start='1/1/2000', end='12/1/2008') + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(-1), pi2) + + pi1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='M', start='2/1/2001', end='1/1/2010') + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(1), pi2) + + pi1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='M', start='12/1/2000', end='11/1/2009') + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(-1), pi2) + + pi1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='D', start='1/2/2001', end='12/2/2009') + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(1), pi2) + + pi1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='D', start='12/31/2000', end='11/30/2009') + assert len(pi1) == len(pi2) + tm.assert_index_equal(pi1.shift(-1), pi2) 
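The shift semantics exercised above amount to moving every element by n periods of the index's own frequency; PeriodIndex.shift takes no freq argument, and the corner-case test that follows checks that passing one raises. For orientation only, assuming standard pandas behaviour:

    import pandas as pd

    pi = pd.period_range('2001-01', '2001-04', freq='M')
    pi.shift(1)    # PeriodIndex(['2001-02', '2001-03', '2001-04', '2001-05'], freq='M')
    pi.shift(-1)   # PeriodIndex(['2000-12', '2001-01', '2001-02', '2001-03'], freq='M')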
+ + def test_shift_corner_cases(self): + # GH#9903 + idx = pd.PeriodIndex([], name='xxx', freq='H') + + with pytest.raises(TypeError): + # period shift doesn't accept freq + idx.shift(1, freq='H') + + tm.assert_index_equal(idx.shift(0), idx) + tm.assert_index_equal(idx.shift(3), idx) + + idx = pd.PeriodIndex(['2011-01-01 10:00', '2011-01-01 11:00' + '2011-01-01 12:00'], name='xxx', freq='H') + tm.assert_index_equal(idx.shift(0), idx) + exp = pd.PeriodIndex(['2011-01-01 13:00', '2011-01-01 14:00' + '2011-01-01 15:00'], name='xxx', freq='H') + tm.assert_index_equal(idx.shift(3), exp) + exp = pd.PeriodIndex(['2011-01-01 07:00', '2011-01-01 08:00' + '2011-01-01 09:00'], name='xxx', freq='H') + tm.assert_index_equal(idx.shift(-3), exp) + + def test_shift_nat(self): + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', + '2011-04'], freq='M', name='idx') + result = idx.shift(1) + expected = PeriodIndex(['2011-02', '2011-03', 'NaT', + '2011-05'], freq='M', name='idx') + tm.assert_index_equal(result, expected) + assert result.name == expected.name + + def test_shift_gh8083(self): + # test shift for PeriodIndex + # GH#8083 + drange = pd.period_range('20130101', periods=5, freq='D') + result = drange.shift(1) + expected = PeriodIndex(['2013-01-02', '2013-01-03', '2013-01-04', + '2013-01-05', '2013-01-06'], freq='D') + tm.assert_index_equal(result, expected) + class TestPeriodIndexSeriesMethods(object): """ Test PeriodIndex and Period Series Ops consistency """ diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 21a9ffdde3444..8745de0c2a7aa 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -1,11 +1,9 @@ -import pytest import numpy as np import pandas as pd import pandas._libs.tslib as tslib import pandas.util.testing as tm -import pandas.core.indexes.period as period from pandas import (DatetimeIndex, PeriodIndex, Series, Period, _np_version_under1p10, Index) @@ -521,25 +519,8 @@ def test_nat_new(self): tm.assert_numpy_array_equal(result, exp) def test_shift(self): - # GH 9903 - idx = pd.PeriodIndex([], name='xxx', freq='H') - - with pytest.raises(TypeError): - # period shift doesn't accept freq - idx.shift(1, freq='H') - - tm.assert_index_equal(idx.shift(0), idx) - tm.assert_index_equal(idx.shift(3), idx) - - idx = pd.PeriodIndex(['2011-01-01 10:00', '2011-01-01 11:00' - '2011-01-01 12:00'], name='xxx', freq='H') - tm.assert_index_equal(idx.shift(0), idx) - exp = pd.PeriodIndex(['2011-01-01 13:00', '2011-01-01 14:00' - '2011-01-01 15:00'], name='xxx', freq='H') - tm.assert_index_equal(idx.shift(3), exp) - exp = pd.PeriodIndex(['2011-01-01 07:00', '2011-01-01 08:00' - '2011-01-01 09:00'], name='xxx', freq='H') - tm.assert_index_equal(idx.shift(-3), exp) + # This is tested in test_arithmetic + pass def test_repeat(self): index = pd.period_range('2001-01-01', periods=2, freq='D') @@ -703,172 +684,3 @@ def test_pi_comp_period_nat(self): f = lambda x: tslib.NaT >= x exp = np.array([False, False, False, False], dtype=np.bool) self._check(idx, f, exp) - - -class TestPeriodIndexComparisons(object): - - def test_pi_pi_comp(self): - - for freq in ['M', '2M', '3M']: - base = PeriodIndex(['2011-01', '2011-02', - '2011-03', '2011-04'], freq=freq) - p = Period('2011-02', freq=freq) - - exp = np.array([False, True, False, False]) - tm.assert_numpy_array_equal(base == p, exp) - tm.assert_numpy_array_equal(p == base, exp) - - exp = np.array([True, False, True, True]) - tm.assert_numpy_array_equal(base != p, exp) - 
tm.assert_numpy_array_equal(p != base, exp) - - exp = np.array([False, False, True, True]) - tm.assert_numpy_array_equal(base > p, exp) - tm.assert_numpy_array_equal(p < base, exp) - - exp = np.array([True, False, False, False]) - tm.assert_numpy_array_equal(base < p, exp) - tm.assert_numpy_array_equal(p > base, exp) - - exp = np.array([False, True, True, True]) - tm.assert_numpy_array_equal(base >= p, exp) - tm.assert_numpy_array_equal(p <= base, exp) - - exp = np.array([True, True, False, False]) - tm.assert_numpy_array_equal(base <= p, exp) - tm.assert_numpy_array_equal(p >= base, exp) - - idx = PeriodIndex(['2011-02', '2011-01', '2011-03', - '2011-05'], freq=freq) - - exp = np.array([False, False, True, False]) - tm.assert_numpy_array_equal(base == idx, exp) - - exp = np.array([True, True, False, True]) - tm.assert_numpy_array_equal(base != idx, exp) - - exp = np.array([False, True, False, False]) - tm.assert_numpy_array_equal(base > idx, exp) - - exp = np.array([True, False, False, True]) - tm.assert_numpy_array_equal(base < idx, exp) - - exp = np.array([False, True, True, False]) - tm.assert_numpy_array_equal(base >= idx, exp) - - exp = np.array([True, False, True, True]) - tm.assert_numpy_array_equal(base <= idx, exp) - - # different base freq - msg = "Input has different freq=A-DEC from PeriodIndex" - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - base <= Period('2011', freq='A') - - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - Period('2011', freq='A') >= base - - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - idx = PeriodIndex(['2011', '2012', '2013', '2014'], freq='A') - base <= idx - - # Different frequency - msg = "Input has different freq=4M from PeriodIndex" - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - base <= Period('2011', freq='4M') - - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - Period('2011', freq='4M') >= base - - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - idx = PeriodIndex(['2011', '2012', '2013', '2014'], freq='4M') - base <= idx - - def test_pi_nat_comp(self): - for freq in ['M', '2M', '3M']: - idx1 = PeriodIndex( - ['2011-01', '2011-02', 'NaT', '2011-05'], freq=freq) - - result = idx1 > Period('2011-02', freq=freq) - exp = np.array([False, False, False, True]) - tm.assert_numpy_array_equal(result, exp) - result = Period('2011-02', freq=freq) < idx1 - tm.assert_numpy_array_equal(result, exp) - - result = idx1 == Period('NaT', freq=freq) - exp = np.array([False, False, False, False]) - tm.assert_numpy_array_equal(result, exp) - result = Period('NaT', freq=freq) == idx1 - tm.assert_numpy_array_equal(result, exp) - - result = idx1 != Period('NaT', freq=freq) - exp = np.array([True, True, True, True]) - tm.assert_numpy_array_equal(result, exp) - result = Period('NaT', freq=freq) != idx1 - tm.assert_numpy_array_equal(result, exp) - - idx2 = PeriodIndex(['2011-02', '2011-01', '2011-04', - 'NaT'], freq=freq) - result = idx1 < idx2 - exp = np.array([True, False, False, False]) - tm.assert_numpy_array_equal(result, exp) - - result = idx1 == idx2 - exp = np.array([False, False, False, False]) - tm.assert_numpy_array_equal(result, exp) - - result = idx1 != idx2 - exp = np.array([True, True, True, True]) - tm.assert_numpy_array_equal(result, exp) - - result = idx1 == idx1 - exp = np.array([True, True, False, True]) - tm.assert_numpy_array_equal(result, exp) - - result = idx1 != idx1 - exp = np.array([False, False, True, False]) - 
tm.assert_numpy_array_equal(result, exp) - - diff = PeriodIndex(['2011-02', '2011-01', '2011-04', - 'NaT'], freq='4M') - msg = "Input has different freq=4M from PeriodIndex" - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - idx1 > diff - - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - idx1 == diff - - # TODO: De-duplicate with test_pi_nat_comp - def test_comp_nat(self): - left = pd.PeriodIndex([pd.Period('2011-01-01'), pd.NaT, - pd.Period('2011-01-03')]) - right = pd.PeriodIndex([pd.NaT, pd.NaT, pd.Period('2011-01-03')]) - - for lhs, rhs in [(left, right), - (left.astype(object), right.astype(object))]: - result = lhs == rhs - expected = np.array([False, False, True]) - tm.assert_numpy_array_equal(result, expected) - - result = lhs != rhs - expected = np.array([True, True, False]) - tm.assert_numpy_array_equal(result, expected) - - expected = np.array([False, False, False]) - tm.assert_numpy_array_equal(lhs == pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT == rhs, expected) - - expected = np.array([True, True, True]) - tm.assert_numpy_array_equal(lhs != pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT != lhs, expected) - - expected = np.array([False, False, False]) - tm.assert_numpy_array_equal(lhs < pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT > lhs, expected) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 6fc7fa5486f82..f3469b829f8a3 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -453,16 +453,6 @@ def test_periods_number_check(self): with pytest.raises(ValueError): period_range('2011-1-1', '2012-1-1', 'B') - def test_start_time(self): - index = PeriodIndex(freq='M', start='2016-01-01', end='2016-05-31') - expected_index = date_range('2016-01-01', end='2016-05-31', freq='MS') - tm.assert_index_equal(index.start_time, expected_index) - - def test_end_time(self): - index = PeriodIndex(freq='M', start='2016-01-01', end='2016-05-31') - expected_index = date_range('2016-01-01', end='2016-05-31', freq='M') - tm.assert_index_equal(index.end_time, expected_index) - def test_index_duplicate_periods(self): # monotonic idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq='A-JUN') @@ -495,78 +485,14 @@ def test_index_unique(self): tm.assert_index_equal(idx.unique(), expected) assert idx.nunique() == 3 - def test_shift_gh8083(self): - - # test shift for PeriodIndex - # GH8083 - drange = self.create_index() - result = drange.shift(1) - expected = PeriodIndex(['2013-01-02', '2013-01-03', '2013-01-04', - '2013-01-05', '2013-01-06'], freq='D') - tm.assert_index_equal(result, expected) - def test_shift(self): - pi1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') - pi2 = PeriodIndex(freq='A', start='1/1/2002', end='12/1/2010') - - tm.assert_index_equal(pi1.shift(0), pi1) - - assert len(pi1) == len(pi2) - tm.assert_index_equal(pi1.shift(1), pi2) - - pi1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') - pi2 = PeriodIndex(freq='A', start='1/1/2000', end='12/1/2008') - assert len(pi1) == len(pi2) - tm.assert_index_equal(pi1.shift(-1), pi2) - - pi1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') - pi2 = PeriodIndex(freq='M', start='2/1/2001', end='1/1/2010') - assert len(pi1) == len(pi2) - tm.assert_index_equal(pi1.shift(1), pi2) - - pi1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') - pi2 = PeriodIndex(freq='M', start='12/1/2000', end='11/1/2009') - assert len(pi1) == len(pi2) - 
tm.assert_index_equal(pi1.shift(-1), pi2) - - pi1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') - pi2 = PeriodIndex(freq='D', start='1/2/2001', end='12/2/2009') - assert len(pi1) == len(pi2) - tm.assert_index_equal(pi1.shift(1), pi2) - - pi1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') - pi2 = PeriodIndex(freq='D', start='12/31/2000', end='11/30/2009') - assert len(pi1) == len(pi2) - tm.assert_index_equal(pi1.shift(-1), pi2) - - def test_shift_nat(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', - '2011-04'], freq='M', name='idx') - result = idx.shift(1) - expected = PeriodIndex(['2011-02', '2011-03', 'NaT', - '2011-05'], freq='M', name='idx') - tm.assert_index_equal(result, expected) - assert result.name == expected.name + # This is tested in test_arithmetic + pass @td.skip_if_32bit def test_ndarray_compat_properties(self): super(TestPeriodIndex, self).test_ndarray_compat_properties() - def test_shift_ndarray(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', - '2011-04'], freq='M', name='idx') - result = idx.shift(np.array([1, 2, 3, 4])) - expected = PeriodIndex(['2011-02', '2011-04', 'NaT', - '2011-08'], freq='M', name='idx') - tm.assert_index_equal(result, expected) - - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', - '2011-04'], freq='M', name='idx') - result = idx.shift(np.array([1, -2, 3, -4])) - expected = PeriodIndex(['2011-02', '2010-12', 'NaT', - '2010-12'], freq='M', name='idx') - tm.assert_index_equal(result, expected) - def test_negative_ordinals(self): Period(ordinal=-1000, freq='A') Period(ordinal=0, freq='A') diff --git a/pandas/tests/indexes/period/test_scalar_compat.py b/pandas/tests/indexes/period/test_scalar_compat.py new file mode 100644 index 0000000000000..56bd2adf58719 --- /dev/null +++ b/pandas/tests/indexes/period/test_scalar_compat.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- +"""Tests for PeriodIndex behaving like a vectorized Period scalar""" + +from pandas import PeriodIndex, date_range +import pandas.util.testing as tm + + +class TestPeriodIndexOps(object): + def test_start_time(self): + index = PeriodIndex(freq='M', start='2016-01-01', end='2016-05-31') + expected_index = date_range('2016-01-01', end='2016-05-31', freq='MS') + tm.assert_index_equal(index.start_time, expected_index) + + def test_end_time(self): + index = PeriodIndex(freq='M', start='2016-01-01', end='2016-05-31') + expected_index = date_range('2016-01-01', end='2016-05-31', freq='M') + tm.assert_index_equal(index.end_time, expected_index) diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py index 0e72cadb5d494..f5e7c8269dc4f 100644 --- a/pandas/tests/indexes/period/test_tools.py +++ b/pandas/tests/indexes/period/test_tools.py @@ -6,8 +6,6 @@ import pandas.core.indexes.period as period from pandas.compat import lrange -from pandas._libs.tslibs.frequencies import get_freq -from pandas._libs.tslibs.period import period_ordinal, period_asfreq from pandas._libs.tslibs.ccalendar import MONTHS from pandas import (PeriodIndex, Period, DatetimeIndex, Timestamp, Series, @@ -76,83 +74,6 @@ def test_negone_ordinals(self): repr(period) -class TestTslib(object): - def test_intraday_conversion_factors(self): - assert period_asfreq(1, get_freq('D'), get_freq('H'), False) == 24 - assert period_asfreq(1, get_freq('D'), get_freq('T'), False) == 1440 - assert period_asfreq(1, get_freq('D'), get_freq('S'), False) == 86400 - assert period_asfreq(1, get_freq('D'), - get_freq('L'), False) == 86400000 - assert period_asfreq(1, 
get_freq('D'), - get_freq('U'), False) == 86400000000 - assert period_asfreq(1, get_freq('D'), - get_freq('N'), False) == 86400000000000 - - assert period_asfreq(1, get_freq('H'), get_freq('T'), False) == 60 - assert period_asfreq(1, get_freq('H'), get_freq('S'), False) == 3600 - assert period_asfreq(1, get_freq('H'), - get_freq('L'), False) == 3600000 - assert period_asfreq(1, get_freq('H'), - get_freq('U'), False) == 3600000000 - assert period_asfreq(1, get_freq('H'), - get_freq('N'), False) == 3600000000000 - - assert period_asfreq(1, get_freq('T'), get_freq('S'), False) == 60 - assert period_asfreq(1, get_freq('T'), get_freq('L'), False) == 60000 - assert period_asfreq(1, get_freq('T'), - get_freq('U'), False) == 60000000 - assert period_asfreq(1, get_freq('T'), - get_freq('N'), False) == 60000000000 - - assert period_asfreq(1, get_freq('S'), get_freq('L'), False) == 1000 - assert period_asfreq(1, get_freq('S'), - get_freq('U'), False) == 1000000 - assert period_asfreq(1, get_freq('S'), - get_freq('N'), False) == 1000000000 - - assert period_asfreq(1, get_freq('L'), get_freq('U'), False) == 1000 - assert period_asfreq(1, get_freq('L'), - get_freq('N'), False) == 1000000 - - assert period_asfreq(1, get_freq('U'), get_freq('N'), False) == 1000 - - def test_period_ordinal_start_values(self): - # information for 1.1.1970 - assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('A')) == 0 - assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('M')) == 0 - assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('W')) == 1 - assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('D')) == 0 - assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('B')) == 0 - - def test_period_ordinal_week(self): - assert period_ordinal(1970, 1, 4, 0, 0, 0, 0, 0, get_freq('W')) == 1 - assert period_ordinal(1970, 1, 5, 0, 0, 0, 0, 0, get_freq('W')) == 2 - assert period_ordinal(2013, 10, 6, 0, - 0, 0, 0, 0, get_freq('W')) == 2284 - assert period_ordinal(2013, 10, 7, 0, - 0, 0, 0, 0, get_freq('W')) == 2285 - - def test_period_ordinal_business_day(self): - # Thursday - assert period_ordinal(2013, 10, 3, 0, - 0, 0, 0, 0, get_freq('B')) == 11415 - # Friday - assert period_ordinal(2013, 10, 4, 0, - 0, 0, 0, 0, get_freq('B')) == 11416 - # Saturday - assert period_ordinal(2013, 10, 5, 0, - 0, 0, 0, 0, get_freq('B')) == 11417 - # Sunday - assert period_ordinal(2013, 10, 6, 0, - 0, 0, 0, 0, get_freq('B')) == 11417 - # Monday - assert period_ordinal(2013, 10, 7, 0, - 0, 0, 0, 0, get_freq('B')) == 11417 - # Tuesday - assert period_ordinal(2013, 10, 8, 0, - 0, 0, 0, 0, get_freq('B')) == 11418 - - class TestPeriodIndex(object): def setup_method(self, method): diff --git a/pandas/tests/tslibs/test_period_asfreq.py b/pandas/tests/tslibs/test_period_asfreq.py new file mode 100644 index 0000000000000..98959adf6fda4 --- /dev/null +++ b/pandas/tests/tslibs/test_period_asfreq.py @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- + +from pandas._libs.tslibs.frequencies import get_freq +from pandas._libs.tslibs.period import period_ordinal, period_asfreq + + +class TestPeriodFreqConversion(object): + def test_intraday_conversion_factors(self): + assert period_asfreq(1, get_freq('D'), get_freq('H'), False) == 24 + assert period_asfreq(1, get_freq('D'), get_freq('T'), False) == 1440 + assert period_asfreq(1, get_freq('D'), get_freq('S'), False) == 86400 + assert period_asfreq(1, get_freq('D'), + get_freq('L'), False) == 86400000 + assert period_asfreq(1, get_freq('D'), + get_freq('U'), False) == 86400000000 + assert 
period_asfreq(1, get_freq('D'), + get_freq('N'), False) == 86400000000000 + + assert period_asfreq(1, get_freq('H'), get_freq('T'), False) == 60 + assert period_asfreq(1, get_freq('H'), get_freq('S'), False) == 3600 + assert period_asfreq(1, get_freq('H'), + get_freq('L'), False) == 3600000 + assert period_asfreq(1, get_freq('H'), + get_freq('U'), False) == 3600000000 + assert period_asfreq(1, get_freq('H'), + get_freq('N'), False) == 3600000000000 + + assert period_asfreq(1, get_freq('T'), get_freq('S'), False) == 60 + assert period_asfreq(1, get_freq('T'), get_freq('L'), False) == 60000 + assert period_asfreq(1, get_freq('T'), + get_freq('U'), False) == 60000000 + assert period_asfreq(1, get_freq('T'), + get_freq('N'), False) == 60000000000 + + assert period_asfreq(1, get_freq('S'), get_freq('L'), False) == 1000 + assert period_asfreq(1, get_freq('S'), + get_freq('U'), False) == 1000000 + assert period_asfreq(1, get_freq('S'), + get_freq('N'), False) == 1000000000 + + assert period_asfreq(1, get_freq('L'), get_freq('U'), False) == 1000 + assert period_asfreq(1, get_freq('L'), + get_freq('N'), False) == 1000000 + + assert period_asfreq(1, get_freq('U'), get_freq('N'), False) == 1000 + + def test_period_ordinal_start_values(self): + # information for 1.1.1970 + assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('A')) == 0 + assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('M')) == 0 + assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('W')) == 1 + assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('D')) == 0 + assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq('B')) == 0 + + def test_period_ordinal_week(self): + assert period_ordinal(1970, 1, 4, 0, 0, 0, 0, 0, get_freq('W')) == 1 + assert period_ordinal(1970, 1, 5, 0, 0, 0, 0, 0, get_freq('W')) == 2 + assert period_ordinal(2013, 10, 6, 0, + 0, 0, 0, 0, get_freq('W')) == 2284 + assert period_ordinal(2013, 10, 7, 0, + 0, 0, 0, 0, get_freq('W')) == 2285 + + def test_period_ordinal_business_day(self): + # Thursday + assert period_ordinal(2013, 10, 3, 0, + 0, 0, 0, 0, get_freq('B')) == 11415 + # Friday + assert period_ordinal(2013, 10, 4, 0, + 0, 0, 0, 0, get_freq('B')) == 11416 + # Saturday + assert period_ordinal(2013, 10, 5, 0, + 0, 0, 0, 0, get_freq('B')) == 11417 + # Sunday + assert period_ordinal(2013, 10, 6, 0, + 0, 0, 0, 0, get_freq('B')) == 11417 + # Monday + assert period_ordinal(2013, 10, 7, 0, + 0, 0, 0, 0, get_freq('B')) == 11417 + # Tuesday + assert period_ordinal(2013, 10, 8, 0, + 0, 0, 0, 0, get_freq('B')) == 11418 From b9d8b264a36c58261706a93b34dab81e4f0930a3 Mon Sep 17 00:00:00 2001 From: Licht Takeuchi Date: Mon, 12 Feb 2018 06:11:10 +0900 Subject: [PATCH 097/214] TST: Add to_csv test when writing the single column CSV (#19091) Closes gh-18676 --- pandas/tests/io/formats/test_to_csv.py | 32 ++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index e12a7196dce6b..dfa3751bff57a 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +import sys import numpy as np import pandas as pd import pytest @@ -9,6 +10,37 @@ class TestToCSV(object): + @pytest.mark.xfail((3, 6, 5) > sys.version_info >= (3, 5), + reason=("Python csv library bug " + "(see https://bugs.python.org/issue32255)")) + def test_to_csv_with_single_column(self): + # see gh-18676, https://bugs.python.org/issue32255 + # + # Python's CSV library adds an 
extraneous '""' + # before the newline when the NaN-value is in + # the first row. Otherwise, only the newline + # character is added. This behavior is inconsistent + # and was patched in https://bugs.python.org/pull_request4672. + df1 = DataFrame([None, 1]) + expected1 = """\ +"" +1.0 +""" + with tm.ensure_clean('test.csv') as path: + df1.to_csv(path, header=None, index=None) + with open(path, 'r') as f: + assert f.read() == expected1 + + df2 = DataFrame([1, None]) + expected2 = """\ +1.0 +"" +""" + with tm.ensure_clean('test.csv') as path: + df2.to_csv(path, header=None, index=None) + with open(path, 'r') as f: + assert f.read() == expected2 + def test_to_csv_defualt_encoding(self): # GH17097 df = DataFrame({'col': [u"AAAAA", u"ÄÄÄÄÄ", u"ßßßßß", u"聞聞聞聞聞"]}) From 7a5634e79f0b42ddbf602720dc184e9ce69f929c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 11 Feb 2018 23:24:34 +0100 Subject: [PATCH 098/214] TST: set multi_statement flag for pymysql tests (#19619) * Revert "CI: pin pymysql<0.8.0 (#19461)" This reverts commit 44bbd5a4d33643c9270bbefd7419f45aecaa4667. * Enable multi-statements for pymysql connection --- ci/requirements-3.6.run | 2 +- pandas/tests/io/test_sql.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ci/requirements-3.6.run b/ci/requirements-3.6.run index e30461d06b8ea..822144a80bc9a 100644 --- a/ci/requirements-3.6.run +++ b/ci/requirements-3.6.run @@ -13,7 +13,7 @@ lxml html5lib jinja2 sqlalchemy -pymysql<0.8.0 +pymysql feather-format pyarrow psycopg2 diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 0cc4101cd6304..f3ab74d37a2bc 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1731,13 +1731,16 @@ class _TestMySQLAlchemy(object): @classmethod def connect(cls): url = 'mysql+{driver}://root@localhost/pandas_nosetest' - return sqlalchemy.create_engine(url.format(driver=cls.driver)) + return sqlalchemy.create_engine(url.format(driver=cls.driver), + connect_args=cls.connect_args) @classmethod def setup_driver(cls): try: import pymysql # noqa cls.driver = 'pymysql' + from pymysql.constants import CLIENT + cls.connect_args = {'client_flag': CLIENT.MULTI_STATEMENTS} except ImportError: pytest.skip('pymysql not installed') From a2771089d87afe8104765f12a07a6cd125e532ac Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 12 Feb 2018 03:33:33 -0800 Subject: [PATCH 099/214] move array_to_datetime timests (#19640) --- pandas/tests/indexes/datetimes/test_tools.py | 186 +----------------- pandas/tests/tslibs/test_array_to_datetime.py | 145 ++++++++++++++ 2 files changed, 155 insertions(+), 176 deletions(-) create mode 100644 pandas/tests/tslibs/test_array_to_datetime.py diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index bd3fa5e73cd11..b5926933544e8 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -8,7 +8,7 @@ import dateutil import numpy as np from dateutil.parser import parse -from datetime import datetime, date, time, timedelta +from datetime import datetime, date, time from distutils.version import LooseVersion import pandas as pd @@ -19,7 +19,6 @@ from pandas.errors import OutOfBoundsDatetime from pandas.compat import lmap, PY3 -from pandas.compat.numpy import np_array_datetime64_compat from pandas.core.dtypes.common import is_datetime64_ns_dtype from pandas.util import testing as tm import pandas.util._test_decorators as td @@ -803,6 +802,15 @@ def 
test_dataframe_dtypes(self, cache): class TestToDatetimeMisc(object): + def test_to_datetime_barely_out_of_bounds(self): + # GH#19529 + # GH#19382 close enough to bounds that dropping nanos would result + # in an in-bounds datetime + arr = np.array(['2262-04-11 23:47:16.854775808'], dtype=object) + + with pytest.raises(OutOfBoundsDatetime): + to_datetime(arr) + @pytest.mark.parametrize('cache', [True, False]) def test_to_datetime_iso8601(self, cache): result = to_datetime(["2012-01-01 00:00:00"], cache=cache) @@ -1464,180 +1472,6 @@ def test_parsers_timezone_minute_offsets_roundtrip(self, cache): converted_time = dt_time.tz_localize('UTC').tz_convert(tz) assert dt_string_repr == repr(converted_time) - def test_parsers_iso8601(self): - # GH 12060 - # test only the iso parser - flexibility to different - # separators and leadings 0s - # Timestamp construction falls back to dateutil - cases = {'2011-01-02': datetime(2011, 1, 2), - '2011-1-2': datetime(2011, 1, 2), - '2011-01': datetime(2011, 1, 1), - '2011-1': datetime(2011, 1, 1), - '2011 01 02': datetime(2011, 1, 2), - '2011.01.02': datetime(2011, 1, 2), - '2011/01/02': datetime(2011, 1, 2), - '2011\\01\\02': datetime(2011, 1, 2), - '2013-01-01 05:30:00': datetime(2013, 1, 1, 5, 30), - '2013-1-1 5:30:00': datetime(2013, 1, 1, 5, 30)} - for date_str, exp in compat.iteritems(cases): - actual = tslib._test_parse_iso8601(date_str) - assert actual == exp - - # separators must all match - YYYYMM not valid - invalid_cases = ['2011-01/02', '2011^11^11', - '201401', '201111', '200101', - # mixed separated and unseparated - '2005-0101', '200501-01', - '20010101 12:3456', '20010101 1234:56', - # HHMMSS must have two digits in each component - # if unseparated - '20010101 1', '20010101 123', '20010101 12345', - '20010101 12345Z', - # wrong separator for HHMMSS - '2001-01-01 12-34-56'] - for date_str in invalid_cases: - with pytest.raises(ValueError): - tslib._test_parse_iso8601(date_str) - # If no ValueError raised, let me know which case failed. 
- raise Exception(date_str) - - -class TestArrayToDatetime(object): - def test_coerce_out_of_bounds_utc(self): - # GH#19612 - ts = Timestamp('1900-01-01', tz='US/Pacific') - dt = ts.to_pydatetime() - timedelta(days=365 * 300) # ~1600AD - arr = np.array([dt]) - result = tslib.array_to_datetime(arr, utc=True, errors='coerce') - expected = np.array(['NaT'], dtype='datetime64[ns]') - tm.assert_numpy_array_equal(result, expected) - - def test_parsing_valid_dates(self): - arr = np.array(['01-01-2013', '01-02-2013'], dtype=object) - tm.assert_numpy_array_equal( - tslib.array_to_datetime(arr), - np_array_datetime64_compat( - [ - '2013-01-01T00:00:00.000000000-0000', - '2013-01-02T00:00:00.000000000-0000' - ], - dtype='M8[ns]' - ) - ) - - arr = np.array(['Mon Sep 16 2013', 'Tue Sep 17 2013'], dtype=object) - tm.assert_numpy_array_equal( - tslib.array_to_datetime(arr), - np_array_datetime64_compat( - [ - '2013-09-16T00:00:00.000000000-0000', - '2013-09-17T00:00:00.000000000-0000' - ], - dtype='M8[ns]' - ) - ) - - def test_parsing_timezone_offsets(self): - # All of these datetime strings with offsets are equivalent - # to the same datetime after the timezone offset is added - dt_strings = [ - '01-01-2013 08:00:00+08:00', - '2013-01-01T08:00:00.000000000+0800', - '2012-12-31T16:00:00.000000000-0800', - '12-31-2012 23:00:00-01:00' - ] - - expected_output = tslib.array_to_datetime(np.array( - ['01-01-2013 00:00:00'], dtype=object)) - - for dt_string in dt_strings: - tm.assert_numpy_array_equal( - tslib.array_to_datetime( - np.array([dt_string], dtype=object) - ), - expected_output - ) - - def test_number_looking_strings_not_into_datetime(self): - # #4601 - # These strings don't look like datetimes so they shouldn't be - # attempted to be converted - arr = np.array(['-352.737091', '183.575577'], dtype=object) - tm.assert_numpy_array_equal( - tslib.array_to_datetime(arr, errors='ignore'), arr) - - arr = np.array(['1', '2', '3', '4', '5'], dtype=object) - tm.assert_numpy_array_equal( - tslib.array_to_datetime(arr, errors='ignore'), arr) - - def test_coercing_dates_outside_of_datetime64_ns_bounds(self): - invalid_dates = [ - date(1000, 1, 1), - datetime(1000, 1, 1), - '1000-01-01', - 'Jan 1, 1000', - np.datetime64('1000-01-01'), - ] - - for invalid_date in invalid_dates: - pytest.raises(ValueError, - tslib.array_to_datetime, - np.array([invalid_date], dtype='object'), - errors='raise', ) - tm.assert_numpy_array_equal( - tslib.array_to_datetime( - np.array([invalid_date], dtype='object'), - errors='coerce'), - np.array([tslib.iNaT], dtype='M8[ns]') - ) - - arr = np.array(['1/1/1000', '1/1/2000'], dtype=object) - tm.assert_numpy_array_equal( - tslib.array_to_datetime(arr, errors='coerce'), - np_array_datetime64_compat( - [ - tslib.iNaT, - '2000-01-01T00:00:00.000000000-0000' - ], - dtype='M8[ns]' - ) - ) - - def test_coerce_of_invalid_datetimes(self): - arr = np.array(['01-01-2013', 'not_a_date', '1'], dtype=object) - - # Without coercing, the presence of any invalid dates prevents - # any values from being converted - tm.assert_numpy_array_equal( - tslib.array_to_datetime(arr, errors='ignore'), arr) - - # With coercing, the invalid dates becomes iNaT - tm.assert_numpy_array_equal( - tslib.array_to_datetime(arr, errors='coerce'), - np_array_datetime64_compat( - [ - '2013-01-01T00:00:00.000000000-0000', - tslib.iNaT, - tslib.iNaT - ], - dtype='M8[ns]' - ) - ) - - def test_to_datetime_barely_out_of_bounds(self): - # GH#19529 - # GH#19382 close enough to bounds that dropping nanos would result - # in an 
in-bounds datetime - arr = np.array(['2262-04-11 23:47:16.854775808'], dtype=object) - - with pytest.raises(OutOfBoundsDatetime): - to_datetime(arr) - - with pytest.raises(OutOfBoundsDatetime): - # Essentially the same as above, but more directly calling - # the relevant function - tslib.array_to_datetime(arr) - def test_normalize_date(): value = date(2012, 9, 7) diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py new file mode 100644 index 0000000000000..eb77e52e7c91d --- /dev/null +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -0,0 +1,145 @@ +# -*- coding: utf-8 -*- +from datetime import datetime, date + +import numpy as np +import pytest + +from pandas._libs import tslib +from pandas.compat.numpy import np_array_datetime64_compat +import pandas.util.testing as tm + + +class TestParseISO8601(object): + @pytest.mark.parametrize('date_str, exp', [ + ('2011-01-02', datetime(2011, 1, 2)), + ('2011-1-2', datetime(2011, 1, 2)), + ('2011-01', datetime(2011, 1, 1)), + ('2011-1', datetime(2011, 1, 1)), + ('2011 01 02', datetime(2011, 1, 2)), + ('2011.01.02', datetime(2011, 1, 2)), + ('2011/01/02', datetime(2011, 1, 2)), + ('2011\\01\\02', datetime(2011, 1, 2)), + ('2013-01-01 05:30:00', datetime(2013, 1, 1, 5, 30)), + ('2013-1-1 5:30:00', datetime(2013, 1, 1, 5, 30))]) + def test_parsers_iso8601(self, date_str, exp): + # GH#12060 + # test only the iso parser - flexibility to different + # separators and leadings 0s + # Timestamp construction falls back to dateutil + actual = tslib._test_parse_iso8601(date_str) + assert actual == exp + + @pytest.mark.parametrize( + 'date_str', + ['2011-01/02', '2011^11^11', + '201401', '201111', '200101', + # mixed separated and unseparated + '2005-0101', '200501-01', + '20010101 12:3456', + '20010101 1234:56', + # HHMMSS must have two digits in + # each component if unseparated + '20010101 1', '20010101 123', + '20010101 12345', '20010101 12345Z', + # wrong separator for HHMMSS + '2001-01-01 12-34-56']) + def test_parsers_iso8601_invalid(self, date_str): + # separators must all match - YYYYMM not valid + with pytest.raises(ValueError): + tslib._test_parse_iso8601(date_str) + + +class TestArrayToDatetime(object): + def test_parsing_valid_dates(self): + arr = np.array(['01-01-2013', '01-02-2013'], dtype=object) + result = tslib.array_to_datetime(arr) + expected = ['2013-01-01T00:00:00.000000000-0000', + '2013-01-02T00:00:00.000000000-0000'] + tm.assert_numpy_array_equal( + result, + np_array_datetime64_compat(expected, dtype='M8[ns]')) + + arr = np.array(['Mon Sep 16 2013', 'Tue Sep 17 2013'], dtype=object) + result = tslib.array_to_datetime(arr) + expected = ['2013-09-16T00:00:00.000000000-0000', + '2013-09-17T00:00:00.000000000-0000'] + tm.assert_numpy_array_equal( + result, + np_array_datetime64_compat(expected, dtype='M8[ns]')) + + @pytest.mark.parametrize('dt_string', [ + '01-01-2013 08:00:00+08:00', + '2013-01-01T08:00:00.000000000+0800', + '2012-12-31T16:00:00.000000000-0800', + '12-31-2012 23:00:00-01:00']) + def test_parsing_timezone_offsets(self, dt_string): + # All of these datetime strings with offsets are equivalent + # to the same datetime after the timezone offset is added + arr = np.array(['01-01-2013 00:00:00'], dtype=object) + expected = tslib.array_to_datetime(arr) + + arr = np.array([dt_string], dtype=object) + result = tslib.array_to_datetime(arr) + tm.assert_numpy_array_equal(result, expected) + + def test_number_looking_strings_not_into_datetime(self): + # GH#4601 + # These 
strings don't look like datetimes so they shouldn't be + # attempted to be converted + arr = np.array(['-352.737091', '183.575577'], dtype=object) + result = tslib.array_to_datetime(arr, errors='ignore') + tm.assert_numpy_array_equal(result, arr) + + arr = np.array(['1', '2', '3', '4', '5'], dtype=object) + result = tslib.array_to_datetime(arr, errors='ignore') + tm.assert_numpy_array_equal(result, arr) + + @pytest.mark.parametrize('invalid_date', [ + date(1000, 1, 1), + datetime(1000, 1, 1), + '1000-01-01', + 'Jan 1, 1000', + np.datetime64('1000-01-01')]) + def test_coerce_outside_ns_bounds(self, invalid_date): + arr = np.array([invalid_date], dtype='object') + with pytest.raises(ValueError): + tslib.array_to_datetime(arr, errors='raise') + + result = tslib.array_to_datetime(arr, errors='coerce') + expected = np.array([tslib.iNaT], dtype='M8[ns]') + tm.assert_numpy_array_equal(result, expected) + + def test_coerce_outside_ns_bounds_one_valid(self): + arr = np.array(['1/1/1000', '1/1/2000'], dtype=object) + result = tslib.array_to_datetime(arr, errors='coerce') + expected = [tslib.iNaT, + '2000-01-01T00:00:00.000000000-0000'] + tm.assert_numpy_array_equal( + result, + np_array_datetime64_compat(expected, dtype='M8[ns]')) + + def test_coerce_of_invalid_datetimes(self): + arr = np.array(['01-01-2013', 'not_a_date', '1'], dtype=object) + + # Without coercing, the presence of any invalid dates prevents + # any values from being converted + result = tslib.array_to_datetime(arr, errors='ignore') + tm.assert_numpy_array_equal(result, arr) + + # With coercing, the invalid dates becomes iNaT + result = tslib.array_to_datetime(arr, errors='coerce') + expected = ['2013-01-01T00:00:00.000000000-0000', + tslib.iNaT, + tslib.iNaT] + + tm.assert_numpy_array_equal( + result, + np_array_datetime64_compat(expected, dtype='M8[ns]')) + + def test_to_datetime_barely_out_of_bounds(self): + # GH#19529 + # GH#19382 close enough to bounds that dropping nanos would result + # in an in-bounds datetime + arr = np.array(['2262-04-11 23:47:16.854775808'], dtype=object) + with pytest.raises(tslib.OutOfBoundsDatetime): + tslib.array_to_datetime(arr) From 569bc7a35c06ef8eb9a121ca983ba8951399a124 Mon Sep 17 00:00:00 2001 From: Matt Kirk Date: Mon, 12 Feb 2018 19:06:11 +0700 Subject: [PATCH 100/214] BUG: assign doesnt cast SparseDataFrame to DataFrame (#19178) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/sparse/array.py | 9 +++++---- pandas/tests/sparse/frame/test_frame.py | 11 +++++++++++ pandas/tests/sparse/test_array.py | 15 +++++++++++++++ 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index acab9d0bbebf8..72f63a4da0f4d 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -822,6 +822,7 @@ Sparse - Bug in which creating a ``SparseDataFrame`` from a dense ``Series`` or an unsupported type raised an uncontrolled exception (:issue:`19374`) - Bug in :class:`SparseDataFrame.to_csv` causing exception (:issue:`19384`) - Bug in :class:`SparseSeries.memory_usage` which caused segfault by accessing non sparse elements (:issue:`19368`) +- Bug in constructing a ``SparseArray``: if ``data`` is a scalar and ``index`` is defined it will coerce to ``float64`` regardless of scalar's dtype. 
(:issue:`19163`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index 65aefd9fb8c0a..3cbae717d0e07 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -26,7 +26,8 @@ is_scalar, is_dtype_equal) from pandas.core.dtypes.cast import ( maybe_convert_platform, maybe_promote, - astype_nansafe, find_common_type) + astype_nansafe, find_common_type, infer_dtype_from_scalar, + construct_1d_arraylike_from_scalar) from pandas.core.dtypes.missing import isna, notna, na_value_for_dtype import pandas._libs.sparse as splib @@ -162,9 +163,9 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer', data = np.nan if not is_scalar(data): raise Exception("must only pass scalars with an index ") - values = np.empty(len(index), dtype='float64') - values.fill(data) - data = values + dtype = infer_dtype_from_scalar(data)[0] + data = construct_1d_arraylike_from_scalar( + data, len(index), dtype) if isinstance(data, ABCSparseSeries): data = data.values diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 29fad3c8eefaf..0e8b2161cafc4 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -1257,3 +1257,14 @@ def test_quantile_multi(self): tm.assert_frame_equal(result, dense_expected) tm.assert_sp_frame_equal(result, sparse_expected) + + def test_assign_with_sparse_frame(self): + # GH 19163 + df = pd.DataFrame({"a": [1, 2, 3]}) + res = df.to_sparse(fill_value=False).assign(newcol=False) + exp = df.assign(newcol=False).to_sparse(fill_value=False) + + tm.assert_sp_frame_equal(res, exp) + + for column in res.columns: + assert type(res[column]) is SparseSeries diff --git a/pandas/tests/sparse/test_array.py b/pandas/tests/sparse/test_array.py index 8de93ff320961..6c0c83cf65ff7 100644 --- a/pandas/tests/sparse/test_array.py +++ b/pandas/tests/sparse/test_array.py @@ -113,6 +113,21 @@ def test_constructor_spindex_dtype(self): assert arr.dtype == np.int64 assert arr.fill_value == 0 + @pytest.mark.parametrize('scalar,dtype', [ + (False, bool), + (0.0, 'float64'), + (1, 'int64'), + ('z', 'object')]) + def test_scalar_with_index_infer_dtype(self, scalar, dtype): + # GH 19163 + arr = SparseArray(scalar, index=[1, 2, 3], fill_value=scalar) + exp = SparseArray([scalar, scalar, scalar], fill_value=scalar) + + tm.assert_sp_array_equal(arr, exp) + + assert arr.dtype == dtype + assert exp.dtype == dtype + def test_sparseseries_roundtrip(self): # GH 13999 for kind in ['integer', 'block']: From d6fe1940a5d7ea10624e1cc871a6eef13f32b382 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 12 Feb 2018 19:12:55 -0500 Subject: [PATCH 101/214] TST: placement of network error catching in s3 tests (#19645) --- pandas/io/common.py | 13 +++++-- pandas/io/excel.py | 2 +- pandas/io/json/json.py | 10 ++++- pandas/io/packers.py | 8 +++- pandas/io/parquet.py | 30 +++++++++------ pandas/io/parsers.py | 9 ++++- pandas/io/s3.py | 4 +- pandas/io/sas/sas7bdat.py | 2 +- pandas/io/sas/sas_xport.py | 3 +- pandas/io/stata.py | 2 +- pandas/tests/io/conftest.py | 53 +++++++++++++++----------- pandas/tests/io/json/test_pandas.py | 1 - pandas/tests/io/parser/test_network.py | 28 +++++++++----- pandas/tests/io/test_common.py | 8 +++- 14 files changed, 114 insertions(+), 59 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 4ba969f0abac4..e312181f08512 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -183,7 +183,10 @@ def 
get_filepath_or_buffer(filepath_or_buffer, encoding=None, Returns ------- - a filepath_ or buffer or S3File instance, the encoding, the compression + tuple of ({a filepath_ or buffer or S3File instance}, + encoding, str, + compression, str, + should_close, bool) """ filepath_or_buffer = _stringify_path(filepath_or_buffer) @@ -194,7 +197,8 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, # Override compression based on Content-Encoding header compression = 'gzip' reader = BytesIO(req.read()) - return reader, encoding, compression + req.close() + return reader, encoding, compression, True if is_s3_url(filepath_or_buffer): from pandas.io import s3 @@ -206,13 +210,13 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, if isinstance(filepath_or_buffer, (compat.string_types, compat.binary_type, mmap.mmap)): - return _expand_user(filepath_or_buffer), None, compression + return _expand_user(filepath_or_buffer), None, compression, False if not is_file_like(filepath_or_buffer): msg = "Invalid file path or buffer object type: {_type}" raise ValueError(msg.format(_type=type(filepath_or_buffer))) - return filepath_or_buffer, None, compression + return filepath_or_buffer, None, compression, False def file_path_to_url(path): @@ -309,6 +313,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, is_text : boolean, default True whether file/buffer is in text format (csv, json, etc.), or in binary mode (pickle, etc.) + Returns ------- f : file-like diff --git a/pandas/io/excel.py b/pandas/io/excel.py index b03987e933bff..0d3d4286f5a3c 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -381,7 +381,7 @@ def __init__(self, io, **kwds): if _is_url(self._io): io = _urlopen(self._io) elif not isinstance(self.io, (ExcelFile, xlrd.Book)): - io, _, _ = get_filepath_or_buffer(self._io) + io, _, _, _ = get_filepath_or_buffer(self._io) if engine == 'xlrd' and isinstance(io, xlrd.Book): self.book = io diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index e3a1321336fb3..24364fe07405e 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -404,7 +404,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, """ compression = _infer_compression(path_or_buf, compression) - filepath_or_buffer, _, compression = get_filepath_or_buffer( + filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( path_or_buf, encoding=encoding, compression=compression, ) @@ -419,7 +419,13 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, if chunksize: return json_reader - return json_reader.read() + result = json_reader.read() + if should_close: + try: + filepath_or_buffer.close() + except: # noqa: flake8 + pass + return result class JsonReader(BaseIterator): diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 9289853a1bbfd..d3e6f0cf4a1bc 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -180,7 +180,7 @@ def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs): obj : type of object stored in file """ - path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf) + path_or_buf, _, _, should_close = get_filepath_or_buffer(path_or_buf) if iterator: return Iterator(path_or_buf) @@ -188,6 +188,12 @@ def read(fh): l = list(unpack(fh, encoding=encoding, **kwargs)) if len(l) == 1: return l[0] + + if should_close: + try: + path_or_buf.close() + except: # noqa: flake8 + pass return l # see if we have an actual file diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 
6e1b6e14861c3..1c22a305c089d 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -107,7 +107,7 @@ def write(self, df, path, compression='snappy', self.validate_dataframe(df) if self._pyarrow_lt_070: self._validate_write_lt_070(df) - path, _, _ = get_filepath_or_buffer(path, mode='wb') + path, _, _, _ = get_filepath_or_buffer(path, mode='wb') if self._pyarrow_lt_060: table = self.api.Table.from_pandas(df, timestamps_to_ms=True) @@ -121,13 +121,21 @@ def write(self, df, path, compression='snappy', coerce_timestamps=coerce_timestamps, **kwargs) def read(self, path, columns=None, **kwargs): - path, _, _ = get_filepath_or_buffer(path) + path, _, _, should_close = get_filepath_or_buffer(path) if self._pyarrow_lt_070: - return self.api.parquet.read_pandas(path, columns=columns, - **kwargs).to_pandas() - kwargs['use_pandas_metadata'] = True - return self.api.parquet.read_table(path, columns=columns, - **kwargs).to_pandas() + result = self.api.parquet.read_pandas(path, columns=columns, + **kwargs).to_pandas() + else: + kwargs['use_pandas_metadata'] = True + result = self.api.parquet.read_table(path, columns=columns, + **kwargs).to_pandas() + if should_close: + try: + path.close() + except: # noqa: flake8 + pass + + return result def _validate_write_lt_070(self, df): # Compatibility shim for pyarrow < 0.7.0 @@ -199,11 +207,11 @@ def write(self, df, path, compression='snappy', **kwargs): # path is s3:// so we need to open the s3file in 'wb' mode. # TODO: Support 'ab' - path, _, _ = get_filepath_or_buffer(path, mode='wb') + path, _, _, _ = get_filepath_or_buffer(path, mode='wb') # And pass the opened s3file to the fastparquet internal impl. kwargs['open_with'] = lambda path, _: path else: - path, _, _ = get_filepath_or_buffer(path) + path, _, _, _ = get_filepath_or_buffer(path) with catch_warnings(record=True): self.api.write(path, df, @@ -214,13 +222,13 @@ def read(self, path, columns=None, **kwargs): # When path is s3:// an S3File is returned. # We need to retain the original path(str) while also # pass the S3File().open function to fsatparquet impl. 
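+            # Note: get_filepath_or_buffer now returns a 4-tuple; its last
+            # element, ``should_close``, tells the caller whether the handle
+            # was opened here and therefore needs to be closed afterwards.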
- s3, _, _ = get_filepath_or_buffer(path) + s3, _, _, should_close = get_filepath_or_buffer(path) try: parquet_file = self.api.ParquetFile(path, open_with=s3.s3.open) finally: s3.close() else: - path, _, _ = get_filepath_or_buffer(path) + path, _, _, _ = get_filepath_or_buffer(path) parquet_file = self.api.ParquetFile(path) return parquet_file.to_pandas(columns=columns, **kwargs) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index af1441f4a0fc9..7ea6d321e0fdd 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -413,7 +413,7 @@ def _read(filepath_or_buffer, kwds): compression = kwds.get('compression') compression = _infer_compression(filepath_or_buffer, compression) - filepath_or_buffer, _, compression = get_filepath_or_buffer( + filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( filepath_or_buffer, encoding, compression) kwds['compression'] = compression @@ -439,6 +439,13 @@ def _read(filepath_or_buffer, kwds): data = parser.read(nrows) finally: parser.close() + + if should_close: + try: + filepath_or_buffer.close() + except: # noqa: flake8 + pass + return data diff --git a/pandas/io/s3.py b/pandas/io/s3.py index e2650e29c0db3..bd2286c5c8569 100644 --- a/pandas/io/s3.py +++ b/pandas/io/s3.py @@ -27,7 +27,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, fs = s3fs.S3FileSystem(anon=False) try: filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode) - except (OSError, NoCredentialsError): + except (compat.FileNotFoundError, NoCredentialsError): # boto3 has troubles when trying to access a public file # when credentialed... # An OSError is raised if you have credentials, but they @@ -36,4 +36,4 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, # for that bucket. 
fs = s3fs.S3FileSystem(anon=True) filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode) - return filepath_or_buffer, None, compression + return filepath_or_buffer, None, compression, True diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 26e39f0df8b29..806cbddaa2ee2 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -90,7 +90,7 @@ def __init__(self, path_or_buf, index=None, convert_dates=True, self._current_row_on_page_index = 0 self._current_row_in_file_index = 0 - self._path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf) + self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf) if isinstance(self._path_or_buf, compat.string_types): self._path_or_buf = open(self._path_or_buf, 'rb') self.handle = self._path_or_buf diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index c14524f7d7cd6..7994517b9f303 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -236,7 +236,8 @@ def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1', self._chunksize = chunksize if isinstance(filepath_or_buffer, str): - filepath_or_buffer, encoding, compression = get_filepath_or_buffer( + (filepath_or_buffer, encoding, + compression, should_close) = get_filepath_or_buffer( filepath_or_buffer, encoding=encoding) if isinstance(filepath_or_buffer, (str, compat.text_type, bytes)): diff --git a/pandas/io/stata.py b/pandas/io/stata.py index ee6975ea1d938..9646831cb612c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -988,7 +988,7 @@ def __init__(self, path_or_buf, convert_dates=True, self._native_byteorder = _set_endianness(sys.byteorder) path_or_buf = _stringify_path(path_or_buf) if isinstance(path_or_buf, str): - path_or_buf, encoding, _ = get_filepath_or_buffer( + path_or_buf, encoding, _, should_close = get_filepath_or_buffer( path_or_buf, encoding=self._default_encoding ) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 57e72da2fd3f4..8deb51e190bab 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -2,30 +2,34 @@ import pytest from pandas.io.parsers import read_table +from pandas.util import testing as tm -HERE = os.path.dirname(__file__) +@pytest.fixture +def parser_data(request): + return os.path.join(tm.get_data_path(), '..', 'parser', 'data') -@pytest.fixture(scope='module') -def tips_file(): + +@pytest.fixture +def tips_file(parser_data): """Path to the tips dataset""" - return os.path.join(HERE, 'parser', 'data', 'tips.csv') + return os.path.join(parser_data, 'tips.csv') -@pytest.fixture(scope='module') -def jsonl_file(): +@pytest.fixture +def jsonl_file(parser_data): """Path a JSONL dataset""" - return os.path.join(HERE, 'parser', 'data', 'items.jsonl') + return os.path.join(parser_data, 'items.jsonl') -@pytest.fixture(scope='module') -def salaries_table(): +@pytest.fixture +def salaries_table(parser_data): """DataFrame with the salaries dataset""" - path = os.path.join(HERE, 'parser', 'data', 'salaries.csv') + path = os.path.join(parser_data, 'salaries.csv') return read_table(path) -@pytest.fixture(scope='module') +@pytest.fixture def s3_resource(tips_file, jsonl_file): """Fixture for mocking S3 interaction. @@ -41,8 +45,8 @@ def s3_resource(tips_file, jsonl_file): is yielded by the fixture. 
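
    A minimal usage sketch (the test body below is illustrative only and
    assumes ``read_csv`` is imported from pandas)::

        def test_tips_from_mocked_s3(s3_resource):
            df = read_csv('s3://pandas-test/tips.csv')
            assert not df.empty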
""" pytest.importorskip('s3fs') + boto3 = pytest.importorskip('boto3') moto = pytest.importorskip('moto') - moto.mock_s3().start() test_s3_files = [ ('tips.csv', tips_file), @@ -58,17 +62,22 @@ def add_tips_files(bucket_name): Key=s3_key, Body=f) - boto3 = pytest.importorskip('boto3') - # see gh-16135 - bucket = 'pandas-test' + try: - conn = boto3.resource("s3", region_name="us-east-1") - conn.create_bucket(Bucket=bucket) - add_tips_files(bucket) + s3 = moto.mock_s3() + s3.start() - conn.create_bucket(Bucket='cant_get_it', ACL='private') - add_tips_files('cant_get_it') + # see gh-16135 + bucket = 'pandas-test' + conn = boto3.resource("s3", region_name="us-east-1") - yield conn + conn.create_bucket(Bucket=bucket) + add_tips_files(bucket) - moto.mock_s3().stop() + conn.create_bucket(Bucket='cant_get_it', ACL='private') + add_tips_files('cant_get_it') + yield conn + except: # noqa: flake8 + pytest.skip("failure to use s3 resource") + finally: + s3.stop() diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 10139eb07a925..a72744e08fa7c 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1039,7 +1039,6 @@ def test_read_inline_jsonl(self): assert_frame_equal(result, expected) def test_read_s3_jsonl(self, s3_resource): - pytest.importorskip('s3fs') # GH17200 result = read_json('s3n://pandas-test/items.jsonl', lines=True) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 10f6cef04b593..f16338fda6245 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -46,6 +46,7 @@ def check_compressed_urls(salaries_table, compression, extension, mode, class TestS3(object): + @tm.network def test_parse_public_s3_bucket(self): pytest.importorskip('s3fs') @@ -65,7 +66,8 @@ def test_parse_public_s3_bucket(self): assert not df.empty tm.assert_frame_equal(read_csv(tm.get_data_path('tips.csv')), df) - def test_parse_public_s3n_bucket(self, s3_resource): + @tm.network + def test_parse_public_s3n_bucket(self): # Read from AWS s3 as "s3n" URL df = read_csv('s3n://pandas-test/tips.csv', nrows=10) @@ -74,7 +76,8 @@ def test_parse_public_s3n_bucket(self, s3_resource): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')).iloc[:10], df) - def test_parse_public_s3a_bucket(self, s3_resource): + @tm.network + def test_parse_public_s3a_bucket(self): # Read from AWS s3 as "s3a" URL df = read_csv('s3a://pandas-test/tips.csv', nrows=10) assert isinstance(df, DataFrame) @@ -82,7 +85,8 @@ def test_parse_public_s3a_bucket(self, s3_resource): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')).iloc[:10], df) - def test_parse_public_s3_bucket_nrows(self, s3_resource): + @tm.network + def test_parse_public_s3_bucket_nrows(self): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, nrows=10, compression=comp) @@ -91,7 +95,8 @@ def test_parse_public_s3_bucket_nrows(self, s3_resource): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')).iloc[:10], df) - def test_parse_public_s3_bucket_chunked(self, s3_resource): + @tm.network + def test_parse_public_s3_bucket_chunked(self): # Read with a chunksize chunksize = 5 local_tips = read_csv(tm.get_data_path('tips.csv')) @@ -109,7 +114,8 @@ def test_parse_public_s3_bucket_chunked(self, s3_resource): chunksize * i_chunk: chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_chunked_python(self, 
s3_resource): + @tm.network + def test_parse_public_s3_bucket_chunked_python(self): # Read with a chunksize using the Python parser chunksize = 5 local_tips = read_csv(tm.get_data_path('tips.csv')) @@ -127,7 +133,8 @@ def test_parse_public_s3_bucket_chunked_python(self, s3_resource): chunksize * i_chunk: chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_python(self, s3_resource): + @tm.network + def test_parse_public_s3_bucket_python(self): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', compression=comp) @@ -136,7 +143,8 @@ def test_parse_public_s3_bucket_python(self, s3_resource): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')), df) - def test_infer_s3_compression(self, s3_resource): + @tm.network + def test_infer_s3_compression(self): for ext in ['', '.gz', '.bz2']: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', compression='infer') @@ -145,7 +153,8 @@ def test_infer_s3_compression(self, s3_resource): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')), df) - def test_parse_public_s3_bucket_nrows_python(self, s3_resource): + @tm.network + def test_parse_public_s3_bucket_nrows_python(self): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', nrows=10, compression=comp) @@ -154,7 +163,8 @@ def test_parse_public_s3_bucket_nrows_python(self, s3_resource): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')).iloc[:10], df) - def test_s3_fails(self, s3_resource): + @tm.network + def test_s3_fails(self): with pytest.raises(IOError): read_csv('s3://nyqpug/asdf.csv') diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index a0070dce6a7f1..a89156db38ae3 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -102,15 +102,19 @@ def test_infer_compression_from_path(self, extension, expected, path_type): def test_get_filepath_or_buffer_with_path(self): filename = '~/sometest' - filepath_or_buffer, _, _ = common.get_filepath_or_buffer(filename) + filepath_or_buffer, _, _, should_close = common.get_filepath_or_buffer( + filename) assert filepath_or_buffer != filename assert isabs(filepath_or_buffer) assert os.path.expanduser(filename) == filepath_or_buffer + assert not should_close def test_get_filepath_or_buffer_with_buffer(self): input_buffer = StringIO() - filepath_or_buffer, _, _ = common.get_filepath_or_buffer(input_buffer) + filepath_or_buffer, _, _, should_close = common.get_filepath_or_buffer( + input_buffer) assert filepath_or_buffer == input_buffer + assert not should_close def test_iterator(self): reader = read_csv(StringIO(self.data1), chunksize=1) From d9551c8ee2a09ccc4c39d2b661f199334162edb6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 12 Feb 2018 16:19:23 -0800 Subject: [PATCH 102/214] De-duplicate masking/fallback logic in ops (#19613) --- pandas/core/frame.py | 12 +---- pandas/core/ops.py | 109 +++++++++++++++++++++++++++++------------- pandas/core/series.py | 15 +----- 3 files changed, 78 insertions(+), 58 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 23579d84a3964..2782ee7b9d201 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3943,17 +3943,7 @@ def _combine_frame(self, other, func, fill_value=None, level=None): new_index, new_columns = this.index, this.columns def _arith_op(left, right): - if fill_value is not None: - 
left_mask = isna(left) - right_mask = isna(right) - left = left.copy() - right = right.copy() - - # one but not both - mask = left_mask ^ right_mask - left[left_mask & mask] = fill_value - right[right_mask & mask] = fill_value - + left, right = ops.fill_binop(left, right, fill_value) return func(left, right) if this._is_mixed_type or other._is_mixed_type: diff --git a/pandas/core/ops.py b/pandas/core/ops.py index effa35695fcd1..4c234ccb4dd47 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -398,6 +398,79 @@ def _make_flex_doc(op_name, typ): return doc +# ----------------------------------------------------------------------------- +# Masking NA values and fallbacks for operations numpy does not support + +def fill_binop(left, right, fill_value): + """ + If a non-None fill_value is given, replace null entries in left and right + with this value, but only in positions where _one_ of left/right is null, + not both. + + Parameters + ---------- + left : array-like + right : array-like + fill_value : object + + Returns + ------- + left : array-like + right : array-like + + Notes + ----- + Makes copies if fill_value is not None + """ + # TODO: can we make a no-copy implementation? + if fill_value is not None: + left_mask = isna(left) + right_mask = isna(right) + left = left.copy() + right = right.copy() + + # one but not both + mask = left_mask ^ right_mask + left[left_mask & mask] = fill_value + right[right_mask & mask] = fill_value + return left, right + + +def mask_cmp_op(x, y, op, allowed_types): + """ + Apply the function `op` to only non-null points in x and y. + + Parameters + ---------- + x : array-like + y : array-like + op : binary operation + allowed_types : class or tuple of classes + + Returns + ------- + result : ndarray[bool] + """ + # TODO: Can we make the allowed_types arg unnecessary? 
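+    # The approach below: ravel the inputs, apply ``op`` only at positions
+    # where ``x`` (and ``y``, when it is array-like) is non-null, fill the
+    # masked positions with True for ``ne`` and False for every other op,
+    # and finally reshape the flat boolean result back to ``x.shape``.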
+ xrav = x.ravel() + result = np.empty(x.size, dtype=bool) + if isinstance(y, allowed_types): + yrav = y.ravel() + mask = notna(xrav) & notna(yrav) + result[mask] = op(np.array(list(xrav[mask])), + np.array(list(yrav[mask]))) + else: + mask = notna(xrav) + result[mask] = op(np.array(list(xrav[mask])), y) + + if op == operator.ne: # pragma: no cover + np.putmask(result, ~mask, True) + else: + np.putmask(result, ~mask, False) + result = result.reshape(x.shape) + return result + + # ----------------------------------------------------------------------------- # Functions that add arithmetic methods to objects, given arithmetic factory # methods @@ -1127,23 +1200,7 @@ def na_op(x, y): with np.errstate(invalid='ignore'): result = op(x, y) except TypeError: - xrav = x.ravel() - result = np.empty(x.size, dtype=bool) - if isinstance(y, (np.ndarray, ABCSeries)): - yrav = y.ravel() - mask = notna(xrav) & notna(yrav) - result[mask] = op(np.array(list(xrav[mask])), - np.array(list(yrav[mask]))) - else: - mask = notna(xrav) - result[mask] = op(np.array(list(xrav[mask])), y) - - if op == operator.ne: # pragma: no cover - np.putmask(result, ~mask, True) - else: - np.putmask(result, ~mask, False) - result = result.reshape(x.shape) - + result = mask_cmp_op(x, y, op, (np.ndarray, ABCSeries)) return result @Appender('Wrapper for flexible comparison methods {name}' @@ -1221,23 +1278,7 @@ def na_op(x, y): try: result = expressions.evaluate(op, str_rep, x, y) except TypeError: - xrav = x.ravel() - result = np.empty(x.size, dtype=bool) - if isinstance(y, np.ndarray): - yrav = y.ravel() - mask = notna(xrav) & notna(yrav) - result[mask] = op(np.array(list(xrav[mask])), - np.array(list(yrav[mask]))) - else: - mask = notna(xrav) - result[mask] = op(np.array(list(xrav[mask])), y) - - if op == operator.ne: # pragma: no cover - np.putmask(result, ~mask, True) - else: - np.putmask(result, ~mask, False) - result = result.reshape(x.shape) - + result = mask_cmp_op(x, y, op, np.ndarray) return result @Appender('Wrapper for comparison method {name}'.format(name=name)) diff --git a/pandas/core/series.py b/pandas/core/series.py index e4b8979d6393a..655eaa5373f5a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1725,19 +1725,8 @@ def _binop(self, other, func, level=None, fill_value=None): copy=False) new_index = this.index - this_vals = this.values - other_vals = other.values - - if fill_value is not None: - this_mask = isna(this_vals) - other_mask = isna(other_vals) - this_vals = this_vals.copy() - other_vals = other_vals.copy() - - # one but not both - mask = this_mask ^ other_mask - this_vals[this_mask & mask] = fill_value - other_vals[other_mask & mask] = fill_value + this_vals, other_vals = ops.fill_binop(this.values, other.values, + fill_value) with np.errstate(all='ignore'): result = func(this_vals, other_vals) From df38f66b417b754afdd2b0e17282255bbf2c769e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 13 Feb 2018 08:50:54 -0600 Subject: [PATCH 103/214] REF: Internal / External values (#19558) * REF/Clean: Internal / External values * Move to index base * Cleanup unique handling * Simplify object concat * Use values for intersection I think eventually we'll want to ndarray_values for this, but it'll require a bit more work to support. Currently, using ndarary_values causes occasional failures on categorical. 
* hmm * Additional testing * More tests * ndarray_values * API: Default ExtensionArray.astype (cherry picked from commit 943a915562b72bed147c857de927afa0daf31c1a) (cherry picked from commit fbf0a0672380e210d3cb3c527fa8045a204d81be) * Simplify concat_as_object * Py2 compat (cherry picked from commit b20e12cae68dd86ff51597464045656763d369f7) * Set-ops ugliness * better docstrings * tolist * linting * Moved dtypes (cherry picked from commit d1362271bca8a7b183f3241e5c2f040c422118b8) * clean * cleanup * NumPy compat * Use base _values for CategoricalIndex * Update dev docs * cleanup * Linting * Precision in tests * Push _ndarray_values to ExtensionArray Now IndexOpsMixin._ndarray_values will dispatch all the way down to the EA. Subclasses like Categorical can override it as they see fit. * Clean up tolist * Move test locations * Fixed test * REF: Update per comments * lint * REF: Use _values for size and shape * PERF: Implement size, shape for IntervalIndex * PERF: Avoid materializing values for PeriodIndex shape, size * Cleanup * Override nbytes --- doc/source/internals.rst | 19 +++ pandas/core/arrays/base.py | 12 ++ pandas/core/arrays/categorical.py | 4 + pandas/core/base.py | 21 +++- pandas/core/dtypes/cast.py | 2 +- pandas/core/dtypes/common.py | 2 +- pandas/core/dtypes/concat.py | 6 +- pandas/core/indexes/base.py | 108 ++++++++++++++---- pandas/core/indexes/category.py | 9 +- pandas/core/indexes/datetimelike.py | 2 +- pandas/core/indexes/datetimes.py | 43 +++++++ pandas/core/indexes/interval.py | 10 ++ pandas/core/indexes/multi.py | 34 +++--- pandas/core/indexes/numeric.py | 2 +- pandas/core/indexes/period.py | 49 +++++--- pandas/io/pytables.py | 2 +- pandas/plotting/_converter.py | 6 +- pandas/tests/indexes/common.py | 6 +- .../tests/indexes/datetimes/test_datetime.py | 9 ++ .../tests/indexes/period/test_construction.py | 4 +- pandas/tests/indexes/period/test_period.py | 6 +- pandas/tests/indexes/period/test_tools.py | 2 +- pandas/tests/indexes/test_category.py | 8 ++ pandas/tests/indexes/test_multi.py | 47 ++++++++ pandas/tests/test_base.py | 58 +++++++++- 25 files changed, 386 insertions(+), 85 deletions(-) diff --git a/doc/source/internals.rst b/doc/source/internals.rst index ee4df879d9478..957f82fd9eba7 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -89,6 +89,25 @@ not check (or care) whether the levels themselves are sorted. Fortunately, the constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but if you compute the levels and labels yourself, please be careful. +Values +~~~~~~ + +Pandas extends NumPy's type system with custom types, like ``Categorical`` or +datetimes with a timezone, so we have multiple notions of "values". For 1-D +containers (``Index`` classes and ``Series``) we have the following convention: + +* ``cls._ndarray_values`` is *always* a NumPy ``ndarray``. Ideally, + ``_ndarray_values`` is cheap to compute. For example, for a ``Categorical``, + this returns the codes, not the array of objects. +* ``cls._values`` refers is the "best possible" array. This could be an + ``ndarray``, ``ExtensionArray``, or in ``Index`` subclass (note: we're in the + process of removing the index subclasses here so that it's always an + ``ndarray`` or ``ExtensionArray``). + +So, for example, ``Series[category]._values`` is a ``Categorical``, while +``Series[category]._ndarray_values`` is the underlying codes. + + .. 
_ref-subclassing-pandas: Subclassing pandas Data Structures diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 553e1e0ac2066..e618dc6b69b2d 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -266,3 +266,15 @@ def _can_hold_na(self): Setting this to false will optimize some operations like fillna. """ return True + + @property + def _ndarray_values(self): + # type: () -> np.ndarray + """Internal pandas method for lossy conversion to a NumPy ndarray. + + This method is not part of the pandas interface. + + The expectation is that this is cheap to compute, and is primarily + used for interacting with our indexers. + """ + return np.array(self) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 93250bdbb5054..bcf9cb7646704 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -410,6 +410,10 @@ def dtype(self): """The :class:`~pandas.api.types.CategoricalDtype` for this instance""" return self._dtype + @property + def _ndarray_values(self): + return self.codes + @property def _constructor(self): return Categorical diff --git a/pandas/core/base.py b/pandas/core/base.py index 3d8f5f265e3db..0ca029ffd4c25 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -13,7 +13,8 @@ is_list_like, is_scalar, is_datetimelike, - is_extension_type) + is_extension_type, + is_extension_array_dtype) from pandas.util._validators import validate_bool_kwarg @@ -738,7 +739,7 @@ def data(self): @property def itemsize(self): """ return the size of the dtype of the item of the underlying data """ - return self._values.itemsize + return self._ndarray_values.itemsize @property def nbytes(self): @@ -748,7 +749,7 @@ def nbytes(self): @property def strides(self): """ return the strides of the underlying data """ - return self._values.strides + return self._ndarray_values.strides @property def size(self): @@ -768,8 +769,17 @@ def base(self): return self.values.base @property - def _values(self): - """ the internal implementation """ + def _ndarray_values(self): + """The data as an ndarray, possibly losing information. + + The expectation is that this is cheap to compute, and is primarily + used for interacting with our indexers. + + - categorical -> codes + """ + # type: () -> np.ndarray + if is_extension_array_dtype(self): + return self.values._ndarray_values return self.values @property @@ -979,6 +989,7 @@ def unique(self): values = self._values if hasattr(values, 'unique'): + result = values.unique() else: from pandas.core.algorithms import unique1d diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index b2816343fc8eb..55919fb2bea0d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -927,7 +927,7 @@ def try_timedelta(v): # will try first with a string & object conversion from pandas import to_timedelta try: - return to_timedelta(v)._values.reshape(shape) + return to_timedelta(v)._ndarray_values.reshape(shape) except Exception: return v.reshape(shape) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index c66e7fcfc6978..c2b71bc316fe8 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1709,7 +1709,7 @@ def is_extension_array_dtype(arr_or_dtype): from pandas.core.arrays import ExtensionArray # we want to unpack series, anything else? 
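+    # Index objects are unpacked as well now, so that an Index whose
+    # ``._values`` is an ExtensionArray is also detected by this check.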
- if isinstance(arr_or_dtype, ABCSeries): + if isinstance(arr_or_dtype, (ABCIndexClass, ABCSeries)): arr_or_dtype = arr_or_dtype._values return isinstance(arr_or_dtype, (ExtensionDtype, ExtensionArray)) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index ddecbe85087d8..d306d0d78f1f4 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -488,12 +488,14 @@ def _concat_index_asobject(to_concat, name=None): concat all inputs as object. DatetimeIndex, TimedeltaIndex and PeriodIndex are converted to object dtype before concatenation """ + from pandas import Index + from pandas.core.arrays import ExtensionArray - klasses = ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex + klasses = (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex, + ExtensionArray) to_concat = [x.astype(object) if isinstance(x, klasses) else x for x in to_concat] - from pandas import Index self = to_concat[0] attribs = self._get_attributes_dict() attribs['name'] = name diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 15df77bf772dc..be7c1624936bf 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -31,12 +31,14 @@ is_object_dtype, is_categorical_dtype, is_interval_dtype, + is_period_dtype, is_bool, is_bool_dtype, is_signed_integer_dtype, is_unsigned_integer_dtype, is_integer_dtype, is_float_dtype, is_datetime64_any_dtype, + is_datetime64tz_dtype, is_timedelta64_dtype, needs_i8_conversion, is_iterator, is_list_like, @@ -412,7 +414,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): values = np.array(values, copy=False) if is_object_dtype(values): values = cls(values, name=name, dtype=dtype, - **kwargs)._values + **kwargs)._ndarray_values result = object.__new__(cls) result._data = values @@ -594,6 +596,40 @@ def values(self): """ return the underlying data as an ndarray """ return self._data.view(np.ndarray) + @property + def _values(self): + # type: () -> Union[ExtensionArray, Index] + # TODO(EA): remove index types as they become extension arrays + """The best array representation. + + This is an ndarray, ExtensionArray, or Index subclass. This differs + from ``_ndarray_values``, which always returns an ndarray. + + Both ``_values`` and ``_ndarray_values`` are consistent between + ``Series`` and ``Index``. + + It may differ from the public '.values' method. 
+ + index | values | _values | _ndarray_values | + ----------------- | -------------- -| ----------- | --------------- | + CategoricalIndex | Categorical | Categorical | codes | + DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | ndarray[M8ns] | + + For the following, the ``._values`` is currently ``ndarray[object]``, + but will soon be an ``ExtensionArray`` + + index | values | _values | _ndarray_values | + ----------------- | --------------- | ------------ | --------------- | + PeriodIndex | ndarray[object] | ndarray[obj] | ndarray[int] | + IntervalIndex | ndarray[object] | ndarray[obj] | ndarray[object] | + + See Also + -------- + values + _ndarray_values + """ + return self.values + def get_values(self): """ return the underlying data as an ndarray """ return self.values @@ -664,7 +700,7 @@ def ravel(self, order='C'): -------- numpy.ndarray.ravel """ - return self._values.ravel(order=order) + return self._ndarray_values.ravel(order=order) # construction helpers @classmethod @@ -1597,7 +1633,7 @@ def _constructor(self): @cache_readonly def _engine(self): # property, for now, slow to look up - return self._engine_type(lambda: self._values, len(self)) + return self._engine_type(lambda: self._ndarray_values, len(self)) def _validate_index_level(self, level): """ @@ -2228,27 +2264,37 @@ def union(self, other): other = other.astype('O') return this.union(other) + # TODO(EA): setops-refactor, clean all this up + if is_period_dtype(self) or is_datetime64tz_dtype(self): + lvals = self._ndarray_values + else: + lvals = self._values + if is_period_dtype(other) or is_datetime64tz_dtype(other): + rvals = other._ndarray_values + else: + rvals = other._values + if self.is_monotonic and other.is_monotonic: try: - result = self._outer_indexer(self._values, other._values)[0] + result = self._outer_indexer(lvals, rvals)[0] except TypeError: # incomparable objects - result = list(self._values) + result = list(lvals) # worth making this faster? 
a very unusual case - value_set = set(self._values) - result.extend([x for x in other._values if x not in value_set]) + value_set = set(lvals) + result.extend([x for x in rvals if x not in value_set]) else: indexer = self.get_indexer(other) indexer, = (indexer == -1).nonzero() if len(indexer) > 0: - other_diff = algos.take_nd(other._values, indexer, + other_diff = algos.take_nd(rvals, indexer, allow_fill=False) - result = _concat._concat_compat((self._values, other_diff)) + result = _concat._concat_compat((lvals, other_diff)) try: - self._values[0] < other_diff[0] + lvals[0] < other_diff[0] except TypeError as e: warnings.warn("%s, sort order is undefined for " "incomparable objects" % e, RuntimeWarning, @@ -2260,7 +2306,7 @@ def union(self, other): result.sort() else: - result = self._values + result = lvals try: result = np.sort(result) @@ -2311,20 +2357,30 @@ def intersection(self, other): other = other.astype('O') return this.intersection(other) + # TODO(EA): setops-refactor, clean all this up + if is_period_dtype(self): + lvals = self._ndarray_values + else: + lvals = self._values + if is_period_dtype(other): + rvals = other._ndarray_values + else: + rvals = other._values + if self.is_monotonic and other.is_monotonic: try: - result = self._inner_indexer(self._values, other._values)[0] + result = self._inner_indexer(lvals, rvals)[0] return self._wrap_union_result(other, result) except TypeError: pass try: - indexer = Index(other._values).get_indexer(self._values) + indexer = Index(rvals).get_indexer(lvals) indexer = indexer.take((indexer != -1).nonzero()[0]) except Exception: # duplicates indexer = algos.unique1d( - Index(other._values).get_indexer_non_unique(self._values)[0]) + Index(rvals).get_indexer_non_unique(lvals)[0]) indexer = indexer[indexer != -1] taken = other.take(indexer) @@ -2700,7 +2756,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): raise ValueError('limit argument only valid if doing pad, ' 'backfill or nearest reindexing') - indexer = self._engine.get_indexer(target._values) + indexer = self._engine.get_indexer(target._ndarray_values) return _ensure_platform_int(indexer) @@ -2716,12 +2772,13 @@ def _get_fill_indexer(self, target, method, limit=None, tolerance=None): if self.is_monotonic_increasing and target.is_monotonic_increasing: method = (self._engine.get_pad_indexer if method == 'pad' else self._engine.get_backfill_indexer) - indexer = method(target._values, limit) + indexer = method(target._ndarray_values, limit) else: indexer = self._get_fill_indexer_searchsorted(target, method, limit) if tolerance is not None: - indexer = self._filter_indexer_tolerance(target._values, indexer, + indexer = self._filter_indexer_tolerance(target._ndarray_values, + indexer, tolerance) return indexer @@ -2812,7 +2869,7 @@ def get_indexer_non_unique(self, target): self = Index(self.asi8) tgt_values = target.asi8 else: - tgt_values = target._values + tgt_values = target._ndarray_values indexer, missing = self._engine.get_indexer_non_unique(tgt_values) return _ensure_platform_int(indexer), missing @@ -3247,16 +3304,17 @@ def _join_multi(self, other, how, return_indexers=True): def _join_non_unique(self, other, how='left', return_indexers=False): from pandas.core.reshape.merge import _get_join_indexers - left_idx, right_idx = _get_join_indexers([self._values], - [other._values], how=how, + left_idx, right_idx = _get_join_indexers([self._ndarray_values], + [other._ndarray_values], + how=how, sort=True) left_idx = _ensure_platform_int(left_idx) right_idx = 
_ensure_platform_int(right_idx) - join_index = np.asarray(self._values.take(left_idx)) + join_index = np.asarray(self._ndarray_values.take(left_idx)) mask = left_idx == -1 - np.putmask(join_index, mask, other._values.take(right_idx)) + np.putmask(join_index, mask, other._ndarray_values.take(right_idx)) join_index = self._wrap_joined_index(join_index, other) @@ -3403,8 +3461,8 @@ def _join_monotonic(self, other, how='left', return_indexers=False): else: return ret_index - sv = self._values - ov = other._values + sv = self._ndarray_values + ov = other._ndarray_values if self.is_unique and other.is_unique: # We can perform much better than the general case @@ -3756,7 +3814,7 @@ def insert(self, loc, item): item = self._na_value _self = np.asarray(self) - item = self._coerce_scalar_to_index(item)._values + item = self._coerce_scalar_to_index(item)._ndarray_values idx = np.concatenate((_self[:loc], item, _self[loc:])) return self._shallow_copy_with_infer(idx) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 60f5552576ea1..a4d0f787cc6ec 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -293,6 +293,11 @@ def values(self): """ return the underlying data, which is a Categorical """ return self._data + @property + def itemsize(self): + # Size of the items in categories, not codes. + return self.values.itemsize + def get_values(self): """ return the underlying data as an ndarray """ return self._data.get_values() @@ -386,8 +391,8 @@ def is_monotonic_decreasing(self): def unique(self, level=None): if level is not None: self._validate_index_level(level) - result = base.IndexOpsMixin.unique(self) - # CategoricalIndex._shallow_copy uses keeps original categories + result = self.values.unique() + # CategoricalIndex._shallow_copy keeps original categories # and ordered if not otherwise specified return self._shallow_copy(result, categories=result.categories, ordered=result.ordered) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 4a526955d9bf4..c98f8ceea0ffa 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -376,7 +376,7 @@ def sort_values(self, return_indexer=False, ascending=True): sorted_index = self.take(_as) return sorted_index, _as else: - sorted_values = np.sort(self._values) + sorted_values = np.sort(self._ndarray_values) attribs = self._get_attributes_dict() freq = attribs['freq'] diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 61c941c3d2333..cc9ce1f3fd5eb 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -678,6 +678,15 @@ def _assert_tzawareness_compat(self, other): raise TypeError('Cannot compare tz-naive and tz-aware ' 'datetime-like objects') + @property + def _values(self): + # tz-naive -> ndarray + # tz-aware -> DatetimeIndex + if self.tz is not None: + return self + else: + return self.values + @property def tzinfo(self): """ @@ -685,6 +694,27 @@ def tzinfo(self): """ return self.tz + @property + def size(self): + # TODO: Remove this when we have a DatetimeTZArray + # Necessary to avoid recursion error since DTI._values is a DTI + # for TZ-aware + return self._ndarray_values.size + + @property + def shape(self): + # TODO: Remove this when we have a DatetimeTZArray + # Necessary to avoid recursion error since DTI._values is a DTI + # for TZ-aware + return self._ndarray_values.shape + + @property + def nbytes(self): + # TODO: Remove this when we have a 
DatetimeTZArray + # Necessary to avoid recursion error since DTI._values is a DTI + # for TZ-aware + return self._ndarray_values.nbytes + @cache_readonly def _timezone(self): """ Comparable timezone both for pytz / dateutil""" @@ -1086,6 +1116,19 @@ def snap(self, freq='S'): # we know it conforms; skip check return DatetimeIndex(snapped, freq=freq, verify_integrity=False) + def unique(self, level=None): + # Override here since IndexOpsMixin.unique uses self._values.unique + # For DatetimeIndex with TZ, that's a DatetimeIndex -> recursion error + # So we extract the tz-naive DatetimeIndex, unique that, and wrap the + # result with out TZ. + if self.tz is not None: + naive = type(self)(self._ndarray_values, copy=False) + else: + naive = self + result = super(DatetimeIndex, naive).unique(level=level) + return self._simple_new(result, name=self.name, tz=self.tz, + freq=self.freq) + def union(self, other): """ Specialized union for DatetimeIndex objects. If combine diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 3bf783b5a2faa..d431ea1e51e31 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -680,6 +680,16 @@ def length(self): 'e.g. Intervals with string endpoints') raise TypeError(msg) + @property + def size(self): + # Avoid materializing self.values + return self.left.size + + @property + def shape(self): + # Avoid materializing self.values + return self.left.shape + def __len__(self): return len(self.left) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 510f7245cebd8..94dbd8b884e47 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -799,9 +799,11 @@ def values(self): box = hasattr(lev, '_box_values') # Try to minimize boxing. if box and len(lev) > len(lab): - taken = lev._box_values(algos.take_1d(lev._values, lab)) + taken = lev._box_values(algos.take_1d(lev._ndarray_values, + lab)) elif box: - taken = algos.take_1d(lev._box_values(lev._values), lab, + taken = algos.take_1d(lev._box_values(lev._ndarray_values), + lab, fill_value=_get_na_value(lev.dtype.type)) else: taken = algos.take_1d(np.asarray(lev._values), lab) @@ -2410,7 +2412,7 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels): mapper = Series(indexer) indexer = labels.take(_ensure_platform_int(indexer)) result = Series(Index(indexer).isin(r).nonzero()[0]) - m = result.map(mapper)._values + m = result.map(mapper)._ndarray_values else: m = np.zeros(len(labels), dtype=bool) @@ -2505,6 +2507,7 @@ def get_locs(self, seq): MultiIndex.slice_locs : Get slice location given start label(s) and end label(s). 
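A hedged sanity check of the behaviour the tz-aware overrides above protect (mirroring the new ``test_unique`` test added later in this patch): ``unique`` on a tz-aware ``DatetimeIndex`` keeps its timezone instead of recursing through ``_values``.

.. code-block:: python

   import pandas as pd

   idx = pd.DatetimeIndex(['2017', '2017'], tz='US/Eastern')
   idx.unique()   # single-element DatetimeIndex, still tz='US/Eastern'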
""" + from .numeric import Int64Index # must be lexsorted to at least as many levels true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s] @@ -2530,7 +2533,6 @@ def _convert_to_indexer(r): "that is not the same length as the " "index") r = r.nonzero()[0] - from .numeric import Int64Index return Int64Index(r) def _update_indexer(idxr, indexer=indexer): @@ -2567,9 +2569,8 @@ def _update_indexer(idxr, indexer=indexer): if indexers is not None: indexer = _update_indexer(indexers, indexer=indexer) else: - from .numeric import Int64Index # no matches we are done - return Int64Index([])._values + return Int64Index([])._ndarray_values elif com.is_null_slice(k): # empty slice @@ -2589,8 +2590,8 @@ def _update_indexer(idxr, indexer=indexer): # empty indexer if indexer is None: - return Int64Index([])._values - return indexer._values + return Int64Index([])._ndarray_values + return indexer._ndarray_values def truncate(self, before=None, after=None): """ @@ -2639,7 +2640,7 @@ def equals(self, other): if not isinstance(other, MultiIndex): other_vals = com._values_from_object(_ensure_index(other)) - return array_equivalent(self._values, other_vals) + return array_equivalent(self._ndarray_values, other_vals) if self.nlevels != other.nlevels: return False @@ -2655,8 +2656,9 @@ def equals(self, other): olabels = other.labels[i] olabels = olabels[olabels != -1] - ovalues = algos.take_nd(np.asarray(other.levels[i]._values), - olabels, allow_fill=False) + ovalues = algos.take_nd( + np.asarray(other.levels[i]._values), + olabels, allow_fill=False) # since we use NaT both datetime64 and timedelta64 # we can have a situation where a level is typed say @@ -2704,7 +2706,8 @@ def union(self, other): if len(other) == 0 or self.equals(other): return self - uniq_tuples = lib.fast_unique_multiple([self._values, other._values]) + uniq_tuples = lib.fast_unique_multiple([self._ndarray_values, + other._ndarray_values]) return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0, names=result_names) @@ -2726,8 +2729,8 @@ def intersection(self, other): if self.equals(other): return self - self_tuples = self._values - other_tuples = other._values + self_tuples = self._ndarray_values + other_tuples = other._ndarray_values uniq_tuples = sorted(set(self_tuples) & set(other_tuples)) if len(uniq_tuples) == 0: return MultiIndex(levels=[[]] * self.nlevels, @@ -2756,7 +2759,8 @@ def difference(self, other): labels=[[]] * self.nlevels, names=result_names, verify_integrity=False) - difference = sorted(set(self._values) - set(other._values)) + difference = sorted(set(self._ndarray_values) - + set(other._ndarray_values)) if len(difference) == 0: return MultiIndex(levels=[[]] * self.nlevels, diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index b02aee0495d8c..a4558116bfa63 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -378,7 +378,7 @@ def equals(self, other): if (not is_dtype_equal(self.dtype, other.dtype) or self.shape != other.shape): return False - left, right = self._values, other._values + left, right = self._ndarray_values, other._ndarray_values return ((left == right) | (self._isnan & other._isnan)).all() except (TypeError, ValueError): return False diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 1f8542ed5ee60..8f2d7d382a16e 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -54,7 +54,7 @@ def _field_accessor(name, alias, docstring=None): def f(self): base, mult = _gfc(self.freq) - 
result = get_period_field_arr(alias, self._values, base) + result = get_period_field_arr(alias, self._ndarray_values, base) return Index(result, name=self.name) f.__name__ = name f.__doc__ = docstring @@ -82,7 +82,7 @@ def _period_index_cmp(opname, cls, nat_result=False): def wrapper(self, other): if isinstance(other, Period): - func = getattr(self._values, opname) + func = getattr(self._ndarray_values, opname) other_base, _ = _gfc(other.freq) if other.freq != self.freq: msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) @@ -94,7 +94,8 @@ def wrapper(self, other): msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - result = getattr(self._values, opname)(other._values) + op = getattr(self._ndarray_values, opname) + result = op(other._ndarray_values) mask = self._isnan | other._isnan if mask.any(): @@ -102,11 +103,11 @@ def wrapper(self, other): return result elif other is tslib.NaT: - result = np.empty(len(self._values), dtype=bool) + result = np.empty(len(self._ndarray_values), dtype=bool) result.fill(nat_result) else: other = Period(other, freq=self.freq) - func = getattr(self._values, opname) + func = getattr(self._ndarray_values, opname) result = func(other.ordinal) if self.hasnans: @@ -275,11 +276,11 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, if isinstance(data, PeriodIndex): if freq is None or freq == data.freq: # no freq change freq = data.freq - data = data._values + data = data._ndarray_values else: base1, _ = _gfc(data.freq) base2, _ = _gfc(freq) - data = period.period_asfreq_arr(data._values, + data = period.period_asfreq_arr(data._ndarray_values, base1, base2, 1) return cls._simple_new(data, name=name, freq=freq) @@ -374,7 +375,7 @@ def _shallow_copy(self, values=None, freq=None, **kwargs): if freq is None: freq = self.freq if values is None: - values = self._values + values = self._ndarray_values return super(PeriodIndex, self)._shallow_copy(values=values, freq=freq, **kwargs) @@ -407,7 +408,7 @@ def __contains__(self, key): @property def asi8(self): - return self._values.view('i8') + return self._ndarray_values.view('i8') @cache_readonly def _int64index(self): @@ -418,7 +419,8 @@ def values(self): return self.astype(object).values @property - def _values(self): + def _ndarray_values(self): + # Ordinals return self._data def __array__(self, dtype=None): @@ -475,6 +477,16 @@ def _to_embed(self, keep_tz=False, dtype=None): return self.astype(object).values + @property + def size(self): + # Avoid materializing self._values + return self._ndarray_values.size + + @property + def shape(self): + # Avoid materializing self._values + return self._ndarray_values.shape + @property def _formatter_func(self): return lambda x: "'%s'" % x @@ -489,13 +501,15 @@ def asof_locs(self, where, mask): if isinstance(where_idx, DatetimeIndex): where_idx = PeriodIndex(where_idx.values, freq=self.freq) - locs = self._values[mask].searchsorted(where_idx._values, side='right') + locs = self._ndarray_values[mask].searchsorted( + where_idx._ndarray_values, side='right') locs = np.where(locs > 0, locs - 1, 0) result = np.arange(len(self))[mask].take(locs) first = mask.argmax() - result[(locs == 0) & (where_idx._values < self._values[first])] = -1 + result[(locs == 0) & (where_idx._ndarray_values < + self._ndarray_values[first])] = -1 return result @@ -523,7 +537,8 @@ def searchsorted(self, value, side='left', sorter=None): elif isinstance(value, compat.string_types): value = Period(value, freq=self.freq).ordinal - 
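To make the comparison wrapper above concrete (an illustrative example, not part of the patch): element-wise comparisons go through the int64 ordinals, and a frequency mismatch is rejected up front.

.. code-block:: python

   import pandas as pd

   pidx = pd.period_range('2011-01', periods=3, freq='M')
   pidx == pd.Period('2011-02', freq='M')       # array([False, True, False])
   # pidx == pd.Period('2011-02-01', freq='D')  # raises IncompatibleFrequency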
return self._values.searchsorted(value, side=side, sorter=sorter) + return self._ndarray_values.searchsorted(value, side=side, + sorter=sorter) @property def is_all_dates(self): @@ -664,7 +679,7 @@ def to_timestamp(self, freq=None, how='start'): base, mult = _gfc(freq) new_data = self.asfreq(freq, how) - new_data = period.periodarr_to_dt64arr(new_data._values, base) + new_data = period.periodarr_to_dt64arr(new_data._ndarray_values, base) return DatetimeIndex(new_data, freq='infer', name=self.name) def _maybe_convert_timedelta(self, other): @@ -744,7 +759,7 @@ def shift(self, n): ------- shifted : PeriodIndex """ - values = self._values + n * self.freq.n + values = self._ndarray_values + n * self.freq.n if self.hasnans: values[self._isnan] = tslib.iNaT return self._shallow_copy(values=values) @@ -775,7 +790,7 @@ def get_value(self, series, key): grp = resolution.Resolution.get_freq_group(reso) freqn = resolution.get_freq_group(self.freq) - vals = self._values + vals = self._ndarray_values # if our data is higher resolution than requested key, slice if grp < freqn: @@ -786,7 +801,7 @@ def get_value(self, series, key): if ord2 < vals[0] or ord1 > vals[-1]: raise KeyError(key) - pos = np.searchsorted(self._values, [ord1, ord2]) + pos = np.searchsorted(self._ndarray_values, [ord1, ord2]) key = slice(pos[0], pos[1] + 1) return series[key] elif grp == freqn: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 0d833807602e1..2437b7d396e84 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4430,7 +4430,7 @@ def _convert_index(index, encoding=None, format_type=None): elif isinstance(index, (Int64Index, PeriodIndex)): atom = _tables().Int64Col() # avoid to store ndarray of Period objects - return IndexCol(index._values, 'integer', atom, + return IndexCol(index._ndarray_values, 'integer', atom, freq=getattr(index, 'freq', None), index_name=index_name) diff --git a/pandas/plotting/_converter.py b/pandas/plotting/_converter.py index 07163615c6ba4..9ca06475290e4 100644 --- a/pandas/plotting/_converter.py +++ b/pandas/plotting/_converter.py @@ -249,11 +249,11 @@ def _convert_1d(values, units, axis): is_float(values)): return get_datevalue(values, axis.freq) if isinstance(values, PeriodIndex): - return values.asfreq(axis.freq)._values + return values.asfreq(axis.freq)._ndarray_values if isinstance(values, Index): return values.map(lambda x: get_datevalue(x, axis.freq)) if is_period_arraylike(values): - return PeriodIndex(values, freq=axis.freq)._values + return PeriodIndex(values, freq=axis.freq)._ndarray_values if isinstance(values, (list, tuple, np.ndarray, Index)): return [get_datevalue(x, axis.freq) for x in values] return values @@ -642,7 +642,7 @@ def _daily_finder(vmin, vmax, freq): info = np.zeros(span, dtype=[('val', np.int64), ('maj', bool), ('min', bool), ('fmt', '|S20')]) - info['val'][:] = dates_._values + info['val'][:] = dates_._ndarray_values info['fmt'][:] = '' info['maj'][[0, -1]] = True # .. 
and set some shortcuts diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 8948c5f79900d..2d8d70aa2ac84 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -314,7 +314,8 @@ def test_ensure_copied_data(self): # .values an object array of Period, thus copied result = index_type(ordinal=index.asi8, copy=False, **init_kwargs) - tm.assert_numpy_array_equal(index._values, result._values, + tm.assert_numpy_array_equal(index._ndarray_values, + result._ndarray_values, check_same='same') elif isinstance(index, IntervalIndex): # checked in test_interval.py @@ -323,7 +324,8 @@ def test_ensure_copied_data(self): result = index_type(index.values, copy=False, **init_kwargs) tm.assert_numpy_array_equal(index.values, result.values, check_same='same') - tm.assert_numpy_array_equal(index._values, result._values, + tm.assert_numpy_array_equal(index._ndarray_values, + result._ndarray_values, check_same='same') def test_copy_and_deepcopy(self, indices): diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index a75ace2933b71..05678b0c8dd45 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -469,3 +469,12 @@ def test_factorize_dst(self): arr, res = obj.factorize() tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) tm.assert_index_equal(res, idx) + + @pytest.mark.parametrize('arr, expected', [ + (pd.DatetimeIndex(['2017', '2017']), pd.DatetimeIndex(['2017'])), + (pd.DatetimeIndex(['2017', '2017'], tz='US/Eastern'), + pd.DatetimeIndex(['2017'], tz='US/Eastern')), + ]) + def test_unique(self, arr, expected): + result = arr.unique() + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/period/test_construction.py b/pandas/tests/indexes/period/test_construction.py index 639a9272c3808..eca80d17b1dc3 100644 --- a/pandas/tests/indexes/period/test_construction.py +++ b/pandas/tests/indexes/period/test_construction.py @@ -119,8 +119,8 @@ def test_constructor_fromarraylike(self): tm.assert_index_equal(PeriodIndex(idx.values), idx) tm.assert_index_equal(PeriodIndex(list(idx.values)), idx) - pytest.raises(ValueError, PeriodIndex, idx._values) - pytest.raises(ValueError, PeriodIndex, list(idx._values)) + pytest.raises(ValueError, PeriodIndex, idx._ndarray_values) + pytest.raises(ValueError, PeriodIndex, list(idx._ndarray_values)) pytest.raises(TypeError, PeriodIndex, data=Period('2007', freq='A')) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index f3469b829f8a3..b3f059018493c 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -205,7 +205,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.get_values(), exp) exp = np.array([], dtype=np.int64) - tm.assert_numpy_array_equal(idx._values, exp) + tm.assert_numpy_array_equal(idx._ndarray_values, exp) idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') @@ -213,7 +213,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.get_values(), exp) exp = np.array([492, -9223372036854775808], dtype=np.int64) - tm.assert_numpy_array_equal(idx._values, exp) + tm.assert_numpy_array_equal(idx._ndarray_values, exp) idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') @@ -222,7 +222,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.values, exp) 
tm.assert_numpy_array_equal(idx.get_values(), exp) exp = np.array([14975, -9223372036854775808], dtype=np.int64) - tm.assert_numpy_array_equal(idx._values, exp) + tm.assert_numpy_array_equal(idx._ndarray_values, exp) def test_period_index_length(self): pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py index f5e7c8269dc4f..97500f2f5ed95 100644 --- a/pandas/tests/indexes/period/test_tools.py +++ b/pandas/tests/indexes/period/test_tools.py @@ -20,7 +20,7 @@ class TestPeriodRepresentation(object): def _check_freq(self, freq, base_date): rng = PeriodIndex(start=base_date, periods=10, freq=freq) exp = np.arange(10, dtype=np.int64) - tm.assert_numpy_array_equal(rng._values, exp) + tm.assert_numpy_array_equal(rng.asi8, exp) def test_annual(self): diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index c2e40c79f8914..e9fddfde90348 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -353,6 +353,14 @@ def test_append(self): expected = Index(list('caaabbca')) tm.assert_index_equal(result, expected, exact=True) + def test_append_to_another(self): + # hits _concat_index_asobject + fst = Index(['a', 'b']) + snd = CategoricalIndex(['d', 'e']) + result = fst.append(snd) + expected = Index(['a', 'b', 'd', 'e']) + tm.assert_index_equal(result, expected) + def test_insert(self): ci = self.create_index() diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index e59456b8a2d5e..cd6a5c761d0c2 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -962,6 +962,53 @@ def test_values_boxed(self): # Check that code branches for boxed values produce identical results tm.assert_numpy_array_equal(result.values[:4], result[:4].values) + def test_values_multiindex_datetimeindex(self): + # Test to ensure we hit the boxing / nobox part of MI.values + ints = np.arange(10**18, 10**18 + 5) + naive = pd.DatetimeIndex(ints) + aware = pd.DatetimeIndex(ints, tz='US/Central') + + idx = pd.MultiIndex.from_arrays([naive, aware]) + result = idx.values + + outer = pd.DatetimeIndex([x[0] for x in result]) + tm.assert_index_equal(outer, naive) + + inner = pd.DatetimeIndex([x[1] for x in result]) + tm.assert_index_equal(inner, aware) + + # n_lev > n_lab + result = idx[:2].values + + outer = pd.DatetimeIndex([x[0] for x in result]) + tm.assert_index_equal(outer, naive[:2]) + + inner = pd.DatetimeIndex([x[1] for x in result]) + tm.assert_index_equal(inner, aware[:2]) + + def test_values_multiindex_periodindex(self): + # Test to ensure we hit the boxing / nobox part of MI.values + ints = np.arange(2007, 2012) + pidx = pd.PeriodIndex(ints, freq='D') + + idx = pd.MultiIndex.from_arrays([ints, pidx]) + result = idx.values + + outer = pd.Int64Index([x[0] for x in result]) + tm.assert_index_equal(outer, pd.Int64Index(ints)) + + inner = pd.PeriodIndex([x[1] for x in result]) + tm.assert_index_equal(inner, pidx) + + # n_lev > n_lab + result = idx[:2].values + + outer = pd.Int64Index([x[0] for x in result]) + tm.assert_index_equal(outer, pd.Int64Index(ints[:2])) + + inner = pd.PeriodIndex([x[1] for x in result]) + tm.assert_index_equal(inner, pidx[:2]) + def test_append(self): result = self.index[:3].append(self.index[3:]) assert result.equals(self.index) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index df2547fc7b0da..4b5ad336139b0 100644 --- 
a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -338,8 +338,9 @@ def test_ops(self): if not isinstance(o, PeriodIndex): expected = getattr(o.values, op)() else: - expected = pd.Period(ordinal=getattr(o._values, op)(), - freq=o.freq) + expected = pd.Period( + ordinal=getattr(o._ndarray_values, op)(), + freq=o.freq) try: assert result == expected except TypeError: @@ -450,7 +451,7 @@ def test_value_counts_unique_nunique_null(self): for orig in self.objs: o = orig.copy() klass = type(o) - values = o._values + values = o._ndarray_values if not self._allow_na_ops(o): continue @@ -1175,3 +1176,54 @@ def test_iter_box(self): assert isinstance(res, pd.Period) assert res.freq == 'M' assert res == exp + + +@pytest.mark.parametrize('array, expected_type, dtype', [ + (np.array([0, 1], dtype=np.int64), np.ndarray, 'int64'), + (np.array(['a', 'b']), np.ndarray, 'object'), + (pd.Categorical(['a', 'b']), pd.Categorical, 'category'), + (pd.DatetimeIndex(['2017', '2018']), np.ndarray, 'datetime64[ns]'), + (pd.DatetimeIndex(['2017', '2018'], tz="US/Central"), pd.DatetimeIndex, + 'datetime64[ns, US/Central]'), + (pd.TimedeltaIndex([10**10]), np.ndarray, 'm8[ns]'), + (pd.PeriodIndex([2018, 2019], freq='A'), np.ndarray, 'object'), + (pd.IntervalIndex.from_breaks([0, 1, 2]), np.ndarray, 'object'), +]) +def test_values_consistent(array, expected_type, dtype): + l_values = pd.Series(array)._values + r_values = pd.Index(array)._values + assert type(l_values) is expected_type + assert type(l_values) is type(r_values) + + if isinstance(l_values, np.ndarray): + tm.assert_numpy_array_equal(l_values, r_values) + elif isinstance(l_values, pd.Index): + tm.assert_index_equal(l_values, r_values) + elif pd.api.types.is_categorical(l_values): + tm.assert_categorical_equal(l_values, r_values) + else: + raise TypeError("Unexpected type {}".format(type(l_values))) + + assert l_values.dtype == dtype + assert r_values.dtype == dtype + + +@pytest.mark.parametrize('array, expected', [ + (np.array([0, 1], dtype=np.int64), np.array([0, 1], dtype=np.int64)), + (np.array(['0', '1']), np.array(['0', '1'], dtype=object)), + (pd.Categorical(['a', 'a']), np.array([0, 0], dtype='int8')), + (pd.DatetimeIndex(['2017-01-01T00:00:00']), + np.array(['2017-01-01T00:00:00'], dtype='M8[ns]')), + (pd.DatetimeIndex(['2017-01-01T00:00:00'], tz="US/Eastern"), + np.array(['2017-01-01T05:00:00'], dtype='M8[ns]')), + (pd.TimedeltaIndex([10**10]), np.array([10**10], dtype='m8[ns]')), + pytest.mark.xfail(reason='PeriodArray not implemented')(( + pd.PeriodIndex(['2017', '2018'], freq='D'), + np.array([17167, 17532]), + )), +]) +def test_ndarray_values(array, expected): + l_values = pd.Series(array)._ndarray_values + r_values = pd.Index(array)._ndarray_values + tm.assert_numpy_array_equal(l_values, r_values) + tm.assert_numpy_array_equal(l_values, expected) From 07137a5acdfc761c7cc30a081600e4c4f23c7d3d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 13 Feb 2018 20:01:42 +0100 Subject: [PATCH 104/214] DOC: ignore Panel deprecation warnings during doc build (#19663) --- doc/source/conf.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/source/conf.py b/doc/source/conf.py index c188f83f80250..7c4edd0486636 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -15,6 +15,8 @@ import re import inspect import importlib +import warnings + from pandas.compat import u, PY3 try: @@ -375,6 +377,13 @@ 'wiki': ('https://github.com/pandas-dev/pandas/wiki/%s', 'wiki ')} + +# ignore all deprecation warnings from Panel 
during doc build +# (to avoid the need to add :okwarning: in many places) +warnings.filterwarnings("ignore", message="\nPanel is deprecated", + category=FutureWarning) + + ipython_exec_lines = [ 'import numpy as np', 'import pandas as pd', From 8cace882612a6f51ed05ffec43763b8f7ffc110b Mon Sep 17 00:00:00 2001 From: Matthias Bussonnier Date: Tue, 13 Feb 2018 13:41:14 -0800 Subject: [PATCH 105/214] DOC: fix IPython spelling (#19683) It's upper case I and P (or full lower case), --- pandas/core/frame.py | 2 +- pandas/io/gbq.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2782ee7b9d201..bc045d74cee52 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1059,7 +1059,7 @@ def to_gbq(self, destination_table, project_id, chunksize=10000, private_key : str (optional) Service account private key in JSON format. Can be file path or string contents. This is useful for remote server - authentication (eg. jupyter iPython notebook on remote host) + authentication (eg. Jupyter/IPython notebook on remote host) """ from pandas.io import gbq diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index b452b0cf5ddd4..f9bc6ae1a5451 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -65,7 +65,7 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, private_key : str (optional) Service account private key in JSON format. Can be file path or string contents. This is useful for remote server - authentication (eg. jupyter iPython notebook on remote host) + authentication (eg. Jupyter/IPython notebook on remote host) dialect : {'legacy', 'standard'}, default 'legacy' 'legacy' : Use BigQuery's legacy SQL dialect. From 49812cf5c4e4907cbab220634baaa9f320f78d2a Mon Sep 17 00:00:00 2001 From: William Ayd Date: Tue, 13 Feb 2018 15:56:31 -0800 Subject: [PATCH 106/214] Explicitly set dtype of np.lexsort in group_rank (#19679) --- pandas/_libs/groupby_helper.pxi.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 48dac7bf10362..1d77a373bb7dd 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -531,7 +531,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, # each label corresponds to a different group value, # the mask helps you differentiate missing values before # performing sort on the actual values - _as = np.lexsort(order) + _as = np.lexsort(order).view(dtype=np.int64) if not ascending: _as = _as[::-1] From b9bd0d7fb2083b29a4943e67d6f646309449bebe Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 14 Feb 2018 03:05:46 -0800 Subject: [PATCH 107/214] BUG: Do not round DatetimeIndex nanosecond precision when iterating (#19628) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/_libs/tslib.pyx | 15 ++++++++------- pandas/conftest.py | 6 ++++++ pandas/tests/indexes/datetimes/test_timezones.py | 13 ++++++++++++- 4 files changed, 27 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 72f63a4da0f4d..b6316bd39f396 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -729,6 +729,7 @@ Timezones - Bug in :func:`DatetimeIndex.insert` where inserting ``NaT`` into a timezone-aware index incorrectly raised (:issue:`16357`) - Bug in the :class:`DataFrame` constructor, where tz-aware Datetimeindex and a given column name will result in an empty ``DataFrame`` (:issue:`19157`) - Bug in 
:func:`Timestamp.tz_localize` where localizing a timestamp near the minimum or maximum valid values could overflow and return a timestamp with an incorrect nanosecond value (:issue:`12677`) +- Bug when iterating over :class:`DatetimeIndex` that was localized with fixed timezone offset that rounded nanosecond precision to microseconds (:issue:`19603`) Offsets ^^^^^^^ diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 85e667521e5f2..fec7f21d6e6eb 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -46,7 +46,8 @@ from tslibs.timezones cimport (is_utc, is_tzlocal, is_fixed_offset, treat_tz_as_pytz, get_dst_info) from tslibs.conversion cimport (tz_convert_single, _TSObject, convert_datetime_to_tsobject, - get_datetime64_nanos) + get_datetime64_nanos, + tz_convert_utc_to_tzlocal) from tslibs.conversion import tz_convert_single from tslibs.nattype import NaT, nat_strings, iNaT @@ -144,12 +145,12 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, if value == NPY_NAT: result[i] = NaT else: - dt64_to_dtstruct(value, &dts) - dt = create_datetime_from_ts(value, dts, tz, freq) - dt = dt + tz.utcoffset(dt) - if box: - dt = Timestamp(dt) - result[i] = dt + # Python datetime objects do not support nanosecond + # resolution (yet, PEP 564). Need to compute new value + # using the i8 representation. + local_value = tz_convert_utc_to_tzlocal(value, tz) + dt64_to_dtstruct(local_value, &dts) + result[i] = func_create(value, dts, tz, freq) else: trans, deltas, typ = get_dst_info(tz) diff --git a/pandas/conftest.py b/pandas/conftest.py index 4fe66d4cf7e1f..37f0a2f818a3b 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -93,3 +93,9 @@ def compression_no_zip(request): except zip """ return request.param + + +@pytest.fixture(scope='module') +def datetime_tz_utc(): + from datetime import timezone + return timezone.utc diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 075d239df5f7a..62854676d43be 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -17,7 +17,7 @@ import pandas as pd from pandas._libs import tslib from pandas._libs.tslibs import timezones -from pandas.compat import lrange, zip +from pandas.compat import lrange, zip, PY3 from pandas import (DatetimeIndex, date_range, bdate_range, Timestamp, isna, to_datetime, Index) @@ -949,6 +949,17 @@ def test_dti_union_aware(self): result = rng.union(rng2) assert result.tz.zone == 'UTC' + @pytest.mark.parametrize('tz', [None, 'UTC', "US/Central", + dateutil.tz.tzoffset(None, -28800)]) + @pytest.mark.usefixtures("datetime_tz_utc") + @pytest.mark.skipif(not PY3, reason="datetime.timezone not in PY2") + def test_iteration_preserves_nanoseconds(self, tz): + # GH 19603 + index = DatetimeIndex(["2018-02-08 15:00:00.168456358", + "2018-02-08 15:00:00.168456359"], tz=tz) + for i, ts in enumerate(index): + assert ts == index[i] + class TestDateRange(object): """Tests for date_range with timezones""" From 76f175bec48d51749bbc8b48526ac0f63c01b89a Mon Sep 17 00:00:00 2001 From: Aaron Critchley Date: Wed, 14 Feb 2018 11:12:07 +0000 Subject: [PATCH 108/214] COMPAT-18589: Supporting axis in Series.rename (#18923) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/generic.py | 3 +++ pandas/tests/series/test_alter_axes.py | 8 ++++++++ 3 files changed, 12 insertions(+) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index b6316bd39f396..dddd370780ab6 100644 
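To make the iteration fix above concrete (a sketch based on the new test, not additional patch content): iterating a tz-aware ``DatetimeIndex`` now yields ``Timestamp`` objects that keep full nanosecond precision instead of rounding to microseconds.

.. code-block:: python

   import pandas as pd

   idx = pd.DatetimeIndex(['2018-02-08 15:00:00.168456358',
                           '2018-02-08 15:00:00.168456359'], tz='UTC')
   all(ts == idx[i] for i, ts in enumerate(idx))   # True once nanoseconds survive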
--- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -841,6 +841,7 @@ Reshaping - Bug in :func:`concat` when concatting sparse and dense series it returns only a ``SparseDataFrame``. Should be a ``DataFrame``. (:issue:`18914`, :issue:`18686`, and :issue:`16874`) - Improved error message for :func:`DataFrame.merge` when there is no common merge key (:issue:`19427`) - Bug in :func:`DataFrame.join` which does an *outer* instead of a *left* join when being called with multiple DataFrames and some have non-unique indices (:issue:`19624`) +- :func:`Series.rename` now accepts ``axis`` as a kwarg (:issue:`18589`) Other ^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 35f866c9e7d58..297450417e3cf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -863,6 +863,9 @@ def rename(self, *args, **kwargs): copy = kwargs.pop('copy', True) inplace = kwargs.pop('inplace', False) level = kwargs.pop('level', None) + axis = kwargs.pop('axis', None) + if axis is not None: + axis = self._get_axis_number(axis) if kwargs: raise TypeError('rename() got an unexpected keyword ' diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py index 714e43a4af1f8..dce4e82cbdcf1 100644 --- a/pandas/tests/series/test_alter_axes.py +++ b/pandas/tests/series/test_alter_axes.py @@ -81,6 +81,14 @@ def test_rename_set_name_inplace(self): exp = np.array(['a', 'b', 'c'], dtype=np.object_) tm.assert_numpy_array_equal(s.index.values, exp) + def test_rename_axis_supported(self): + # Supporting axis for compatibility, detailed in GH-18589 + s = Series(range(5)) + s.rename({}, axis=0) + s.rename({}, axis='index') + with tm.assert_raises_regex(ValueError, 'No axis named 5'): + s.rename({}, axis=5) + def test_set_name_attribute(self): s = Series([1, 2, 3]) s2 = Series([1, 2, 3], name='bar') From 39e7b6916b07982240bac87132848fb2665806a2 Mon Sep 17 00:00:00 2001 From: Matt Kirk Date: Wed, 14 Feb 2018 18:13:19 +0700 Subject: [PATCH 109/214] Performance increase rolling min max (#19549) --- asv_bench/benchmarks/rolling.py | 16 +++++++++- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/_libs/src/headers/cmath | 15 +++++++++ pandas/_libs/src/headers/math.h | 11 ------- pandas/_libs/window.pyx | 54 ++++++++++++++++++++++----------- setup.py | 5 +-- 6 files changed, 70 insertions(+), 32 deletions(-) create mode 100644 pandas/_libs/src/headers/cmath delete mode 100644 pandas/_libs/src/headers/math.h diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 75990d83f8212..ba25ad6c5eda6 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -16,12 +16,26 @@ class Methods(object): def setup(self, constructor, window, dtype, method): N = 10**5 - arr = np.random.random(N).astype(dtype) + arr = (100 * np.random.random(N)).astype(dtype) self.roll = getattr(pd, constructor)(arr).rolling(window) def time_rolling(self, constructor, window, dtype, method): getattr(self.roll, method)() +class VariableWindowMethods(Methods): + sample_time = 0.2 + params = (['DataFrame', 'Series'], + ['50s', '1h', '1d'], + ['int', 'float'], + ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt', + 'sum']) + param_names = ['contructor', 'window', 'dtype', 'method'] + + def setup(self, constructor, window, dtype, method): + N = 10**5 + arr = (100 * np.random.random(N)).astype(dtype) + index = pd.date_range('2017-01-01', periods=N, freq='5s') + self.roll = getattr(pd, constructor)(arr, index=index).rolling(window) class 
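A short usage sketch of the ``Series.rename`` compatibility change above (hypothetical data; the new test only checks that ``axis`` is validated):

.. code-block:: python

   import pandas as pd

   s = pd.Series(range(5))
   s.rename(str, axis='index')   # axis accepted for DataFrame-compatible call sites
   s.rename(str)                 # equivalent; axis is validated but changes nothing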
Pairwise(object): diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index dddd370780ab6..932618ba1df21 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -645,6 +645,7 @@ Performance Improvements - Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`) - Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`) - Improved performance of :func:`DataFrameGroupBy.rank` (:issue:`15779`) +- Improved performance of variable ``.rolling()`` on ``.min()`` and ``.max()`` (:issue:`19521`) .. _whatsnew_0230.docs: diff --git a/pandas/_libs/src/headers/cmath b/pandas/_libs/src/headers/cmath new file mode 100644 index 0000000000000..d8e2239406cae --- /dev/null +++ b/pandas/_libs/src/headers/cmath @@ -0,0 +1,15 @@ +#ifndef _PANDAS_MATH_H_ +#define _PANDAS_MATH_H_ + +// In older versions of Visual Studio there wasn't a std::signbit defined +// This defines it using _copysign +#if defined(_MSC_VER) && (_MSC_VER < 1800) +#include +namespace std { + __inline int signbit(double num) { return _copysign(1.0, num) < 0; } +} +#else +#include +#endif + +#endif diff --git a/pandas/_libs/src/headers/math.h b/pandas/_libs/src/headers/math.h deleted file mode 100644 index 34ad9f24a58f9..0000000000000 --- a/pandas/_libs/src/headers/math.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef _PANDAS_MATH_H_ -#define _PANDAS_MATH_H_ - -#if defined(_MSC_VER) && (_MSC_VER < 1800) -#include -__inline int signbit(double num) { return _copysign(1.0, num) < 0; } -#else -#include -#endif - -#endif diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index cacb073da581c..aa13f03d8e9e4 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -3,6 +3,7 @@ cimport cython from cython cimport Py_ssize_t +from libcpp.deque cimport deque from libc.stdlib cimport malloc, free @@ -12,7 +13,7 @@ from numpy cimport ndarray, double_t, int64_t, float64_t cnp.import_array() -cdef extern from "../src/headers/math.h": +cdef extern from "../src/headers/cmath" namespace "std": int signbit(double) nogil double sqrt(double x) nogil @@ -1222,8 +1223,9 @@ cdef _roll_min_max(ndarray[numeric] input, int64_t win, int64_t minp, cdef: numeric ai bint is_variable, should_replace - int64_t s, e, N, i, j, removed + int64_t N, i, removed, window_i Py_ssize_t nobs = 0 + deque Q[int64_t] ndarray[int64_t] starti, endi ndarray[numeric, ndim=1] output cdef: @@ -1242,32 +1244,48 @@ cdef _roll_min_max(ndarray[numeric] input, int64_t win, int64_t minp, output = np.empty(N, dtype=input.dtype) + Q = deque[int64_t]() + if is_variable: with nogil: - for i in range(N): - s = starti[i] - e = endi[i] + # This is using a modified version of the C++ code in this + # SO post: http://bit.ly/2nOoHlY + # The original impl didn't deal with variable window sizes + # So the code was optimized for that - r = input[s] - nobs = 0 - for j in range(s, e): + for i from starti[0] <= i < endi[0]: + ai = init_mm(input[i], &nobs, is_max) - # adds, death at the i offset - ai = init_mm(input[j], &nobs, is_max) + if is_max: + while not Q.empty() and ai >= input[Q.back()]: + Q.pop_back() + else: + while not Q.empty() and ai <= input[Q.back()]: + Q.pop_back() + Q.push_back(i) - if is_max: - if ai > r: - r = ai - else: - if ai < r: - r = ai + for i from endi[0] <= i < N: + output[i-1] = calc_mm(minp, nobs, input[Q.front()]) - output[i] = calc_mm(minp, nobs, r) + ai = 
init_mm(input[i], &nobs, is_max) - else: + if is_max: + while not Q.empty() and ai >= input[Q.back()]: + Q.pop_back() + else: + while not Q.empty() and ai <= input[Q.back()]: + Q.pop_back() + while not Q.empty() and Q.front() <= i - (endi[i] - starti[i]): + Q.pop_front() + + Q.push_back(i) + + output[N-1] = calc_mm(minp, nobs, input[Q.front()]) + + else: # setup the rings of death! ring = malloc(win * sizeof(numeric)) death = malloc(win * sizeof(int64_t)) diff --git a/setup.py b/setup.py index 2332503e558ed..c66979dd19ef0 100755 --- a/setup.py +++ b/setup.py @@ -617,7 +617,8 @@ def pxd(name): 'pyxfile': '_libs/testing'}, '_libs.window': { 'pyxfile': '_libs/window', - 'pxdfiles': ['_libs/skiplist', '_libs/src/util']}, + 'pxdfiles': ['_libs/skiplist', '_libs/src/util'], + 'language': 'c++'}, '_libs.writers': { 'pyxfile': '_libs/writers', 'pxdfiles': ['_libs/src/util']}, @@ -640,11 +641,11 @@ def pxd(name): sources=sources, depends=data.get('depends', []), include_dirs=include, + language=data.get('language', 'c'), extra_compile_args=extra_compile_args) extensions.append(obj) - # ---------------------------------------------------------------------- # msgpack From 5a20717ddc58c4d961227a482fa88f905f159bab Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 14 Feb 2018 03:26:51 -0800 Subject: [PATCH 110/214] tests for tslibs.conversion and tslibs.timezones (#19642) --- pandas/tests/tseries/test_timezones.py | 86 +------------------------- pandas/tests/tslibs/test_conversion.py | 57 +++++++++++++++++ pandas/tests/tslibs/test_timezones.py | 37 +++++++++++ 3 files changed, 95 insertions(+), 85 deletions(-) create mode 100644 pandas/tests/tslibs/test_conversion.py create mode 100644 pandas/tests/tslibs/test_timezones.py diff --git a/pandas/tests/tseries/test_timezones.py b/pandas/tests/tseries/test_timezones.py index 565e735c14c80..97326dc04a522 100644 --- a/pandas/tests/tseries/test_timezones.py +++ b/pandas/tests/tseries/test_timezones.py @@ -2,15 +2,10 @@ import pytest import pytz -import dateutil -import numpy as np from datetime import datetime -import pandas.util.testing as tm -from pandas.core.indexes.datetimes import date_range -from pandas._libs import tslib -from pandas._libs.tslibs import timezones, conversion +from pandas._libs.tslibs import timezones from pandas import Timestamp @@ -111,82 +106,3 @@ def localize(self, tz, x): def normalize(self, ts): # no-op for dateutil return ts - - def test_tzlocal(self): - # GH 13583 - ts = Timestamp('2011-01-01', tz=dateutil.tz.tzlocal()) - assert ts.tz == dateutil.tz.tzlocal() - assert "tz='tzlocal()')" in repr(ts) - - tz = timezones.maybe_get_tz('tzlocal()') - assert tz == dateutil.tz.tzlocal() - - # get offset using normal datetime for test - offset = dateutil.tz.tzlocal().utcoffset(datetime(2011, 1, 1)) - offset = offset.total_seconds() * 1000000000 - assert ts.value + offset == Timestamp('2011-01-01').value - - -class TestTimeZoneCacheKey(object): - - @pytest.mark.parametrize('tz_name', list(pytz.common_timezones)) - def test_cache_keys_are_distinct_for_pytz_vs_dateutil(self, tz_name): - if tz_name == 'UTC': - # skip utc as it's a special case in dateutil - return - tz_p = timezones.maybe_get_tz(tz_name) - tz_d = timezones.maybe_get_tz('dateutil/' + tz_name) - if tz_d is None: - # skip timezones that dateutil doesn't know about. 
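For readers skimming the variable-window ``min``/``max`` change above, here is a rough pure-Python sketch of the monotonic-deque idea it borrows (illustrative only; the real implementation is the Cython code in ``window.pyx``):

.. code-block:: python

   from collections import deque

   def sliding_max(values, window):
       # indices in q always point at decreasing values, so the front of the
       # deque is the maximum of the current window; O(n) overall
       q, out = deque(), []
       for i, v in enumerate(values):
           while q and values[q[-1]] <= v:
               q.pop()
           q.append(i)
           if q[0] <= i - window:
               q.popleft()
           if i >= window - 1:
               out.append(values[q[0]])
       return out

   sliding_max([3, 1, 4, 1, 5, 9, 2, 6], 3)   # [4, 4, 5, 9, 9, 9]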
- return - assert (timezones._p_tz_cache_key(tz_p) != - timezones._p_tz_cache_key(tz_d)) - - -class TestTslib(object): - - def test_tslib_tz_convert(self): - def compare_utc_to_local(tz_didx, utc_didx): - f = lambda x: conversion.tz_convert_single(x, 'UTC', tz_didx.tz) - result = conversion.tz_convert(tz_didx.asi8, 'UTC', tz_didx.tz) - result_single = np.vectorize(f)(tz_didx.asi8) - tm.assert_numpy_array_equal(result, result_single) - - def compare_local_to_utc(tz_didx, utc_didx): - f = lambda x: conversion.tz_convert_single(x, tz_didx.tz, 'UTC') - result = conversion.tz_convert(utc_didx.asi8, tz_didx.tz, 'UTC') - result_single = np.vectorize(f)(utc_didx.asi8) - tm.assert_numpy_array_equal(result, result_single) - - for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern', 'Europe/Moscow']: - # US: 2014-03-09 - 2014-11-11 - # MOSCOW: 2014-10-26 / 2014-12-31 - tz_didx = date_range('2014-03-01', '2015-01-10', freq='H', tz=tz) - utc_didx = date_range('2014-03-01', '2015-01-10', freq='H') - compare_utc_to_local(tz_didx, utc_didx) - # local tz to UTC can be differ in hourly (or higher) freqs because - # of DST - compare_local_to_utc(tz_didx, utc_didx) - - tz_didx = date_range('2000-01-01', '2020-01-01', freq='D', tz=tz) - utc_didx = date_range('2000-01-01', '2020-01-01', freq='D') - compare_utc_to_local(tz_didx, utc_didx) - compare_local_to_utc(tz_didx, utc_didx) - - tz_didx = date_range('2000-01-01', '2100-01-01', freq='A', tz=tz) - utc_didx = date_range('2000-01-01', '2100-01-01', freq='A') - compare_utc_to_local(tz_didx, utc_didx) - compare_local_to_utc(tz_didx, utc_didx) - - # Check empty array - result = conversion.tz_convert(np.array([], dtype=np.int64), - timezones.maybe_get_tz('US/Eastern'), - timezones.maybe_get_tz('Asia/Tokyo')) - tm.assert_numpy_array_equal(result, np.array([], dtype=np.int64)) - - # Check all-NaT array - result = conversion.tz_convert(np.array([tslib.iNaT], dtype=np.int64), - timezones.maybe_get_tz('US/Eastern'), - timezones.maybe_get_tz('Asia/Tokyo')) - tm.assert_numpy_array_equal(result, np.array( - [tslib.iNaT], dtype=np.int64)) diff --git a/pandas/tests/tslibs/test_conversion.py b/pandas/tests/tslibs/test_conversion.py new file mode 100644 index 0000000000000..76038136c26cb --- /dev/null +++ b/pandas/tests/tslibs/test_conversion.py @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- + +import numpy as np +import pytest + +import pandas.util.testing as tm +from pandas import date_range +from pandas._libs.tslib import iNaT +from pandas._libs.tslibs import conversion, timezones + + +def compare_utc_to_local(tz_didx, utc_didx): + f = lambda x: conversion.tz_convert_single(x, 'UTC', tz_didx.tz) + result = conversion.tz_convert(tz_didx.asi8, 'UTC', tz_didx.tz) + result_single = np.vectorize(f)(tz_didx.asi8) + tm.assert_numpy_array_equal(result, result_single) + + +def compare_local_to_utc(tz_didx, utc_didx): + f = lambda x: conversion.tz_convert_single(x, tz_didx.tz, 'UTC') + result = conversion.tz_convert(utc_didx.asi8, tz_didx.tz, 'UTC') + result_single = np.vectorize(f)(utc_didx.asi8) + tm.assert_numpy_array_equal(result, result_single) + + +class TestTZConvert(object): + + @pytest.mark.parametrize('tz', ['UTC', 'Asia/Tokyo', + 'US/Eastern', 'Europe/Moscow']) + def test_tz_convert_single_matches_tz_convert_hourly(self, tz): + # US: 2014-03-09 - 2014-11-11 + # MOSCOW: 2014-10-26 / 2014-12-31 + tz_didx = date_range('2014-03-01', '2015-01-10', freq='H', tz=tz) + utc_didx = date_range('2014-03-01', '2015-01-10', freq='H') + compare_utc_to_local(tz_didx, utc_didx) + + # local tz to UTC can 
be differ in hourly (or higher) freqs because + # of DST + compare_local_to_utc(tz_didx, utc_didx) + + @pytest.mark.parametrize('tz', ['UTC', 'Asia/Tokyo', + 'US/Eastern', 'Europe/Moscow']) + @pytest.mark.parametrize('freq', ['D', 'A']) + def test_tz_convert_single_matches_tz_convert(self, tz, freq): + tz_didx = date_range('2000-01-01', '2020-01-01', freq=freq, tz=tz) + utc_didx = date_range('2000-01-01', '2020-01-01', freq=freq) + compare_utc_to_local(tz_didx, utc_didx) + compare_local_to_utc(tz_didx, utc_didx) + + @pytest.mark.parametrize('arr', [ + pytest.param(np.array([], dtype=np.int64), id='empty'), + pytest.param(np.array([iNaT], dtype=np.int64), id='all_nat')]) + def test_tz_convert_corner(self, arr): + result = conversion.tz_convert(arr, + timezones.maybe_get_tz('US/Eastern'), + timezones.maybe_get_tz('Asia/Tokyo')) + tm.assert_numpy_array_equal(result, arr) diff --git a/pandas/tests/tslibs/test_timezones.py b/pandas/tests/tslibs/test_timezones.py new file mode 100644 index 0000000000000..603c5e3fea26f --- /dev/null +++ b/pandas/tests/tslibs/test_timezones.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +from datetime import datetime + +import pytest +import pytz +import dateutil.tz + +from pandas._libs.tslibs import timezones +from pandas import Timestamp + + +@pytest.mark.parametrize('tz_name', list(pytz.common_timezones)) +def test_cache_keys_are_distinct_for_pytz_vs_dateutil(tz_name): + if tz_name == 'UTC': + # skip utc as it's a special case in dateutil + return + tz_p = timezones.maybe_get_tz(tz_name) + tz_d = timezones.maybe_get_tz('dateutil/' + tz_name) + if tz_d is None: + # skip timezones that dateutil doesn't know about. + return + assert timezones._p_tz_cache_key(tz_p) != timezones._p_tz_cache_key(tz_d) + + +def test_tzlocal(): + # GH#13583 + ts = Timestamp('2011-01-01', tz=dateutil.tz.tzlocal()) + assert ts.tz == dateutil.tz.tzlocal() + assert "tz='tzlocal()')" in repr(ts) + + tz = timezones.maybe_get_tz('tzlocal()') + assert tz == dateutil.tz.tzlocal() + + # get offset using normal datetime for test + offset = dateutil.tz.tzlocal().utcoffset(datetime(2011, 1, 1)) + offset = offset.total_seconds() * 1000000000 + assert ts.value + offset == Timestamp('2011-01-01').value From d198a6efd5a3d2e7d5afb4a3cf556507c7501dd4 Mon Sep 17 00:00:00 2001 From: Tommy <10076072+tommyod@users.noreply.github.com> Date: Wed, 14 Feb 2018 12:31:20 +0100 Subject: [PATCH 111/214] Spellchecked io.rst (#19660) --- doc/source/io.rst | 418 ++++++++++++++++++++++++---------------------- 1 file changed, 217 insertions(+), 201 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 1785de54b7dd6..7bb34e4d232dd 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -28,8 +28,11 @@ IO Tools (Text, CSV, HDF5, ...) =============================== -The pandas I/O API is a set of top level ``reader`` functions accessed like ``pd.read_csv()`` that generally return a ``pandas`` -object. The corresponding ``writer`` functions are object methods that are accessed like ``df.to_csv()`` +The pandas I/O API is a set of top level ``reader`` functions accessed like +:func:`pandas.read_csv` that generally return a pandas object. The corresponding +``writer`` functions are object methods that are accessed like +:meth:`DataFrame.to_csv`. Below is a table containing available ``readers`` and +``writers``. .. csv-table:: :header: "Format Type", "Data Description", "Reader", "Writer" @@ -65,13 +68,14 @@ CSV & Text files The two workhorse functions for reading text files (a.k.a. 
flat files) are :func:`read_csv` and :func:`read_table`. They both use the same parsing code to -intelligently convert tabular data into a DataFrame object. See the +intelligently convert tabular data into a ``DataFrame`` object. See the :ref:`cookbook` for some advanced strategies. Parsing options ''''''''''''''' -:func:`read_csv` and :func:`read_table` accept the following arguments: +The functions :func:`read_csv` and :func:`read_table` accept the following +common arguments: Basic +++++ @@ -94,7 +98,7 @@ delimiter : str, default ``None`` delim_whitespace : boolean, default False Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be used as the delimiter. Equivalent to setting ``sep='\s+'``. - If this option is set to True, nothing should be passed in for the + If this option is set to ``True``, nothing should be passed in for the ``delimiter`` parameter. .. versionadded:: 0.18.1 support for the Python parser. @@ -122,7 +126,7 @@ names : array-like, default ``None`` explicitly pass ``header=None``. Duplicates in this list will cause a ``UserWarning`` to be issued. index_col : int or sequence or ``False``, default ``None`` - Column to use as the row labels of the DataFrame. If a sequence is given, a + Column to use as the row labels of the ``DataFrame``. If a sequence is given, a MultiIndex is used. If you have a malformed file with delimiters at the end of each line, you might consider ``index_col=False`` to force pandas to *not* use the first column as the index (row names). @@ -131,8 +135,8 @@ usecols : array-like or callable, default ``None`` be positional (i.e. integer indices into the document columns) or strings that correspond to column names provided either by the user in `names` or inferred from the document header row(s). For example, a valid array-like - `usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Element - order is ignored, so usecols=[0,1] is the same as [1, 0]. + `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. + Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. If callable, the callable function will be evaluated against the column names, returning names where the callable function evaluates to True: @@ -145,12 +149,12 @@ usecols : array-like or callable, default ``None`` Using this parameter results in much faster parsing time and lower memory usage. squeeze : boolean, default ``False`` - If the parsed data only contains one column then return a Series. + If the parsed data only contains one column then return a ``Series``. prefix : str, default ``None`` Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ... mangle_dupe_cols : boolean, default ``True`` Duplicate columns will be specified as 'X', 'X.1'...'X.N', rather than 'X'...'X'. - Passing in False will cause data to be overwritten if there are duplicate + Passing in ``False`` will cause data to be overwritten if there are duplicate names in the columns. General Parsing Configuration @@ -197,7 +201,7 @@ low_memory : boolean, default ``True`` Internally process the file in chunks, resulting in lower memory use while parsing, but possibly mixed type inference. To ensure no mixed types either set ``False``, or specify the type with the ``dtype`` parameter. - Note that the entire file is read into a single DataFrame regardless, + Note that the entire file is read into a single ``DataFrame`` regardless, use the ``chunksize`` or ``iterator`` parameter to return the data in chunks. 
(Only valid with C parser) memory_map : boolean, default False @@ -217,16 +221,16 @@ keep_default_na : boolean, default ``True`` Whether or not to include the default NaN values when parsing the data. Depending on whether `na_values` is passed in, the behavior is as follows: - * If `keep_default_na` is True, and `na_values` are specified, `na_values` + * If `keep_default_na` is ``True``, and `na_values` are specified, `na_values` is appended to the default NaN values used for parsing. - * If `keep_default_na` is True, and `na_values` are not specified, only + * If `keep_default_na` is ``True``, and `na_values` are not specified, only the default NaN values are used for parsing. - * If `keep_default_na` is False, and `na_values` are specified, only + * If `keep_default_na` is ``False``, and `na_values` are specified, only the NaN values specified `na_values` are used for parsing. - * If `keep_default_na` is False, and `na_values` are not specified, no + * If `keep_default_na` is ``False``, and `na_values` are not specified, no strings will be parsed as NaN. - Note that if `na_filter` is passed in as False, the `keep_default_na` and + Note that if `na_filter` is passed in as ``False``, the `keep_default_na` and `na_values` parameters will be ignored. na_filter : boolean, default ``True`` Detect missing value markers (empty strings and the value of na_values). In @@ -341,9 +345,9 @@ Error Handling error_bad_lines : boolean, default ``True`` Lines with too many fields (e.g. a csv line with too many commas) will by - default cause an exception to be raised, and no DataFrame will be returned. If - ``False``, then these "bad lines" will dropped from the DataFrame that is - returned. See :ref:`bad lines ` + default cause an exception to be raised, and no ``DataFrame`` will be + returned. If ``False``, then these "bad lines" will dropped from the + ``DataFrame`` that is returned. See :ref:`bad lines ` below. warn_bad_lines : boolean, default ``True`` If error_bad_lines is ``False``, and warn_bad_lines is ``True``, a warning for @@ -354,8 +358,8 @@ warn_bad_lines : boolean, default ``True`` Specifying column data types '''''''''''''''''''''''''''' -You can indicate the data type for the whole DataFrame or -individual columns: +You can indicate the data type for the whole ``DataFrame`` or individual +columns: .. ipython:: python @@ -368,11 +372,11 @@ individual columns: df = pd.read_csv(StringIO(data), dtype={'b': object, 'c': np.float64}) df.dtypes -Fortunately, ``pandas`` offers more than one way to ensure that your column(s) +Fortunately, pandas offers more than one way to ensure that your column(s) contain only one ``dtype``. If you're unfamiliar with these concepts, you can see :ref:`here` to learn more about dtypes, and :ref:`here` to learn more about ``object`` conversion in -``pandas``. +pandas. For instance, you can use the ``converters`` argument @@ -395,7 +399,7 @@ dtypes after reading in the data, df2 df2['col_1'].apply(type).value_counts() -which would convert all valid parsing to floats, leaving the invalid parsing +which will convert all valid parsing to floats, leaving the invalid parsing as ``NaN``. Ultimately, how you deal with reading in columns containing mixed dtypes @@ -407,7 +411,7 @@ worth trying. .. versionadded:: 0.20.0 support for the Python parser. - The ``dtype`` option is supported by the 'python' engine + The ``dtype`` option is supported by the 'python' engine. .. 
note:: In some cases, reading in abnormal data with columns containing mixed dtypes @@ -453,7 +457,8 @@ Specifying Categorical dtype pd.read_csv(StringIO(data)).dtypes pd.read_csv(StringIO(data), dtype='category').dtypes -Individual columns can be parsed as a ``Categorical`` using a dict specification +Individual columns can be parsed as a ``Categorical`` using a dict +specification: .. ipython:: python @@ -551,17 +556,18 @@ If the header is in a row other than the first, pass the row number to Duplicate names parsing ''''''''''''''''''''''' -If the file or header contains duplicate names, pandas by default will deduplicate -these names so as to prevent data overwrite: +If the file or header contains duplicate names, pandas will by default +distinguish between them so as to prevent overwriting data: .. ipython :: python data = 'a,b,a\n0,1,2\n3,4,5' pd.read_csv(StringIO(data)) -There is no more duplicate data because ``mangle_dupe_cols=True`` by default, which modifies -a series of duplicate columns 'X'...'X' to become 'X', 'X.1',...'X.N'. If ``mangle_dupe_cols -=False``, duplicate data can arise: +There is no more duplicate data because ``mangle_dupe_cols=True`` by default, +which modifies a series of duplicate columns 'X', ..., 'X' to become +'X', 'X.1', ..., 'X.N'. If ``mangle_dupe_cols=False``, duplicate data can +arise: .. code-block :: python @@ -716,7 +722,7 @@ result in byte strings being decoded to unicode in the result: Some formats which encode all characters as multiple bytes, like UTF-16, won't parse correctly at all without specifying the encoding. `Full list of Python standard encodings -`_ +`_. .. _io.index_col: @@ -724,7 +730,7 @@ Index columns and trailing delimiters ''''''''''''''''''''''''''''''''''''' If a file has one more column of data than the number of column names, the -first column will be used as the DataFrame's row names: +first column will be used as the ``DataFrame``'s row names: .. ipython:: python @@ -894,30 +900,31 @@ Pandas will try to call the ``date_parser`` function in three different ways. If an exception is raised, the next one is tried: 1. ``date_parser`` is first called with one or more arrays as arguments, - as defined using `parse_dates` (e.g., ``date_parser(['2013', '2013'], ['1', '2'])``) + as defined using `parse_dates` (e.g., ``date_parser(['2013', '2013'], ['1', '2'])``). 2. If #1 fails, ``date_parser`` is called with all the columns - concatenated row-wise into a single array (e.g., ``date_parser(['2013 1', '2013 2'])``) + concatenated row-wise into a single array (e.g., ``date_parser(['2013 1', '2013 2'])``). 3. If #2 fails, ``date_parser`` is called once for every row with one or more string arguments from the columns indicated with `parse_dates` (e.g., ``date_parser('2013', '1')`` for the first row, ``date_parser('2013', '2')`` - for the second, etc.) + for the second, etc.). Note that performance-wise, you should try these methods of parsing dates in order: -1. Try to infer the format using ``infer_datetime_format=True`` (see section below) +1. Try to infer the format using ``infer_datetime_format=True`` (see section below). 2. If you know the format, use ``pd.to_datetime()``: - ``date_parser=lambda x: pd.to_datetime(x, format=...)`` + ``date_parser=lambda x: pd.to_datetime(x, format=...)``. 3. If you have a really non-standard format, use a custom ``date_parser`` function. For optimal performance, this should be vectorized, i.e., it should accept arrays as arguments. 
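For instance, combining points 2 and 3 above might look like the following (a short illustrative snippet; the column name and format string are made up):

.. code-block:: python

   from io import StringIO
   import pandas as pd

   data = 'date,value\n2011/12/30,1\n2011/12/31,2'
   parser = lambda x: pd.to_datetime(x, format='%Y/%m/%d')  # known, fixed format
   pd.read_csv(StringIO(data), parse_dates=['date'], date_parser=parser)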
-You can explore the date parsing functionality in ``date_converters.py`` and -add your own. We would love to turn this module into a community supported set -of date/time parsers. To get you started, ``date_converters.py`` contains +You can explore the date parsing functionality in +`date_converters.py `__ +and add your own. We would love to turn this module into a community supported +set of date/time parsers. To get you started, ``date_converters.py`` contains functions to parse dual date and time columns, year/month/day columns, and year/month/day/hour/minute/second columns. It also contains a ``generic_parser`` function so you can curry it with a function that deals with @@ -945,7 +952,7 @@ of strings. So in general, ``infer_datetime_format`` should not have any negative consequences if enabled. Here are some examples of datetime strings that can be guessed (All -representing December 30th, 2011 at 00:00:00) +representing December 30th, 2011 at 00:00:00): - "20111230" - "2011/12/30" @@ -954,7 +961,7 @@ representing December 30th, 2011 at 00:00:00) - "30/Dec/2011 00:00:00" - "30/December/2011 00:00:00" -``infer_datetime_format`` is sensitive to ``dayfirst``. With +Note that ``infer_datetime_format`` is sensitive to ``dayfirst``. With ``dayfirst=True``, it will guess "01/12/2011" to be December 1st. With ``dayfirst=False`` (default) it will guess "01/12/2011" to be January 12th. @@ -1030,7 +1037,7 @@ correctly: with open('tmp.csv', 'w') as fh: fh.write(data) -By default, numbers with a thousands separator will be parsed as strings +By default, numbers with a thousands separator will be parsed as strings: .. ipython:: python @@ -1040,7 +1047,7 @@ By default, numbers with a thousands separator will be parsed as strings df.level.dtype -The ``thousands`` keyword allows integers to be parsed correctly +The ``thousands`` keyword allows integers to be parsed correctly: .. ipython:: python @@ -1060,11 +1067,12 @@ The ``thousands`` keyword allows integers to be parsed correctly NA Values ''''''''' -To control which values are parsed as missing values (which are signified by ``NaN``), specify a -string in ``na_values``. If you specify a list of strings, then all values in -it are considered to be missing values. If you specify a number (a ``float``, like ``5.0`` or an ``integer`` like ``5``), -the corresponding equivalent values will also imply a missing value (in this case effectively -``[5.0,5]`` are recognized as ``NaN``. +To control which values are parsed as missing values (which are signified by +``NaN``), specify a string in ``na_values``. If you specify a list of strings, +then all values in it are considered to be missing values. If you specify a +number (a ``float``, like ``5.0`` or an ``integer`` like ``5``), the +corresponding equivalent values will also imply a missing value (in this case +effectively ``[5.0, 5]`` are recognized as ``NaN``). To completely override the default values that are recognized as missing, specify ``keep_default_na=False``. @@ -1073,29 +1081,34 @@ To completely override the default values that are recognized as missing, specif The default ``NaN`` recognized values are ``['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '']``. +Let us consider some examples: + .. 
code-block:: python read_csv(path, na_values=[5]) -the default values, in addition to ``5`` , ``5.0`` when interpreted as numbers are recognized as ``NaN`` +In the example above ``5`` and ``5.0`` will be recognized as ``NaN``, in +addition to the defaults. A string will first be interpreted as a numerical +``5``, then as a ``NaN``. .. code-block:: python read_csv(path, keep_default_na=False, na_values=[""]) -only an empty field will be ``NaN`` +Above, only an empty field will be recognized as ``NaN``. .. code-block:: python read_csv(path, keep_default_na=False, na_values=["NA", "0"]) -only ``NA`` and ``0`` as strings are ``NaN`` +Above, both ``NA`` and ``0`` as strings are ``NaN``. .. code-block:: python read_csv(path, na_values=["Nope"]) -the default values, in addition to the string ``"Nope"`` are recognized as ``NaN`` +The default values, in addition to the string ``"Nope"`` are recognized as +``NaN``. .. _io.infinity: @@ -1143,9 +1156,9 @@ Boolean values '''''''''''''' The common values ``True``, ``False``, ``TRUE``, and ``FALSE`` are all -recognized as boolean. Sometime you would want to recognize some other values -as being boolean. To do this use the ``true_values`` and ``false_values`` -options: +recognized as boolean. Occasionally you might want to recognize other values +as being boolean. To do this, use the ``true_values`` and ``false_values`` +options as follows: .. ipython:: python @@ -1161,7 +1174,7 @@ Handling "bad" lines Some files may have malformed lines with too few fields or too many. Lines with too few fields will have NA values filled in the trailing fields. Lines with -too many will cause an error by default: +too many fields will raise an error by default: .. ipython:: python :suppress: @@ -1228,7 +1241,7 @@ By default, ``read_csv`` uses the Excel dialect and treats the double quote as the quote character, which causes it to fail when it finds a newline before it finds the closing double quote. -We can get around this using ``dialect`` +We can get around this using ``dialect``: .. ipython:: python :okwarning: @@ -1253,9 +1266,9 @@ after a delimiter: print(data) pd.read_csv(StringIO(data), skipinitialspace=True) -The parsers make every attempt to "do the right thing" and not be very -fragile. Type inference is a pretty big deal. So if a column can be coerced to -integer dtype without altering the contents, it will do so. Any non-numeric +The parsers make every attempt to "do the right thing" and not be fragile. Type +inference is a pretty big deal. If a column can be coerced to integer dtype +without altering the contents, the parser will do so. Any non-numeric columns will come through as object dtype as with the rest of pandas objects. .. _io.quoting: @@ -1278,7 +1291,7 @@ should pass the ``escapechar`` option: Files with Fixed Width Columns '''''''''''''''''''''''''''''' -While ``read_csv`` reads delimited data, the :func:`read_fwf` function works +While :func:`read_csv` reads delimited data, the :func:`read_fwf` function works with data files that have known and fixed column widths. The function parameters to ``read_fwf`` are largely the same as `read_csv` with two extra parameters, and a different usage of the ``delimiter`` parameter: @@ -1287,7 +1300,7 @@ a different usage of the ``delimiter`` parameter: fixed-width fields of each line as half-open intervals (i.e., [from, to[ ). String value 'infer' can be used to instruct the parser to try detecting the column specifications from the first 100 rows of the data. 
Default - behaviour, if not specified, is to infer. + behavior, if not specified, is to infer. - ``widths``: A list of field widths which can be used instead of 'colspecs' if the intervals are contiguous. - ``delimiter``: Characters to consider as filler characters in the fixed-width file. @@ -1312,7 +1325,7 @@ Consider a typical fixed-width data file: print(open('bar.csv').read()) -In order to parse this file into a DataFrame, we simply need to supply the +In order to parse this file into a ``DataFrame``, we simply need to supply the column specifications to the `read_fwf` function along with the file name: .. ipython:: python @@ -1383,7 +1396,7 @@ column: print(open('foo.csv').read()) In this special case, ``read_csv`` assumes that the first column is to be used -as the index of the DataFrame: +as the index of the ``DataFrame``: .. ipython:: python @@ -1436,10 +1449,10 @@ rows will skip the intervening rows. .. ipython:: python from pandas.util.testing import makeCustomDataframe as mkdf - df = mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) + df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) df.to_csv('mi.csv') print(open('mi.csv').read()) - pd.read_csv('mi.csv',header=[0,1,2,3],index_col=[0,1]) + pd.read_csv('mi.csv', header=[0, 1, 2, 3], index_col=[0, 1]) ``read_csv`` is also able to interpret a more common format of multi-columns indices. @@ -1448,17 +1461,17 @@ of multi-columns indices. :suppress: data = ",a,a,a,b,c,c\n,q,r,s,t,u,v\none,1,2,3,4,5,6\ntwo,7,8,9,10,11,12" - fh = open('mi2.csv','w') + fh = open('mi2.csv', 'w') fh.write(data) fh.close() .. ipython:: python print(open('mi2.csv').read()) - pd.read_csv('mi2.csv',header=[0,1],index_col=0) + pd.read_csv('mi2.csv', header=[0, 1], index_col=0) Note: If an ``index_col`` is not specified (e.g. you don't have an index, or wrote it -with ``df.to_csv(..., index=False``), then any ``names`` on the columns index will be *lost*. +with ``df.to_csv(..., index=False)``, then any ``names`` on the columns index will be *lost*. .. ipython:: python :suppress: @@ -1578,7 +1591,7 @@ Writing out Data Writing to CSV format +++++++++++++++++++++ -The Series and DataFrame objects have an instance method ``to_csv`` which +The ``Series`` and ``DataFrame`` objects have an instance method ``to_csv`` which allows storing the contents of the object as a comma-separated-values file. The function takes a number of arguments. Only the first is required. @@ -1591,7 +1604,7 @@ function takes a number of arguments. Only the first is required. - ``index``: whether to write row (index) names (default True) - ``index_label``: Column label(s) for index column(s) if desired. If None (default), and `header` and `index` are True, then the index names are - used. (A sequence should be given if the DataFrame uses MultiIndex). + used. (A sequence should be given if the ``DataFrame`` uses MultiIndex). - ``mode`` : Python write mode, default 'w' - ``encoding``: a string representing the encoding to use if the contents are non-ASCII, for Python versions prior to 3 @@ -1611,7 +1624,7 @@ Writing a formatted string .. _io.formatting: -The DataFrame object has an instance method ``to_string`` which allows control +The ``DataFrame`` object has an instance method ``to_string`` which allows control over the string representation of the object. All arguments are optional: - ``buf`` default None, for example a StringIO object @@ -1622,8 +1635,8 @@ over the string representation of the object. 
All arguments are optional: which takes a single argument and returns a formatted string - ``float_format`` default None, a function which takes a single (float) argument and returns a formatted string; to be applied to floats in the - DataFrame. - - ``sparsify`` default True, set to False for a DataFrame with a hierarchical + ``DataFrame``. + - ``sparsify`` default True, set to False for a ``DataFrame`` with a hierarchical index to print every multiindex key at each row. - ``index_names`` default True, will print the names of the indices - ``index`` default True, will print the index (ie, row labels) @@ -1631,7 +1644,7 @@ over the string representation of the object. All arguments are optional: - ``justify`` default ``left``, will print column headers left- or right-justified -The Series object also has a ``to_string`` method, but with only the ``buf``, +The ``Series`` object also has a ``to_string`` method, but with only the ``buf``, ``na_rep``, ``float_format`` arguments. There is also a ``length`` argument which, if set to ``True``, will additionally output the length of the Series. @@ -1654,11 +1667,11 @@ with optional parameters: This can be ``None`` in which case a JSON string is returned - ``orient`` : - Series : + ``Series``: - default is ``index`` - allowed values are {``split``, ``records``, ``index``} - DataFrame + ``DataFrame``: - default is ``columns`` - allowed values are {``split``, ``records``, ``index``, ``columns``, ``values``, ``table``} @@ -1693,7 +1706,7 @@ Orient Options ++++++++++++++ There are a number of different options for the format of the resulting JSON -file / string. Consider the following DataFrame and Series: +file / string. Consider the following ``DataFrame`` and ``Series``: .. ipython:: python @@ -1720,8 +1733,8 @@ but the index labels are now primary: sjo.to_json(orient="index") **Record oriented** serializes the data to a JSON array of column -> value records, -index labels are not included. This is useful for passing DataFrame data to plotting -libraries, for example the JavaScript library d3.js: +index labels are not included. This is useful for passing ``DataFrame`` data to plotting +libraries, for example the JavaScript library ``d3.js``: .. ipython:: python @@ -1756,7 +1769,7 @@ preservation of metadata including but not limited to dtypes and index names. Date Handling +++++++++++++ -Writing in ISO date format +Writing in ISO date format: .. ipython:: python @@ -1766,21 +1779,21 @@ Writing in ISO date format json = dfd.to_json(date_format='iso') json -Writing in ISO date format, with microseconds +Writing in ISO date format, with microseconds: .. ipython:: python json = dfd.to_json(date_format='iso', date_unit='us') json -Epoch timestamps, in seconds +Epoch timestamps, in seconds: .. ipython:: python json = dfd.to_json(date_format='epoch', date_unit='s') json -Writing to a file, with a date index and a date column +Writing to a file, with a date index and a date column: .. ipython:: python @@ -1795,7 +1808,8 @@ Writing to a file, with a date index and a date column Fallback Behavior +++++++++++++++++ -If the JSON serializer cannot handle the container contents directly it will fallback in the following manner: +If the JSON serializer cannot handle the container contents directly it will +fall back in the following manner: - if the dtype is unsupported (e.g. ``np.complex``) then the ``default_handler``, if provided, will be called for each value, otherwise an exception is raised. @@ -1864,13 +1878,13 @@ is ``None``. 
To explicitly force ``Series`` parsing, pass ``typ=series`` ``table``; adhering to the JSON `Table Schema`_ -- ``dtype`` : if True, infer dtypes, if a dict of column to dtype, then use those, if False, then don't infer dtypes at all, default is True, apply only to the data -- ``convert_axes`` : boolean, try to convert the axes to the proper dtypes, default is True -- ``convert_dates`` : a list of columns to parse for dates; If True, then try to parse date-like columns, default is True -- ``keep_default_dates`` : boolean, default True. If parsing dates, then parse the default date-like columns -- ``numpy`` : direct decoding to NumPy arrays. default is False; - Supports numeric data only, although labels may be non-numeric. Also note that the JSON ordering **MUST** be the same for each term if ``numpy=True`` -- ``precise_float`` : boolean, default ``False``. Set to enable usage of higher precision (strtod) function when decoding string to double values. Default (``False``) is to use fast but less precise builtin functionality +- ``dtype`` : if True, infer dtypes, if a dict of column to dtype, then use those, if ``False``, then don't infer dtypes at all, default is True, apply only to the data. +- ``convert_axes`` : boolean, try to convert the axes to the proper dtypes, default is ``True`` +- ``convert_dates`` : a list of columns to parse for dates; If ``True``, then try to parse date-like columns, default is ``True``. +- ``keep_default_dates`` : boolean, default ``True``. If parsing dates, then parse the default date-like columns. +- ``numpy`` : direct decoding to NumPy arrays. default is ``False``; + Supports numeric data only, although labels may be non-numeric. Also note that the JSON ordering **MUST** be the same for each term if ``numpy=True``. +- ``precise_float`` : boolean, default ``False``. Set to enable usage of higher precision (strtod) function when decoding string to double values. Default (``False``) is to use fast but less precise builtin functionality. - ``date_unit`` : string, the timestamp unit to detect if converting dates. Default None. By default the timestamp precision will be detected, if this is not desired then pass one of 's', 'ms', 'us' or 'ns' to force timestamp precision to @@ -1888,9 +1902,11 @@ overview. Data Conversion +++++++++++++++ -The default of ``convert_axes=True``, ``dtype=True``, and ``convert_dates=True`` will try to parse the axes, and all of the data -into appropriate types, including dates. If you need to override specific dtypes, pass a dict to ``dtype``. ``convert_axes`` should only -be set to ``False`` if you need to preserve string-like numbers (e.g. '1', '2') in an axes. +The default of ``convert_axes=True``, ``dtype=True``, and ``convert_dates=True`` +will try to parse the axes, and all of the data into appropriate types, +including dates. If you need to override specific dtypes, pass a dict to +``dtype``. ``convert_axes`` should only be set to ``False`` if you need to +preserve string-like numbers (e.g. '1', '2') in an axes. .. note:: @@ -2175,7 +2191,7 @@ A few notes on the generated table schema: - Periods are converted to timestamps before serialization, and so have the same behavior of being converted to UTC. In addition, periods will contain - and additional field ``freq`` with the period's frequency, e.g. ``'A-DEC'`` + and additional field ``freq`` with the period's frequency, e.g. ``'A-DEC'``. .. 
ipython:: python @@ -2184,7 +2200,7 @@ A few notes on the generated table schema: build_table_schema(s_per) - Categoricals use the ``any`` type and an ``enum`` constraint listing - the set of possible values. Additionally, an ``ordered`` field is included + the set of possible values. Additionally, an ``ordered`` field is included: .. ipython:: python @@ -2212,7 +2228,7 @@ A few notes on the generated table schema: + For series, the ``object.name`` is used. If that's none, then the name is ``values`` - + For DataFrames, the stringified version of the column name is used + + For ``DataFrames``, the stringified version of the column name is used + For ``Index`` (not ``MultiIndex``), ``index.name`` is used, with a fallback to ``index`` if that is None. + For ``MultiIndex``, ``mi.names`` is used. If any level has no name, @@ -2268,15 +2284,15 @@ Reading HTML Content below regarding the issues surrounding the BeautifulSoup4/html5lib/lxml parsers. The top-level :func:`~pandas.io.html.read_html` function can accept an HTML -string/file/URL and will parse HTML tables into list of pandas DataFrames. +string/file/URL and will parse HTML tables into list of pandas ``DataFrames``. Let's look at a few examples. .. note:: ``read_html`` returns a ``list`` of ``DataFrame`` objects, even if there is - only a single table contained in the HTML content + only a single table contained in the HTML content. -Read a URL with no options +Read a URL with no options: .. ipython:: python @@ -2290,7 +2306,7 @@ Read a URL with no options and the data below may be slightly different. Read in the content of the file from the above URL and pass it to ``read_html`` -as a string +as a string: .. ipython:: python :suppress: @@ -2304,7 +2320,7 @@ as a string dfs = pd.read_html(f.read()) dfs -You can even pass in an instance of ``StringIO`` if you so desire +You can even pass in an instance of ``StringIO`` if you so desire: .. ipython:: python @@ -2323,7 +2339,7 @@ You can even pass in an instance of ``StringIO`` if you so desire `__. -Read a URL and match a table that contains specific text +Read a URL and match a table that contains specific text: .. code-block:: python @@ -2339,26 +2355,26 @@ from the data minus the parsed header elements (``
{val}{val}`` elements). dfs = pd.read_html(url, header=0) -Specify an index column +Specify an index column: .. code-block:: python dfs = pd.read_html(url, index_col=0) -Specify a number of rows to skip +Specify a number of rows to skip: .. code-block:: python dfs = pd.read_html(url, skiprows=0) Specify a number of rows to skip using a list (``xrange`` (Python 2 only) works -as well) +as well): .. code-block:: python dfs = pd.read_html(url, skiprows=range(2)) -Specify an HTML attribute +Specify an HTML attribute: .. code-block:: python @@ -2366,7 +2382,7 @@ Specify an HTML attribute dfs2 = pd.read_html(url, attrs={'class': 'sortable'}) print(np.array_equal(dfs1[0], dfs2[0])) # Should be True -Specify values that should be converted to NaN +Specify values that should be converted to NaN: .. code-block:: python @@ -2374,7 +2390,7 @@ Specify values that should be converted to NaN .. versionadded:: 0.19 -Specify whether to keep the default set of NaN values +Specify whether to keep the default set of NaN values: .. code-block:: python @@ -2384,7 +2400,7 @@ Specify whether to keep the default set of NaN values Specify converters for columns. This is useful for numerical text data that has leading zeros. By default columns that are numerical are cast to numeric -types and the leading zeros are lost. To avoid this, we can convert these +types and the leading zeros are lost. To avoid this, we can convert these columns to strings. .. code-block:: python @@ -2395,13 +2411,13 @@ columns to strings. .. versionadded:: 0.19 -Use some combination of the above +Use some combination of the above: .. code-block:: python dfs = pd.read_html(url, match='Metcalf Bank', index_col=0) -Read in pandas ``to_html`` output (with some loss of floating point precision) +Read in pandas ``to_html`` output (with some loss of floating point precision): .. code-block:: python @@ -2410,15 +2426,15 @@ Read in pandas ``to_html`` output (with some loss of floating point precision) dfin = pd.read_html(s, index_col=0) The ``lxml`` backend will raise an error on a failed parse if that is the only -parser you provide (if you only have a single parser you can provide just a +parser you provide. If you only have a single parser you can provide just a string, but it is considered good practice to pass a list with one string if, -for example, the function expects a sequence of strings) +for example, the function expects a sequence of strings. You may use: .. code-block:: python dfs = pd.read_html(url, 'Metcalf Bank', index_col=0, flavor=['lxml']) -or +Or you could pass ``flavor='lxml'`` without a list: .. code-block:: python @@ -2472,7 +2488,7 @@ HTML: .. raw:: html :file: _static/basic.html -The ``columns`` argument will limit the columns shown +The ``columns`` argument will limit the columns shown: .. ipython:: python @@ -2489,7 +2505,7 @@ HTML: :file: _static/columns.html ``float_format`` takes a Python callable to control the precision of floating -point values +point values: .. ipython:: python @@ -2506,7 +2522,7 @@ HTML: :file: _static/float_format.html ``bold_rows`` will make the row labels bold by default, but you can turn that -off +off: .. ipython:: python @@ -2579,7 +2595,7 @@ parse HTML tables in the top-level pandas io function ``read_html``. * Benefits - * |lxml|_ is very fast + * |lxml|_ is very fast. * |lxml|_ requires Cython to install correctly. @@ -2652,8 +2668,8 @@ The :func:`~pandas.read_excel` method can read Excel 2003 (``.xls``) and Excel 2007+ (``.xlsx``) files using the ``xlrd`` Python module. 
The :meth:`~DataFrame.to_excel` instance method is used for saving a ``DataFrame`` to Excel. Generally the semantics are -similar to working with :ref:`csv` data. See the :ref:`cookbook` for some -advanced strategies +similar to working with :ref:`csv` data. +See the :ref:`cookbook` for some advanced strategies. .. _io.excel_reader: @@ -2696,7 +2712,7 @@ The ``sheet_names`` property will generate a list of the sheet names in the file. The primary use-case for an ``ExcelFile`` is parsing multiple sheets with -different parameters +different parameters: .. code-block:: python @@ -2725,7 +2741,7 @@ of sheet names can simply be passed to ``read_excel`` with no loss in performanc Specifying Sheets +++++++++++++++++ -.. note :: The second argument is ``sheet_name``, not to be confused with ``ExcelFile.sheet_names`` +.. note :: The second argument is ``sheet_name``, not to be confused with ``ExcelFile.sheet_names``. .. note :: An ExcelFile's attribute ``sheet_names`` provides access to a list of sheets. @@ -2802,12 +2818,12 @@ parameters. df.index = df.index.set_names(['lvl1', 'lvl2']) df.to_excel('path_to_file.xlsx') - df = pd.read_excel('path_to_file.xlsx', index_col=[0,1]) + df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1]) df If the source file has both ``MultiIndex`` index and columns, lists specifying each -should be passed to ``index_col`` and ``header`` +should be passed to ``index_col`` and ``header``: .. ipython:: python @@ -2828,10 +2844,10 @@ Parsing Specific Columns ++++++++++++++++++++++++ It is often the case that users will insert columns to do temporary computations -in Excel and you may not want to read in those columns. `read_excel` takes -a `usecols` keyword to allow you to specify a subset of columns to parse. +in Excel and you may not want to read in those columns. ``read_excel`` takes +a ``usecols`` keyword to allow you to specify a subset of columns to parse. -If `usecols` is an integer, then it is assumed to indicate the last column +If ``usecols`` is an integer, then it is assumed to indicate the last column to be parsed. .. code-block:: python @@ -2840,11 +2856,12 @@ to be parsed. If `usecols` is a list of integers, then it is assumed to be the file column indices to be parsed. + .. code-block:: python read_excel('path_to_file.xls', 'Sheet1', usecols=[0, 2, 3]) -Element order is ignored, so usecols=[0,1] is the same as [1,0]. +Element order is ignored, so ``usecols=[0,1]`` is the same as ``[1,0]``. Parsing Dates +++++++++++++ @@ -2852,7 +2869,7 @@ Parsing Dates Datetime-like values are normally automatically converted to the appropriate dtype when reading the excel file. But if you have a column of strings that *look* like dates (but are not actually formatted as dates in excel), you can -use the `parse_dates` keyword to parse those strings to datetimes: +use the ``parse_dates`` keyword to parse those strings to datetimes: .. code-block:: python @@ -2862,7 +2879,7 @@ use the `parse_dates` keyword to parse those strings to datetimes: Cell Converters +++++++++++++++ -It is possible to transform the contents of Excel cells via the `converters` +It is possible to transform the contents of Excel cells via the ``converters`` option. For instance, to convert a column to boolean: .. 
code-block:: python @@ -2903,11 +2920,11 @@ Writing Excel Files Writing Excel Files to Disk +++++++++++++++++++++++++++ -To write a DataFrame object to a sheet of an Excel file, you can use the +To write a ``DataFrame`` object to a sheet of an Excel file, you can use the ``to_excel`` instance method. The arguments are largely the same as ``to_csv`` described above, the first argument being the name of the excel file, and the -optional second argument the name of the sheet to which the DataFrame should be -written. For example: +optional second argument the name of the sheet to which the ``DataFrame`` should be +written. For example: .. code-block:: python @@ -2917,7 +2934,7 @@ Files with a ``.xls`` extension will be written using ``xlwt`` and those with a ``.xlsx`` extension will be written using ``xlsxwriter`` (if available) or ``openpyxl``. -The DataFrame will be written in a way that tries to mimic the REPL output. +The ``DataFrame`` will be written in a way that tries to mimic the REPL output. The ``index_label`` will be placed in the second row instead of the first. You can place it in the first row by setting the ``merge_cells`` option in ``to_excel()`` to ``False``: @@ -2926,10 +2943,7 @@ row instead of the first. You can place it in the first row by setting the df.to_excel('path_to_file.xlsx', index_label='label', merge_cells=False) -The Panel class also has a ``to_excel`` instance method, -which writes each DataFrame in the Panel to a separate sheet. - -In order to write separate DataFrames to separate sheets in a single Excel file, +In order to write separate ``DataFrames`` to separate sheets in a single Excel file, one can pass an :class:`~pandas.io.excel.ExcelWriter`. .. code-block:: python @@ -2990,13 +3004,13 @@ Pandas supports writing Excel files to buffer-like objects such as ``StringIO`` Excel writer engines '''''''''''''''''''' -``pandas`` chooses an Excel writer via two methods: +Pandas chooses an Excel writer via two methods: 1. the ``engine`` keyword argument 2. the filename extension (via the default specified in config options) -By default, ``pandas`` uses the `XlsxWriter`_ for ``.xlsx`` and `openpyxl`_ -for ``.xlsm`` files and `xlwt`_ for ``.xls`` files. If you have multiple +By default, pandas uses the `XlsxWriter`_ for ``.xlsx``, `openpyxl`_ +for ``.xlsm``, and `xlwt`_ for ``.xls`` files. If you have multiple engines installed, you can set the default engine through :ref:`setting the config options ` ``io.excel.xlsx.writer`` and ``io.excel.xls.writer``. pandas will fall back on `openpyxl`_ for ``.xlsx`` @@ -3034,8 +3048,8 @@ Style and Formatting The look and feel of Excel worksheets created from pandas can be modified using the following parameters on the ``DataFrame``'s ``to_excel`` method. -- ``float_format`` : Format string for floating point numbers (default None) -- ``freeze_panes`` : A tuple of two integers representing the bottommost row and rightmost column to freeze. Each of these parameters is one-based, so (1, 1) will freeze the first row and first column (default None) +- ``float_format`` : Format string for floating point numbers (default ``None``). +- ``freeze_panes`` : A tuple of two integers representing the bottommost row and rightmost column to freeze. Each of these parameters is one-based, so (1, 1) will freeze the first row and first column (default ``None``). 
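For an existing ``DataFrame`` ``df``, a minimal sketch combining the two options
above (``'path_to_file.xlsx'`` is the usual placeholder name):

.. code-block:: python

   # '%.2f' writes floats with two decimal places; (1, 1) freezes the
   # header row and the first column
   df.to_excel('path_to_file.xlsx', float_format='%.2f', freeze_panes=(1, 1))
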
@@ -3044,10 +3058,10 @@ The look and feel of Excel worksheets created from pandas can be modified using Clipboard --------- -A handy way to grab data is to use the ``read_clipboard`` method, which takes -the contents of the clipboard buffer and passes them to the ``read_table`` -method. For instance, you can copy the following -text to the clipboard (CTRL-C on many operating systems): +A handy way to grab data is to use the :meth:`~DataFrame.read_clipboard` method, +which takes the contents of the clipboard buffer and passes them to the +``read_table`` method. For instance, you can copy the following text to the +clipboard (CTRL-C on many operating systems): .. code-block:: python @@ -3056,7 +3070,7 @@ text to the clipboard (CTRL-C on many operating systems): y 2 5 q z 3 6 r -And then import the data directly to a DataFrame by calling: +And then import the data directly to a ``DataFrame`` by calling: .. code-block:: python @@ -3066,10 +3080,11 @@ And then import the data directly to a DataFrame by calling: clipdf -The ``to_clipboard`` method can be used to write the contents of a DataFrame to + +The ``to_clipboard`` method can be used to write the contents of a ``DataFrame`` to the clipboard. Following which you can paste the clipboard contents into other applications (CTRL-V on many operating systems). Here we illustrate writing a -DataFrame into clipboard and reading it back. +``DataFrame`` into clipboard and reading it back. .. ipython:: python @@ -3121,7 +3136,7 @@ any pickled pandas object (or any other pickled object) from file: Several internal refactorings have been done while still preserving compatibility with pickles created with older versions of pandas. However, - for such cases, pickled dataframes, series etc, must be read with + for such cases, pickled ``DataFrames``, ``Series`` etc, must be read with ``pd.read_pickle``, rather than ``pickle.load``. See `here `__ @@ -3139,8 +3154,8 @@ Compressed pickle files :func:`read_pickle`, :meth:`DataFrame.to_pickle` and :meth:`Series.to_pickle` can read and write compressed pickle files. The compression types of ``gzip``, ``bz2``, ``xz`` are supported for reading and writing. -`zip`` file supports read only and must contain only one data file -to be read in. +The ``zip`` file format only supports reading and must contain only one data file +to be read. The compression type can be an explicit parameter or be inferred from the file extension. If 'infer', then use ``gzip``, ``bz2``, ``zip``, or ``xz`` if filename ends in ``'.gz'``, ``'.bz2'``, ``'.zip'``, or @@ -3154,7 +3169,7 @@ If 'infer', then use ``gzip``, ``bz2``, ``zip``, or ``xz`` if filename ends in ` 'C': pd.date_range('20130101', periods=1000, freq='s')}) df -Using an explicit compression type +Using an explicit compression type: .. ipython:: python @@ -3162,7 +3177,7 @@ Using an explicit compression type rt = pd.read_pickle("data.pkl.compress", compression="gzip") rt -Inferring compression type from the extension +Inferring compression type from the extension: .. ipython:: python @@ -3170,7 +3185,7 @@ Inferring compression type from the extension rt = pd.read_pickle("data.pkl.xz", compression="infer") rt -The default is to 'infer +The default is to 'infer': .. 
ipython:: python @@ -3221,14 +3236,14 @@ You can pass a list of objects and you will receive them back on deserialization pd.to_msgpack('foo.msg', df, 'foo', np.array([1,2,3]), s) pd.read_msgpack('foo.msg') -You can pass ``iterator=True`` to iterate over the unpacked results +You can pass ``iterator=True`` to iterate over the unpacked results: .. ipython:: python for o in pd.read_msgpack('foo.msg',iterator=True): print(o) -You can pass ``append=True`` to the writer to append to an existing pack +You can pass ``append=True`` to the writer to append to an existing pack: .. ipython:: python @@ -3331,7 +3346,7 @@ In a current or later Python session, you can retrieve stored objects: # dotted (attribute) access provides get as well store.df -Deletion of the object specified by the key +Deletion of the object specified by the key: .. ipython:: python @@ -3340,7 +3355,7 @@ Deletion of the object specified by the key store -Closing a Store, Context Manager +Closing a Store and using a context manager: .. ipython:: python @@ -3348,8 +3363,7 @@ Closing a Store, Context Manager store store.is_open - # Working with, and automatically closing the store with the context - # manager + # Working with, and automatically closing the store using a context manager with pd.HDFStore('store.h5') as store: store.keys() @@ -3449,17 +3463,17 @@ the ``fixed`` format. These types of stores are **not** appendable once written remove them and rewrite). Nor are they **queryable**; they must be retrieved in their entirety. They also do not support dataframes with non-unique column names. The ``fixed`` format stores offer very fast writing and slightly faster reading than ``table`` stores. -This format is specified by default when using ``put`` or ``to_hdf`` or by ``format='fixed'`` or ``format='f'`` +This format is specified by default when using ``put`` or ``to_hdf`` or by ``format='fixed'`` or ``format='f'``. .. warning:: - A ``fixed`` format will raise a ``TypeError`` if you try to retrieve using a ``where`` . + A ``fixed`` format will raise a ``TypeError`` if you try to retrieve using a ``where``: .. code-block:: python - pd.DataFrame(randn(10,2)).to_hdf('test_fixed.h5','df') + pd.DataFrame(randn(10, 2)).to_hdf('test_fixed.h5', 'df') - pd.read_hdf('test_fixed.h5','df',where='index>5') + pd.read_hdf('test_fixed.h5', 'df', where='index>5') TypeError: cannot pass a where specification when reading a fixed format. this store must be selected in its entirety @@ -3472,9 +3486,9 @@ Table Format ``HDFStore`` supports another ``PyTables`` format on disk, the ``table`` format. Conceptually a ``table`` is shaped very much like a DataFrame, with rows and columns. A ``table`` may be appended to in the same or -other sessions. In addition, delete & query type operations are +other sessions. In addition, delete and query type operations are supported. This format is specified by ``format='table'`` or ``format='t'`` -to ``append`` or ``put`` or ``to_hdf`` +to ``append`` or ``put`` or ``to_hdf``. This format can be set as an option as well ``pd.set_option('io.hdf.default_format','table')`` to enable ``put/append/to_hdf`` to by default store in the ``table`` format. @@ -3514,9 +3528,9 @@ Hierarchical Keys Keys to a store can be specified as a string. These can be in a hierarchical path-name like format (e.g. ``foo/bar/bah``), which will generate a hierarchy of sub-stores (or ``Groups`` in PyTables -parlance). Keys can be specified with out the leading '/' and are ALWAYS +parlance). 
Keys can be specified with out the leading '/' and are **always** absolute (e.g. 'foo' refers to '/foo'). Removal operations can remove -everything in the sub-store and BELOW, so be *careful*. +everything in the sub-store and **below**, so be *careful*. .. ipython:: python @@ -3547,7 +3561,7 @@ everything in the sub-store and BELOW, so be *careful*. /foo/bar/bah (Group) '' children := ['block0_items' (Array), 'block0_values' (Array), 'axis0' (Array), 'axis1' (Array)] - Instead, use explicit string based keys + Instead, use explicit string based keys: .. ipython:: python @@ -3596,8 +3610,8 @@ defaults to `nan`. Storing Multi-Index DataFrames ++++++++++++++++++++++++++++++ -Storing multi-index dataframes as tables is very similar to -storing/selecting from homogeneous index DataFrames. +Storing multi-index ``DataFrames`` as tables is very similar to +storing/selecting from homogeneous index ``DataFrames``. .. ipython:: python @@ -3632,10 +3646,10 @@ data. A query is specified using the ``Term`` class under the hood, as a boolean expression. -- ``index`` and ``columns`` are supported indexers of a DataFrame +- ``index`` and ``columns`` are supported indexers of a ``DataFrames``. - ``major_axis``, ``minor_axis``, and ``items`` are supported indexers of - the Panel -- if ``data_columns`` are specified, these can be used as additional indexers + the Panel. +- if ``data_columns`` are specified, these can be used as additional indexers. Valid comparison operators are: @@ -3849,7 +3863,7 @@ to perform queries (other than the `indexable` columns, which you can always query). For instance say you want to perform this common operation, on-disk, and return just the frame that matches this query. You can specify ``data_columns = True`` to force all columns to -be data_columns +be ``data_columns``. .. ipython:: python @@ -3879,7 +3893,7 @@ There is some performance degradation by making lots of columns into `data columns`, so it is up to the user to designate these. In addition, you cannot change data columns (nor indexables) after the first append/put operation (Of course you can simply read in the data and -create a new table!) +create a new table!). Iterator ++++++++ @@ -3912,7 +3926,7 @@ chunks. .. ipython:: python - dfeq = pd.DataFrame({'number': np.arange(1,11)}) + dfeq = pd.DataFrame({'number': np.arange(1, 11)}) dfeq store.append('dfeq', dfeq, data_columns=['number']) @@ -3921,9 +3935,9 @@ chunks. return [l[i:i+n] for i in range(0, len(l), n)] evens = [2,4,6,8,10] - coordinates = store.select_as_coordinates('dfeq','number=evens') + coordinates = store.select_as_coordinates('dfeq', 'number=evens') for c in chunks(coordinates, 2): - print(store.select('dfeq',where=c)) + print(store.select('dfeq', where=c)) Advanced Queries ++++++++++++++++ @@ -4005,7 +4019,7 @@ table names to a list of 'columns' you want in that table. If `None` is used in place of a list, that table will have the remaining unspecified columns of the given DataFrame. The argument ``selector`` defines which table is the selector table (which you can make queries from). -The argument ``dropna`` will drop rows from the input DataFrame to ensure +The argument ``dropna`` will drop rows from the input ``DataFrame`` to ensure tables are synchronized. This means that if a row for one of the tables being written to is entirely ``np.NaN``, that row will be dropped from all tables. @@ -4081,7 +4095,7 @@ the table using a ``where`` that selects all but the missing data. automatically. 
Thus, repeatedly deleting (or removing nodes) and adding again, **WILL TEND TO INCREASE THE FILE SIZE**. - To *repack and clean* the file, use :ref:`ptrepack ` + To *repack and clean* the file, use :ref:`ptrepack `. .. _io.hdf5-notes: @@ -4464,7 +4478,7 @@ Several caveats. - Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message on an attempt at serialization. -See the `Full Documentation `__ +See the `Full Documentation `__. .. ipython:: python @@ -4522,8 +4536,8 @@ dtypes, including extension dtypes such as datetime with tz. Several caveats. -- Duplicate column names and non-string columns names are not supported -- Index level names, if specified, must be strings +- Duplicate column names and non-string columns names are not supported. +- Index level names, if specified, must be strings. - Categorical dtypes can be serialized to parquet, but will de-serialize as ``object`` dtype. - Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message on an attempt at serialization. @@ -4532,7 +4546,7 @@ You can specify an ``engine`` to direct the serialization. This can be one of `` If the engine is NOT specified, then the ``pd.options.io.parquet.engine`` option is checked; if this is also ``auto``, then ``pyarrow`` is tried, and falling back to ``fastparquet``. -See the documentation for `pyarrow `__ and `fastparquet `__ +See the documentation for `pyarrow `__ and `fastparquet `__. .. note:: @@ -4652,7 +4666,7 @@ If you want to manage your own connections you can pass one of those instead: Writing DataFrames '''''''''''''''''' -Assuming the following data is in a DataFrame ``data``, we can insert it into +Assuming the following data is in a ``DataFrame`` ``data``, we can insert it into the database using :func:`~pandas.DataFrame.to_sql`. +-----+------------+-------+-------+-------+ @@ -4738,7 +4752,7 @@ table name and optionally a subset of columns to read. pd.read_sql_table('data', engine) -You can also specify the name of the column as the DataFrame index, +You can also specify the name of the column as the ``DataFrame`` index, and specify a subset of columns to be read. .. ipython:: python @@ -4807,7 +4821,7 @@ Specifying this will return an iterator through chunks of the query result: for chunk in pd.read_sql_query("SELECT * FROM data_chunks", engine, chunksize=5): print(chunk) -You can also run a plain query without creating a dataframe with +You can also run a plain query without creating a ``DataFrame`` with :func:`~pandas.io.sql.execute`. This is useful for queries that don't return values, such as INSERT. This is functionally equivalent to calling ``execute`` on the SQLAlchemy engine or db connection object. Again, you must use the SQL syntax @@ -4923,7 +4937,7 @@ pandas integrates with this external package. if ``pandas-gbq`` is installed, yo use the pandas methods ``pd.read_gbq`` and ``DataFrame.to_gbq``, which will call the respective functions from ``pandas-gbq``. -Full documentation can be found `here `__ +Full documentation can be found `here `__. .. _io.stata: @@ -4986,7 +5000,7 @@ Reading from Stata format ''''''''''''''''''''''''' The top-level function ``read_stata`` will read a dta file and return -either a DataFrame or a :class:`~pandas.io.stata.StataReader` that can +either a ``DataFrame`` or a :class:`~pandas.io.stata.StataReader` that can be used to read the file incrementally. .. 
ipython:: python @@ -5084,7 +5098,7 @@ whether imported ``Categorical`` variables are ordered. .. note:: - *Stata* supports partially labeled series. These series have value labels for + *Stata* supports partially labeled series. These series have value labels for some but not all data values. Importing a partially labeled series will produce a ``Categorical`` with string categories for the values that are labeled and numeric categories for values with no label. @@ -5144,7 +5158,7 @@ into and from pandas, we recommend these packages from the broader community. netCDF '''''' -xarray_ provides data structures inspired by the pandas DataFrame for working +xarray_ provides data structures inspired by the pandas ``DataFrame`` for working with multi-dimensional datasets, with a focus on the netCDF file format and easy conversion to and from pandas. @@ -5173,7 +5187,8 @@ ignored. dtypes: float64(1), int64(1) memory usage: 15.3 MB -Writing +When writing, the top-three functions in terms of speed are are +``test_pickle_write``, ``test_feather_write`` and ``test_hdf_fixed_write_compress``. .. code-block:: ipython @@ -5204,7 +5219,8 @@ Writing In [32]: %timeit test_pickle_write_compress(df) 3.33 s ± 55.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) -Reading +When reading, the top three are ``test_feather_read``, ``test_pickle_read`` and +``test_hdf_fixed_read``. .. code-block:: ipython @@ -5249,7 +5265,7 @@ Space on disk (in bytes) 16000848 Aug 21 18:00 test.pkl 7554108 Aug 21 18:00 test.pkl.compress -And here's the code +And here's the code: .. code-block:: python From 11de13140f66b7f04ded53f7890738061512df3b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 14 Feb 2018 13:56:51 -0600 Subject: [PATCH 112/214] CI: Move conda build and ASV check to cron job (#19698) * CI: Remove ASV run * CI: Removed conda build test * Removed asv.sh step * Removed the ASV matrix item --- .travis.yml | 12 ++----- ci/asv.sh | 35 ------------------- ci/install_travis.sh | 17 +-------- ...UILD_TEST.build => requirements-3.5.build} | 0 ...DA_BUILD_TEST.pip => requirements-3.5.pip} | 0 ...DA_BUILD_TEST.run => requirements-3.5.run} | 0 ...ONDA_BUILD_TEST.sh => requirements-3.5.sh} | 2 +- ci/requirements-3.6_ASV.build | 5 --- ci/requirements-3.6_ASV.run | 25 ------------- ci/requirements-3.6_ASV.sh | 7 ---- ci/script_multi.sh | 5 +-- ci/script_single.sh | 5 +-- 12 files changed, 6 insertions(+), 107 deletions(-) delete mode 100755 ci/asv.sh rename ci/{requirements-3.5_CONDA_BUILD_TEST.build => requirements-3.5.build} (100%) rename ci/{requirements-3.5_CONDA_BUILD_TEST.pip => requirements-3.5.pip} (100%) rename ci/{requirements-3.5_CONDA_BUILD_TEST.run => requirements-3.5.run} (100%) rename ci/{requirements-3.5_CONDA_BUILD_TEST.sh => requirements-3.5.sh} (86%) delete mode 100644 ci/requirements-3.6_ASV.build delete mode 100644 ci/requirements-3.6_ASV.run delete mode 100755 ci/requirements-3.6_ASV.sh diff --git a/.travis.yml b/.travis.yml index 4cbe7f86bd2fa..b1168f18315c3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -52,7 +52,7 @@ matrix: # In allow_failures - dist: trusty env: - - JOB="3.5_CONDA_BUILD_TEST" TEST_ARGS="--skip-slow --skip-network" CONDA_BUILD_TEST=true + - JOB="3.5" TEST_ARGS="--skip-slow --skip-network" - dist: trusty env: - JOB="3.6" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" CONDA_FORGE=true COVERAGE=true @@ -73,17 +73,13 @@ matrix: env: - JOB="3.6_NUMPY_DEV" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" # In allow_failures - - dist: 
trusty - env: - - JOB="3.6_ASV" ASV=true - # In allow_failures - dist: trusty env: - JOB="3.6_DOC" DOC=true allow_failures: - dist: trusty env: - - JOB="3.5_CONDA_BUILD_TEST" TEST_ARGS="--skip-slow --skip-network" CONDA_BUILD_TEST=true + - JOB="3.5" TEST_ARGS="--skip-slow --skip-network" - dist: trusty env: - JOB="2.7_SLOW" SLOW=true @@ -97,9 +93,6 @@ matrix: - dist: trusty env: - JOB="3.6_NUMPY_DEV" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" - - dist: trusty - env: - - JOB="3.6_ASV" ASV=true - dist: trusty env: - JOB="3.6_DOC" DOC=true @@ -135,7 +128,6 @@ script: - ci/script_single.sh - ci/script_multi.sh - ci/lint.sh - - ci/asv.sh - echo "checking imports" - source activate pandas && python ci/check_imports.py - echo "script done" diff --git a/ci/asv.sh b/ci/asv.sh deleted file mode 100755 index 1e9a8d6380eb5..0000000000000 --- a/ci/asv.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash - -echo "inside $0" - -source activate pandas - -RET=0 - -if [ "$ASV" ]; then - echo "Check for failed asv benchmarks" - - cd asv_bench - - asv machine --yes - - time asv dev | tee failed_asv.txt - - echo "The following asvs benchmarks (if any) failed." - - cat failed_asv.txt | grep "failed" failed_asv.txt - - if [ $? = "0" ]; then - RET=1 - fi - - echo "DONE displaying failed asvs benchmarks." - - rm failed_asv.txt - - echo "Check for failed asv benchmarks DONE" -else - echo "NOT checking for failed asv benchmarks" -fi - -exit $RET diff --git a/ci/install_travis.sh b/ci/install_travis.sh index 6e270519e60c3..458ff083b65eb 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -50,12 +50,6 @@ conda config --set ssl_verify false || exit 1 conda config --set quiet true --set always_yes true --set changeps1 false || exit 1 conda update -q conda -if [ "$CONDA_BUILD_TEST" ]; then - echo - echo "[installing conda-build]" - conda install conda-build -fi - echo echo "[add channels]" conda config --remove channels defaults || exit 1 @@ -122,7 +116,7 @@ if [ "$COVERAGE" ]; then fi echo -if [ -z "$PIP_BUILD_TEST" ] && [ -z "$CONDA_BUILD_TEST" ]; then +if [ -z "$PIP_BUILD_TEST" ] ; then # build but don't install echo "[build em]" @@ -177,15 +171,6 @@ if [ "$PIP_BUILD_TEST" ]; then conda uninstall -y cython time pip install dist/*tar.gz || exit 1 -elif [ "$CONDA_BUILD_TEST" ]; then - - # build & install testing - echo "[building conda recipe]" - time conda build ./conda.recipe --python 3.5 -q --no-test || exit 1 - - echo "[installing]" - conda install pandas --use-local || exit 1 - else # install our pandas diff --git a/ci/requirements-3.5_CONDA_BUILD_TEST.build b/ci/requirements-3.5.build similarity index 100% rename from ci/requirements-3.5_CONDA_BUILD_TEST.build rename to ci/requirements-3.5.build diff --git a/ci/requirements-3.5_CONDA_BUILD_TEST.pip b/ci/requirements-3.5.pip similarity index 100% rename from ci/requirements-3.5_CONDA_BUILD_TEST.pip rename to ci/requirements-3.5.pip diff --git a/ci/requirements-3.5_CONDA_BUILD_TEST.run b/ci/requirements-3.5.run similarity index 100% rename from ci/requirements-3.5_CONDA_BUILD_TEST.run rename to ci/requirements-3.5.run diff --git a/ci/requirements-3.5_CONDA_BUILD_TEST.sh b/ci/requirements-3.5.sh similarity index 86% rename from ci/requirements-3.5_CONDA_BUILD_TEST.sh rename to ci/requirements-3.5.sh index 093fdbcf21d78..529e1e8742722 100644 --- a/ci/requirements-3.5_CONDA_BUILD_TEST.sh +++ b/ci/requirements-3.5.sh @@ -2,7 +2,7 @@ source activate pandas -echo "install 35 CONDA_BUILD_TEST" +echo "install 35" # pip install 
python-dateutil to get latest conda remove -n pandas python-dateutil --force diff --git a/ci/requirements-3.6_ASV.build b/ci/requirements-3.6_ASV.build deleted file mode 100644 index bc72eed2a0d4e..0000000000000 --- a/ci/requirements-3.6_ASV.build +++ /dev/null @@ -1,5 +0,0 @@ -python=3.6* -python-dateutil -pytz -numpy=1.13* -cython diff --git a/ci/requirements-3.6_ASV.run b/ci/requirements-3.6_ASV.run deleted file mode 100644 index 6c45e3371e9cf..0000000000000 --- a/ci/requirements-3.6_ASV.run +++ /dev/null @@ -1,25 +0,0 @@ -ipython -ipykernel -ipywidgets -sphinx=1.5* -nbconvert -nbformat -notebook -matplotlib -seaborn -scipy -lxml -beautifulsoup4 -html5lib -pytables -python-snappy -openpyxl -xlrd -xlwt -xlsxwriter -sqlalchemy -numexpr -bottleneck -statsmodels -xarray -pyqt diff --git a/ci/requirements-3.6_ASV.sh b/ci/requirements-3.6_ASV.sh deleted file mode 100755 index 8a46f85dbb6bc..0000000000000 --- a/ci/requirements-3.6_ASV.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -source activate pandas - -echo "[install ASV_BUILD deps]" - -pip install git+https://github.com/spacetelescope/asv diff --git a/ci/script_multi.sh b/ci/script_multi.sh index 766e51625fbe6..6c354fc4cab0b 100755 --- a/ci/script_multi.sh +++ b/ci/script_multi.sh @@ -18,7 +18,7 @@ fi export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') echo PYTHONHASHSEED=$PYTHONHASHSEED -if [ "$PIP_BUILD_TEST" ] || [ "$CONDA_BUILD_TEST" ]; then +if [ "$PIP_BUILD_TEST" ] ; then echo "[build-test]" echo "[env]" @@ -37,9 +37,6 @@ if [ "$PIP_BUILD_TEST" ] || [ "$CONDA_BUILD_TEST" ]; then elif [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" -elif [ "$ASV" ]; then - echo "We are not running pytest as this is an asv-build" - elif [ "$COVERAGE" ]; then echo pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas diff --git a/ci/script_single.sh b/ci/script_single.sh index 153847ab2e8c9..74b0e897f1d73 100755 --- a/ci/script_single.sh +++ b/ci/script_single.sh @@ -16,15 +16,12 @@ if [ "$SLOW" ]; then TEST_ARGS="--only-slow --skip-network" fi -if [ "$PIP_BUILD_TEST" ] || [ "$CONDA_BUILD_TEST" ]; then +if [ "$PIP_BUILD_TEST" ]; then echo "We are not running pytest as this is a build test." 
elif [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" -elif [ "$ASV" ]; then - echo "We are not running pytest as this is an asv-build" - elif [ "$COVERAGE" ]; then echo pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas From 6cd42ebf269436baa49159d24d2610d9506b50b6 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 14 Feb 2018 15:12:14 -0800 Subject: [PATCH 113/214] GroupBy Rank SegFault Fix - astype instead of view (#19701) * Use astype instead of view for lexsort upcasting * Added copy=False to astype in group_rank --- pandas/_libs/groupby_helper.pxi.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 1d77a373bb7dd..fe4d31516d839 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -531,7 +531,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, # each label corresponds to a different group value, # the mask helps you differentiate missing values before # performing sort on the actual values - _as = np.lexsort(order).view(dtype=np.int64) + _as = np.lexsort(order).astype(np.int64, copy=False) if not ascending: _as = _as[::-1] From db55f4786f0888f19c23ad8c03b791e0ef69ffa1 Mon Sep 17 00:00:00 2001 From: Gilberto Olimpio Date: Thu, 15 Feb 2018 06:17:31 -0200 Subject: [PATCH 114/214] DOC: Ambiguous description in to_parquet engine documentation (#19669) --- pandas/core/frame.py | 7 ++++--- pandas/io/parquet.py | 14 ++++++++------ 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bc045d74cee52..a001037b573d4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1678,9 +1678,10 @@ def to_parquet(self, fname, engine='auto', compression='snappy', fname : str string file path engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto' - Parquet reader library to use. If 'auto', then the option - 'io.parquet.engine' is used. If 'auto', then the first - library to be installed is used. + Parquet library to use. If 'auto', then the option + ``io.parquet.engine`` is used. The default ``io.parquet.engine`` + behavior is to try 'pyarrow', falling back to 'fastparquet' if + 'pyarrow' is unavailable. compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy' Name of the compression to use. Use ``None`` for no compression. kwargs diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 1c22a305c089d..a99014f07a6b3 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -244,9 +244,10 @@ def to_parquet(df, path, engine='auto', compression='snappy', **kwargs): path : string File path engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto' - Parquet reader library to use. If 'auto', then the option - 'io.parquet.engine' is used. If 'auto', then the first - library to be installed is used. + Parquet library to use. If 'auto', then the option + ``io.parquet.engine`` is used. The default ``io.parquet.engine`` + behavior is to try 'pyarrow', falling back to 'fastparquet' if + 'pyarrow' is unavailable. compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy' Name of the compression to use. Use ``None`` for no compression. kwargs @@ -271,9 +272,10 @@ def read_parquet(path, engine='auto', columns=None, **kwargs): .. 
versionadded 0.21.1 engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto' - Parquet reader library to use. If 'auto', then the option - 'io.parquet.engine' is used. If 'auto', then the first - library to be installed is used. + Parquet library to use. If 'auto', then the option + ``io.parquet.engine`` is used. The default ``io.parquet.engine`` + behavior is to try 'pyarrow', falling back to 'fastparquet' if + 'pyarrow' is unavailable. kwargs are passed to the engine Returns From d59aad656d55a95dd5e52e8de17bd42836d2e872 Mon Sep 17 00:00:00 2001 From: Jan F-F Date: Thu, 15 Feb 2018 00:36:09 -0800 Subject: [PATCH 115/214] ENH: groupby().is_monotonic_increasing #17015 (#17453) --- doc/source/api.rst | 2 + doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/groupby.py | 4 +- pandas/tests/groupby/test_groupby.py | 61 +++++++++++++++++++++++++- pandas/tests/groupby/test_whitelist.py | 7 ++- 5 files changed, 71 insertions(+), 4 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 44f87aa3e1cec..103b0fe9ff019 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -2240,6 +2240,8 @@ The following methods are available only for ``SeriesGroupBy`` objects. SeriesGroupBy.nunique SeriesGroupBy.unique SeriesGroupBy.value_counts + SeriesGroupBy.is_monotonic_increasing + SeriesGroupBy.is_monotonic_decreasing The following methods are available only for ``DataFrameGroupBy`` objects. diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 932618ba1df21..a2198d9103528 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -323,6 +323,7 @@ Other Enhancements - ``IntervalIndex.astype`` now supports conversions between subtypes when passed an ``IntervalDtype`` (:issue:`19197`) - :class:`IntervalIndex` and its associated constructor methods (``from_arrays``, ``from_breaks``, ``from_tuples``) have gained a ``dtype`` parameter (:issue:`19262`) +- Added :func:`SeriesGroupBy.is_monotonic_increasing` and :func:`SeriesGroupBy.is_monotonic_decreasing` (:issue:`17015`) .. _whatsnew_0230.api_breaking: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 0363bcd02aa16..b1615f720368d 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -336,7 +336,9 @@ ]) | _plotting_methods _series_apply_whitelist = ((_common_apply_whitelist | - {'nlargest', 'nsmallest'}) - + {'nlargest', 'nsmallest', + 'is_monotonic_increasing', + 'is_monotonic_decreasing'}) - {'boxplot'}) | frozenset(['dtype', 'unique']) _dataframe_apply_whitelist = ((_common_apply_whitelist | diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 6eacd45deb7bc..4cf7c8013aa2b 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2639,7 +2639,7 @@ def test_group_shift_with_null_key(self): # Generate a moderately large dataframe with occasional missing # values in column `B`, and then group by [`A`, `B`]. This should # force `-1` in `labels` array of `g.grouper.group_info` exactly - # at those places, where the group-by key is partilly missing. + # at those places, where the group-by key is partially missing. 
df = DataFrame([(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)], dtype=float, columns=["A", "B", "Z"], index=None) @@ -2764,6 +2764,65 @@ def test_cummin_cummax(self): expected = pd.Series([1, 2, 1], name='b') tm.assert_series_equal(result, expected) + @pytest.mark.parametrize('in_vals, out_vals', [ + + # Basics: strictly increasing (T), strictly decreasing (F), + # abs val increasing (F), non-strictly increasing (T) + ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], + [True, False, False, True]), + + # Test with inf vals + ([1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], + [True, False, True, False]), + + # Test with nan vals; should always be False + ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False]), + ]) + def test_is_monotonic_increasing(self, in_vals, out_vals): + # GH 17015 + source_dict = { + 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], + 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], + 'C': in_vals} + df = pd.DataFrame(source_dict) + result = df.groupby('B').C.is_monotonic_increasing + index = Index(list('abcd'), name='B') + expected = pd.Series(index=index, data=out_vals, name='C') + tm.assert_series_equal(result, expected) + + # Also check result equal to manually taking x.is_monotonic_increasing. + expected = ( + df.groupby(['B']).C.apply(lambda x: x.is_monotonic_increasing)) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('in_vals, out_vals', [ + # Basics: strictly decreasing (T), strictly increasing (F), + # abs val decreasing (F), non-strictly increasing (T) + ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], + [True, False, False, True]), + + # Test with inf vals + ([np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], + [True, True, False, True]), + + # Test with nan vals; should always be False + ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False]), + ]) + def test_is_monotonic_decreasing(self, in_vals, out_vals): + # GH 17015 + source_dict = { + 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], + 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], + 'C': in_vals} + + df = pd.DataFrame(source_dict) + result = df.groupby('B').C.is_monotonic_decreasing + index = Index(list('abcd'), name='B') + expected = pd.Series(index=index, data=out_vals, name='C') + tm.assert_series_equal(result, expected) + def test_apply_numeric_coercion_when_datetime(self): # In the past, group-by/apply operations have been over-eager # in converting dtypes to numeric, in the presence of datetime diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 3117525d899f6..8d6e074881cbb 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -88,6 +88,8 @@ 'unique', 'nlargest', 'nsmallest', + 'is_monotonic_increasing', + 'is_monotonic_decreasing', ]) @@ -184,7 +186,7 @@ def test_regression_whitelist_methods( axis, skipna, sort): # GH6944 # GH 17537 - # explicitly test the whitelest methods + # explicitly test the whitelist methods if axis == 0: frame = raw_frame @@ -249,7 +251,8 @@ def test_tab_completion(mframe): 'cumsum', 'cumcount', 'ngroup', 'all', 'shift', 'skew', 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', 'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', - 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding', 'pipe'} + 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding', 'pipe', + } assert 
results == expected From 405ed25b214740f2e0457ee84007567072b6fd18 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Thu, 15 Feb 2018 10:00:32 +0100 Subject: [PATCH 116/214] DOC: improve docs to clarify MultiIndex indexing (#19507) --- doc/source/advanced.rst | 86 ++++++++++++++++++++++++++++------------- 1 file changed, 59 insertions(+), 27 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index ca903dadc6eb1..c455fbb8d0687 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -113,7 +113,13 @@ of the index is up to you: pd.DataFrame(np.random.randn(6, 6), index=index[:6], columns=index[:6]) We've "sparsified" the higher levels of the indexes to make the console output a -bit easier on the eyes. +bit easier on the eyes. Note that how the index is displayed can be controlled using the +``multi_sparse`` option in ``pandas.set_options()``: + +.. ipython:: python + + with pd.option_context('display.multi_sparse', False): + df It's worth keeping in mind that there's nothing preventing you from using tuples as atomic labels on an axis: @@ -129,15 +135,6 @@ can find yourself working with hierarchically-indexed data without creating a ``MultiIndex`` explicitly yourself. However, when loading data from a file, you may wish to generate your own ``MultiIndex`` when preparing the data set. -Note that how the index is displayed by be controlled using the -``multi_sparse`` option in ``pandas.set_options()``: - -.. ipython:: python - - pd.set_option('display.multi_sparse', False) - df - pd.set_option('display.multi_sparse', True) - .. _advanced.get_level_values: Reconstructing the level labels @@ -180,14 +177,13 @@ For example: .. ipython:: python -   # original MultiIndex -   df.columns +   df.columns # original MultiIndex - # sliced - df[['foo','qux']].columns + df[['foo','qux']].columns # sliced This is done to avoid a recomputation of the levels in order to make slicing -highly performant. If you want to see the actual used levels. +highly performant. If you want to see only the used levels, you can use the +:func:`MultiIndex.get_level_values` method. .. ipython:: python @@ -196,7 +192,7 @@ highly performant. If you want to see the actual used levels. # for a specific level df[['foo','qux']].columns.get_level_values(0) -To reconstruct the ``MultiIndex`` with only the used levels, the +To reconstruct the ``MultiIndex`` with only the used levels, the ``remove_unused_levels`` method may be used. .. versionadded:: 0.20.0 @@ -231,15 +227,33 @@ Advanced indexing with hierarchical index ----------------------------------------- Syntactically integrating ``MultiIndex`` in advanced indexing with ``.loc`` is a -bit challenging, but we've made every effort to do so. For example the -following works as you would expect: +bit challenging, but we've made every effort to do so. In general, MultiIndex +keys take the form of tuples. For example, the following works as you would expect: .. ipython:: python df = df.T df - df.loc['bar'] - df.loc['bar', 'two'] + df.loc[('bar', 'two'),] + +Note that ``df.loc['bar', 'two']`` would also work in this example, but this shorthand +notation can lead to ambiguity in general. + +If you also want to index a specific column with ``.loc``, you must use a tuple +like this: + +.. ipython:: python + + df.loc[('bar', 'two'), 'A'] + +You don't have to specify all levels of the ``MultiIndex`` by passing only the +first elements of the tuple. 
For example, you can use "partial" indexing to +get all elements with ``bar`` in the first level as follows: + +df.loc['bar'] + +This is a shortcut for the slightly more verbose notation ``df.loc[('bar',),]`` (equivalent +to ``df.loc['bar',]`` in this example). "Partial" slicing also works quite nicely. @@ -260,6 +274,24 @@ Passing a list of labels or tuples works similar to reindexing: df.loc[[('bar', 'two'), ('qux', 'one')]] +.. info:: + + It is important to note that tuples and lists are not treated identically + in pandas when it comes to indexing. Whereas a tuple is interpreted as one + multi-level key, a list is used to specify several keys. Or in other words, + tuples go horizontally (traversing levels), lists go vertically (scanning levels). + +Importantly, a list of tuples indexes several complete ``MultiIndex`` keys, +whereas a tuple of lists refer to several values within a level: + +.. ipython:: python + + s = pd.Series([1, 2, 3, 4, 5, 6], + index=pd.MultiIndex.from_product([["A", "B"], ["c", "d", "e"]])) + s.loc[[("A", "c"), ("B", "d")]] # list of tuples + s.loc[(["A", "B"], ["c", "d"])] # tuple of lists + + .. _advanced.mi_slicers: Using slicers @@ -317,7 +349,7 @@ Basic multi-index slicing using slices, lists, and labels. dfmi.loc[(slice('A1','A3'), slice(None), ['C1', 'C3']), :] -You can use :class:`pandas.IndexSlice` to facilitate a more natural syntax +You can use :class:`pandas.IndexSlice` to facilitate a more natural syntax using ``:``, rather than using ``slice(None)``. .. ipython:: python @@ -626,7 +658,7 @@ Index Types ----------- We have discussed ``MultiIndex`` in the previous sections pretty extensively. ``DatetimeIndex`` and ``PeriodIndex`` -are shown :ref:`here `, and information about +are shown :ref:`here `, and information about `TimedeltaIndex`` is found :ref:`here `. In the following sub-sections we will highlight some other index types. @@ -671,9 +703,9 @@ The ``CategoricalIndex`` is **preserved** after indexing: df2.loc['a'].index -Sorting the index will sort by the order of the categories (Recall that we -created the index with ``CategoricalDtype(list('cab'))``, so the sorted -order is ``cab``.). +Sorting the index will sort by the order of the categories (recall that we +created the index with ``CategoricalDtype(list('cab'))``, so the sorted +order is ``cab``). .. ipython:: python @@ -726,7 +758,7 @@ Int64Index and RangeIndex Indexing on an integer-based Index with floats has been clarified in 0.18.0, for a summary of the changes, see :ref:`here `. -``Int64Index`` is a fundamental basic index in pandas. +``Int64Index`` is a fundamental basic index in pandas. This is an Immutable array implementing an ordered, sliceable set. Prior to 0.18.0, the ``Int64Index`` would provide the default index for all ``NDFrame`` objects. @@ -765,7 +797,7 @@ The only positional indexing is via ``iloc``. sf.iloc[3] A scalar index that is not found will raise a ``KeyError``. -Slicing is primarily on the values of the index when using ``[],ix,loc``, and +Slicing is primarily on the values of the index when using ``[],ix,loc``, and **always** positional when using ``iloc``. The exception is when the slice is boolean, in which case it will always be positional. 
From 2fdf1e256e5e0b7f1fe909629e2f0b7893c8c7c3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 15 Feb 2018 04:31:11 -0800 Subject: [PATCH 117/214] add missing args, make kwarg explicit (#19691) --- pandas/core/indexes/base.py | 4 ++-- pandas/core/internals.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index be7c1624936bf..81b6b28d3927e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3943,8 +3943,8 @@ def _evaluate_with_timedelta_like(self, other, op, opstr, reversed=False): def _evaluate_with_datetime_like(self, other, op, opstr): raise TypeError("can only perform ops with datetime like values") - def _evaluate_compare(self, op): - raise base.AbstractMethodError(self) + def _evaluate_compare(self, other, op): + raise com.AbstractMethodError(self) @classmethod def _add_comparison_methods(cls): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index f553e1a02c9d6..dd5feefc49fe3 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3569,8 +3569,8 @@ def reduction(self, f, axis=0, consolidate=True, transposed=False, placement=np.arange(len(values)))], axes[0]) - def isna(self, **kwargs): - return self.apply('apply', **kwargs) + def isna(self, func, **kwargs): + return self.apply('apply', func=func, **kwargs) def where(self, **kwargs): return self.apply('where', **kwargs) From 44c822de8dee0bf0e1ed2e8bc15424bb323b786f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 16 Feb 2018 10:21:52 -0800 Subject: [PATCH 118/214] remove usages of _get_na_value (#19692) --- pandas/core/indexes/api.py | 3 +-- pandas/core/indexes/base.py | 8 +------- pandas/core/indexes/multi.py | 4 ++-- pandas/core/reshape/reshape.py | 6 +++--- pandas/core/series.py | 4 +--- 5 files changed, 8 insertions(+), 17 deletions(-) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index b7af533f96ddc..2e5ec8b554ce7 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -2,7 +2,6 @@ _new_Index, _ensure_index, _ensure_index_from_sequences, - _get_na_value, InvalidIndexError) # noqa from pandas.core.indexes.category import CategoricalIndex # noqa from pandas.core.indexes.multi import MultiIndex # noqa @@ -25,7 +24,7 @@ 'InvalidIndexError', 'TimedeltaIndex', 'PeriodIndex', 'DatetimeIndex', '_new_Index', 'NaT', - '_ensure_index', '_ensure_index_from_sequences', '_get_na_value', + '_ensure_index', '_ensure_index_from_sequences', '_get_combined_index', '_get_objs_combined_axis', '_union_indexes', '_get_consensus_names', diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 81b6b28d3927e..02dd2dbc25703 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2098,7 +2098,7 @@ def asof(self, label): try: loc = self.get_loc(label, method='pad') except KeyError: - return _get_na_value(self.dtype) + return self._na_value else: if isinstance(loc, slice): loc = loc.indices(len(self))[-1] @@ -4316,12 +4316,6 @@ def _ensure_index(index_like, copy=False): return Index(index_like) -def _get_na_value(dtype): - if is_datetime64_any_dtype(dtype) or is_timedelta64_dtype(dtype): - return libts.NaT - return np.nan - - def _ensure_has_len(seq): """If seq is an iterator, put its values into a list.""" try: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 94dbd8b884e47..73f4aee1c4880 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -34,7 +34,7 @@ from 
pandas.core.indexes.base import ( Index, _ensure_index, - _get_na_value, InvalidIndexError, + InvalidIndexError, _index_shared_docs) from pandas.core.indexes.frozen import ( FrozenNDArray, FrozenList, _ensure_frozen) @@ -804,7 +804,7 @@ def values(self): elif box: taken = algos.take_1d(lev._box_values(lev._ndarray_values), lab, - fill_value=_get_na_value(lev.dtype.type)) + fill_value=lev._na_value) else: taken = algos.take_1d(np.asarray(lev._values), lab) values.append(taken) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c8bca476c65f2..3ef152d091b24 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -29,7 +29,7 @@ import pandas.core.algorithms as algos from pandas._libs import algos as _algos, reshape as _reshape -from pandas.core.index import Index, MultiIndex, _get_na_value +from pandas.core.index import Index, MultiIndex class _Unstacker(object): @@ -260,7 +260,7 @@ def get_new_columns(self): return self.removed_level lev = self.removed_level - return lev.insert(0, _get_na_value(lev.dtype.type)) + return lev.insert(0, lev._na_value) stride = len(self.removed_level) + self.lift width = len(self.value_columns) @@ -299,7 +299,7 @@ def get_new_index(self): if len(self.new_index_levels) == 1: lev, lab = self.new_index_levels[0], result_labels[0] if (lab == -1).any(): - lev = lev.insert(len(lev), _get_na_value(lev.dtype.type)) + lev = lev.insert(len(lev), lev._na_value) return lev.take(lab) return MultiIndex(levels=self.new_index_levels, labels=result_labels, diff --git a/pandas/core/series.py b/pandas/core/series.py index 655eaa5373f5a..90dc14836ab55 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1261,8 +1261,6 @@ def count(self, level=None): ------- nobs : int or Series (if level specified) """ - from pandas.core.index import _get_na_value - if level is None: return notna(com._values_from_object(self)).sum() @@ -1275,7 +1273,7 @@ def count(self, level=None): mask = lab == -1 if mask.any(): lab[mask] = cnt = len(lev) - lev = lev.insert(cnt, _get_na_value(lev.dtype.type)) + lev = lev.insert(cnt, lev._na_value) obs = lab[notna(self.values)] out = np.bincount(obs, minlength=len(lev) or None) From c8d83311611814b36aff9edab8bac902f2ec74ee Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 17 Feb 2018 12:47:18 -0800 Subject: [PATCH 119/214] TST: Parametrize PeriodIndex tests (#19659) * fixup formatting * parametrize PeriodIndex tests * fixup typo * put lists of params at module level * make fixtures * docstrings for fixtures * requested docstring --- .../tests/indexes/period/test_arithmetic.py | 521 ++++++++++-------- pandas/tests/indexes/period/test_astype.py | 99 ++++ pandas/tests/indexes/period/test_ops.py | 36 -- pandas/tests/indexes/period/test_period.py | 60 +- 4 files changed, 394 insertions(+), 322 deletions(-) create mode 100644 pandas/tests/indexes/period/test_astype.py diff --git a/pandas/tests/indexes/period/test_arithmetic.py b/pandas/tests/indexes/period/test_arithmetic.py index 81171920f635f..5f8f9533e9c44 100644 --- a/pandas/tests/indexes/period/test_arithmetic.py +++ b/pandas/tests/indexes/period/test_arithmetic.py @@ -11,7 +11,81 @@ import pandas.core.indexes.period as period +_common_mismatch = [pd.offsets.YearBegin(2), + pd.offsets.MonthBegin(1), + pd.offsets.Minute()] + + +@pytest.fixture(params=[timedelta(minutes=30), + np.timedelta64(30, 's'), + Timedelta(seconds=30)] + _common_mismatch) +def not_hourly(request): + """ + Several timedelta-like and DateOffset instances that are _not_ 
+ compatible with Hourly frequencies. + """ + return request.param + + +@pytest.fixture(params=[np.timedelta64(4, 'h'), + timedelta(hours=23), + Timedelta('23:00:00')] + _common_mismatch) +def not_daily(request): + """ + Several timedelta-like and DateOffset instances that are _not_ + compatible with Daily frequencies. + """ + return request.param + + +@pytest.fixture(params=[np.timedelta64(365, 'D'), + timedelta(365), + Timedelta(days=365)] + _common_mismatch) +def mismatched(request): + """ + Several timedelta-like and DateOffset instances that are _not_ + compatible with Monthly or Annual frequencies. + """ + return request.param + + +@pytest.fixture(params=[pd.offsets.Day(3), + timedelta(days=3), + np.timedelta64(3, 'D'), + pd.offsets.Hour(72), + timedelta(minutes=60 * 24 * 3), + np.timedelta64(72, 'h'), + Timedelta('72:00:00')]) +def three_days(request): + """ + Several timedelta-like and DateOffset objects that each represent + a 3-day timedelta + """ + return request.param + + +@pytest.fixture(params=[pd.offsets.Hour(2), + timedelta(hours=2), + np.timedelta64(2, 'h'), + pd.offsets.Minute(120), + timedelta(minutes=120), + np.timedelta64(120, 'm')]) +def two_hours(request): + """ + Several timedelta-like and DateOffset objects that each represent + a 2-hour timedelta + """ + return request.param + + class TestPeriodIndexComparisons(object): + def test_pi_cmp_period(self): + idx = period_range('2007-01', periods=20, freq='M') + + result = idx < idx[10] + exp = idx.values < idx.values[10] + tm.assert_numpy_array_equal(result, exp) + @pytest.mark.parametrize('freq', ['M', '2M', '3M']) def test_pi_cmp_pi(self, freq): base = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], @@ -148,32 +222,35 @@ def test_pi_cmp_nat_mismatched_freq_raises(self, freq): idx1 == diff # TODO: De-duplicate with test_pi_cmp_nat - def test_comp_nat(self): + @pytest.mark.parametrize('dtype', [object, None]) + def test_comp_nat(self, dtype): left = pd.PeriodIndex([pd.Period('2011-01-01'), pd.NaT, pd.Period('2011-01-03')]) right = pd.PeriodIndex([pd.NaT, pd.NaT, pd.Period('2011-01-03')]) - for lhs, rhs in [(left, right), - (left.astype(object), right.astype(object))]: - result = lhs == rhs - expected = np.array([False, False, True]) - tm.assert_numpy_array_equal(result, expected) + if dtype is not None: + left = left.astype(dtype) + right = right.astype(dtype) - result = lhs != rhs - expected = np.array([True, True, False]) - tm.assert_numpy_array_equal(result, expected) + result = left == right + expected = np.array([False, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = left != right + expected = np.array([True, True, False]) + tm.assert_numpy_array_equal(result, expected) - expected = np.array([False, False, False]) - tm.assert_numpy_array_equal(lhs == pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT == rhs, expected) + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(left == pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT == right, expected) - expected = np.array([True, True, True]) - tm.assert_numpy_array_equal(lhs != pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT != lhs, expected) + expected = np.array([True, True, True]) + tm.assert_numpy_array_equal(left != pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT != left, expected) - expected = np.array([False, False, False]) - tm.assert_numpy_array_equal(lhs < pd.NaT, expected) - tm.assert_numpy_array_equal(pd.NaT > lhs, expected) + expected = np.array([False, False, False]) + 
tm.assert_numpy_array_equal(left < pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT > left, expected) class TestPeriodIndexArithmetic(object): @@ -203,7 +280,7 @@ def test_pi_radd_offset_array(self): expected = pd.PeriodIndex([pd.Period('2015Q2'), pd.Period('2015Q4')]) tm.assert_index_equal(res, expected) - def test_add_iadd(self): + def test_pi_add_iadd_pi_raises(self): rng = pd.period_range('1/1/2000', freq='D', periods=5) other = pd.period_range('1/6/2000', freq='D', periods=5) @@ -214,89 +291,7 @@ def test_add_iadd(self): with pytest.raises(TypeError): rng += other - # offset - # DateOffset - rng = pd.period_range('2014', '2024', freq='A') - result = rng + pd.offsets.YearEnd(5) - expected = pd.period_range('2019', '2029', freq='A') - tm.assert_index_equal(result, expected) - rng += pd.offsets.YearEnd(5) - tm.assert_index_equal(rng, expected) - - for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), - pd.offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365), Timedelta(days=365)]: - msg = ('Input has different freq(=.+)? ' - 'from PeriodIndex\\(freq=A-DEC\\)') - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - rng + o - - rng = pd.period_range('2014-01', '2016-12', freq='M') - result = rng + pd.offsets.MonthEnd(5) - expected = pd.period_range('2014-06', '2017-05', freq='M') - tm.assert_index_equal(result, expected) - rng += pd.offsets.MonthEnd(5) - tm.assert_index_equal(rng, expected) - - for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), - pd.offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365), Timedelta(days=365)]: - rng = pd.period_range('2014-01', '2016-12', freq='M') - msg = 'Input has different freq(=.+)? from PeriodIndex\\(freq=M\\)' - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - rng + o - - # Tick - offsets = [pd.offsets.Day(3), timedelta(days=3), - np.timedelta64(3, 'D'), pd.offsets.Hour(72), - timedelta(minutes=60 * 24 * 3), np.timedelta64(72, 'h'), - Timedelta('72:00:00')] - for delta in offsets: - rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') - result = rng + delta - expected = pd.period_range('2014-05-04', '2014-05-18', freq='D') - tm.assert_index_equal(result, expected) - rng += delta - tm.assert_index_equal(rng, expected) - - for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), - pd.offsets.Minute(), np.timedelta64(4, 'h'), - timedelta(hours=23), Timedelta('23:00:00')]: - rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') - msg = 'Input has different freq(=.+)? from PeriodIndex\\(freq=D\\)' - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - rng + o - - offsets = [pd.offsets.Hour(2), timedelta(hours=2), - np.timedelta64(2, 'h'), pd.offsets.Minute(120), - timedelta(minutes=120), np.timedelta64(120, 'm'), - Timedelta(minutes=120)] - for delta in offsets: - rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', - freq='H') - result = rng + delta - expected = pd.period_range('2014-01-01 12:00', '2014-01-05 12:00', - freq='H') - tm.assert_index_equal(result, expected) - rng += delta - tm.assert_index_equal(rng, expected) - - for delta in [pd.offsets.YearBegin(2), timedelta(minutes=30), - np.timedelta64(30, 's'), Timedelta(seconds=30)]: - rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', - freq='H') - msg = 'Input has different freq(=.+)? 
from PeriodIndex\\(freq=H\\)' - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - rng + delta - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - rng += delta - - def test_pi_add_int(self, one): + def test_pi_add_iadd_int(self, one): # Variants of `one` for #19012 rng = pd.period_range('2000-01-01 09:00', freq='H', periods=10) result = rng + one @@ -305,16 +300,27 @@ def test_pi_add_int(self, one): rng += one tm.assert_index_equal(rng, expected) + def test_pi_sub_isub_int(self, one): + """ + PeriodIndex.__sub__ and __isub__ with several representations of + the integer 1, e.g. int, long, np.int64, np.uint8, ... + """ + rng = pd.period_range('2000-01-01 09:00', freq='H', periods=10) + result = rng - one + expected = pd.period_range('2000-01-01 08:00', freq='H', periods=10) + tm.assert_index_equal(result, expected) + rng -= one + tm.assert_index_equal(rng, expected) + @pytest.mark.parametrize('five', [5, np.array(5, dtype=np.int64)]) - def test_sub(self, five): + def test_pi_sub_intlike(self, five): rng = period_range('2007-01', periods=50) result = rng - five exp = rng + (-five) tm.assert_index_equal(result, exp) - def test_sub_isub(self): - + def test_pi_sub_isub_pi_raises(self): # previously performed setop, now raises TypeError (GH14164) # TODO needs to wait on #13077 for decision on result type rng = pd.period_range('1/1/2000', freq='D', periods=5) @@ -326,6 +332,7 @@ def test_sub_isub(self): with pytest.raises(TypeError): rng -= other + def test_pi_sub_isub_offset(self): # offset # DateOffset rng = pd.period_range('2014', '2024', freq='A') @@ -335,102 +342,165 @@ def test_sub_isub(self): rng -= pd.offsets.YearEnd(5) tm.assert_index_equal(rng, expected) - for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), - pd.offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365)]: - rng = pd.period_range('2014', '2024', freq='A') - msg = ('Input has different freq(=.+)? ' - 'from PeriodIndex\\(freq=A-DEC\\)') - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - rng - o - rng = pd.period_range('2014-01', '2016-12', freq='M') result = rng - pd.offsets.MonthEnd(5) expected = pd.period_range('2013-08', '2016-07', freq='M') tm.assert_index_equal(result, expected) + rng -= pd.offsets.MonthEnd(5) tm.assert_index_equal(rng, expected) - for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), - pd.offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365)]: - rng = pd.period_range('2014-01', '2016-12', freq='M') - msg = 'Input has different freq(=.+)? 
from PeriodIndex\\(freq=M\\)' - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - rng - o + # --------------------------------------------------------------- + # Timedelta-like (timedelta, timedelta64, Timedelta, Tick) + # TODO: Some of these are misnomers because of non-Tick DateOffsets + def test_pi_add_iadd_timedeltalike_daily(self, three_days): # Tick - offsets = [pd.offsets.Day(3), timedelta(days=3), - np.timedelta64(3, 'D'), pd.offsets.Hour(72), - timedelta(minutes=60 * 24 * 3), np.timedelta64(72, 'h')] - for delta in offsets: - rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') - result = rng - delta - expected = pd.period_range('2014-04-28', '2014-05-12', freq='D') - tm.assert_index_equal(result, expected) - rng -= delta - tm.assert_index_equal(rng, expected) - - for o in [pd.offsets.YearBegin(2), pd.offsets.MonthBegin(1), - pd.offsets.Minute(), np.timedelta64(4, 'h'), - timedelta(hours=23)]: - rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') - msg = 'Input has different freq(=.+)? from PeriodIndex\\(freq=D\\)' - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - rng - o - - offsets = [pd.offsets.Hour(2), timedelta(hours=2), - np.timedelta64(2, 'h'), pd.offsets.Minute(120), - timedelta(minutes=120), np.timedelta64(120, 'm')] - for delta in offsets: - rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', - freq='H') - result = rng - delta - expected = pd.period_range('2014-01-01 08:00', '2014-01-05 08:00', - freq='H') - tm.assert_index_equal(result, expected) - rng -= delta - tm.assert_index_equal(rng, expected) - - for delta in [pd.offsets.YearBegin(2), timedelta(minutes=30), - np.timedelta64(30, 's')]: - rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', - freq='H') - msg = 'Input has different freq(=.+)? from PeriodIndex\\(freq=H\\)' - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - rng + delta - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - rng += delta - - # int - rng = pd.period_range('2000-01-01 09:00', freq='H', periods=10) - result = rng - 1 - expected = pd.period_range('2000-01-01 08:00', freq='H', periods=10) + other = three_days + rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') + expected = pd.period_range('2014-05-04', '2014-05-18', freq='D') + + result = rng + other + tm.assert_index_equal(result, expected) + + rng += other + tm.assert_index_equal(rng, expected) + + def test_pi_sub_isub_timedeltalike_daily(self, three_days): + # Tick-like 3 Days + other = three_days + rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') + expected = pd.period_range('2014-04-28', '2014-05-12', freq='D') + + result = rng - other + tm.assert_index_equal(result, expected) + + rng -= other + tm.assert_index_equal(rng, expected) + + def test_pi_add_iadd_timedeltalike_freq_mismatch_daily(self, not_daily): + other = not_daily + rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') + msg = 'Input has different freq(=.+)? from PeriodIndex\\(freq=D\\)' + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + rng + other + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + rng += other + + def test_pi_sub_timedeltalike_freq_mismatch_daily(self, not_daily): + other = not_daily + rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') + msg = 'Input has different freq(=.+)? 
from PeriodIndex\\(freq=D\\)' + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + rng - other + + def test_pi_add_iadd_timedeltalike_hourly(self, two_hours): + other = two_hours + rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', freq='H') + expected = pd.period_range('2014-01-01 12:00', '2014-01-05 12:00', + freq='H') + + result = rng + other + tm.assert_index_equal(result, expected) + + rng += other + tm.assert_index_equal(rng, expected) + + def test_pi_add_timedeltalike_mismatched_freq_hourly(self, not_hourly): + other = not_hourly + rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', freq='H') + msg = 'Input has different freq(=.+)? from PeriodIndex\\(freq=H\\)' + + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + rng + other + + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + rng += other + + def test_pi_sub_isub_timedeltalike_hourly(self, two_hours): + other = two_hours + rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', freq='H') + expected = pd.period_range('2014-01-01 08:00', '2014-01-05 08:00', + freq='H') + + result = rng - other + tm.assert_index_equal(result, expected) + + rng -= other + tm.assert_index_equal(rng, expected) + + def test_add_iadd_timedeltalike_annual(self): + # offset + # DateOffset + rng = pd.period_range('2014', '2024', freq='A') + result = rng + pd.offsets.YearEnd(5) + expected = pd.period_range('2019', '2029', freq='A') + tm.assert_index_equal(result, expected) + rng += pd.offsets.YearEnd(5) + tm.assert_index_equal(rng, expected) + + def test_pi_add_iadd_timedeltalike_freq_mismatch_annual(self, mismatched): + other = mismatched + rng = pd.period_range('2014', '2024', freq='A') + msg = ('Input has different freq(=.+)? ' + 'from PeriodIndex\\(freq=A-DEC\\)') + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + rng + other + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + rng += other + + def test_pi_sub_isub_timedeltalike_freq_mismatch_annual(self, mismatched): + other = mismatched + rng = pd.period_range('2014', '2024', freq='A') + msg = ('Input has different freq(=.+)? ' + 'from PeriodIndex\\(freq=A-DEC\\)') + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + rng - other + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + rng -= other + + def test_pi_add_iadd_timedeltalike_M(self): + rng = pd.period_range('2014-01', '2016-12', freq='M') + expected = pd.period_range('2014-06', '2017-05', freq='M') + + result = rng + pd.offsets.MonthEnd(5) tm.assert_index_equal(result, expected) - rng -= 1 + + rng += pd.offsets.MonthEnd(5) tm.assert_index_equal(rng, expected) + def test_pi_add_iadd_timedeltalike_freq_mismatch_monthly(self, mismatched): + other = mismatched + rng = pd.period_range('2014-01', '2016-12', freq='M') + msg = 'Input has different freq(=.+)? from PeriodIndex\\(freq=M\\)' + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + rng + other + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + rng += other + + def test_pi_sub_isub_timedeltalike_freq_mismatch_monthly(self, mismatched): + other = mismatched + rng = pd.period_range('2014-01', '2016-12', freq='M') + msg = 'Input has different freq(=.+)? 
from PeriodIndex\\(freq=M\\)' + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + rng - other + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + rng -= other + # --------------------------------------------------------------- # PeriodIndex.shift is used by __add__ and __sub__ def test_pi_shift_ndarray(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', - '2011-04'], freq='M', name='idx') + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], + freq='M', name='idx') result = idx.shift(np.array([1, 2, 3, 4])) - expected = PeriodIndex(['2011-02', '2011-04', 'NaT', - '2011-08'], freq='M', name='idx') + expected = PeriodIndex(['2011-02', '2011-04', 'NaT', '2011-08'], + freq='M', name='idx') tm.assert_index_equal(result, expected) - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', - '2011-04'], freq='M', name='idx') result = idx.shift(np.array([1, -2, 3, -4])) - expected = PeriodIndex(['2011-02', '2010-12', 'NaT', - '2010-12'], freq='M', name='idx') + expected = PeriodIndex(['2011-02', '2010-12', 'NaT', '2010-12'], + freq='M', name='idx') tm.assert_index_equal(result, expected) def test_shift(self): @@ -489,11 +559,11 @@ def test_shift_corner_cases(self): tm.assert_index_equal(idx.shift(-3), exp) def test_shift_nat(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', - '2011-04'], freq='M', name='idx') + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], + freq='M', name='idx') result = idx.shift(1) - expected = PeriodIndex(['2011-02', '2011-03', 'NaT', - '2011-05'], freq='M', name='idx') + expected = PeriodIndex(['2011-02', '2011-03', 'NaT', '2011-05'], + freq='M', name='idx') tm.assert_index_equal(result, expected) assert result.name == expected.name @@ -519,18 +589,18 @@ def _check(self, values, func, expected): # comp op results in bool tm.assert_numpy_array_equal(result, expected) - s = pd.Series(values) - result = func(s) + ser = pd.Series(values) + result = func(ser) exp = pd.Series(expected, name=values.name) tm.assert_series_equal(result, exp) def test_pi_ops(self): - idx = PeriodIndex(['2011-01', '2011-02', '2011-03', - '2011-04'], freq='M', name='idx') + idx = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], + freq='M', name='idx') - expected = PeriodIndex(['2011-03', '2011-04', - '2011-05', '2011-06'], freq='M', name='idx') + expected = PeriodIndex(['2011-03', '2011-04', '2011-05', '2011-06'], + freq='M', name='idx') self._check(idx, lambda x: x + 2, expected) self._check(idx, lambda x: 2 + x, expected) @@ -544,13 +614,13 @@ def test_pi_ops(self): tm.assert_index_equal(result, exp) def test_pi_ops_errors(self): - idx = PeriodIndex(['2011-01', '2011-02', '2011-03', - '2011-04'], freq='M', name='idx') - s = pd.Series(idx) + idx = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], + freq='M', name='idx') + ser = pd.Series(idx) msg = r"unsupported operand type\(s\)" - for obj in [idx, s]: + for obj in [idx, ser]: for ng in ["str", 1.5]: with tm.assert_raises_regex(TypeError, msg): obj + ng @@ -581,10 +651,10 @@ def test_pi_ops_errors(self): np.subtract(ng, obj) def test_pi_ops_nat(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', - '2011-04'], freq='M', name='idx') - expected = PeriodIndex(['2011-03', '2011-04', - 'NaT', '2011-06'], freq='M', name='idx') + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], + freq='M', name='idx') + expected = PeriodIndex(['2011-03', '2011-04', 'NaT', '2011-06'], + freq='M', name='idx') self._check(idx, lambda x: x + 2, expected) self._check(idx, lambda x: 2 + x, 
expected) self._check(idx, lambda x: np.add(x, 2), expected) @@ -593,10 +663,10 @@ def test_pi_ops_nat(self): self._check(idx + 2, lambda x: np.subtract(x, 2), idx) # freq with mult - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', - '2011-04'], freq='2M', name='idx') - expected = PeriodIndex(['2011-07', '2011-08', - 'NaT', '2011-10'], freq='2M', name='idx') + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], + freq='2M', name='idx') + expected = PeriodIndex(['2011-07', '2011-08', 'NaT', '2011-10'], + freq='2M', name='idx') self._check(idx, lambda x: x + 3, expected) self._check(idx, lambda x: 3 + x, expected) self._check(idx, lambda x: np.add(x, 3), expected) @@ -605,26 +675,26 @@ def test_pi_ops_nat(self): self._check(idx + 3, lambda x: np.subtract(x, 3), idx) def test_pi_ops_array_int(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', - '2011-04'], freq='M', name='idx') + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], + freq='M', name='idx') f = lambda x: x + np.array([1, 2, 3, 4]) - exp = PeriodIndex(['2011-02', '2011-04', 'NaT', - '2011-08'], freq='M', name='idx') + exp = PeriodIndex(['2011-02', '2011-04', 'NaT', '2011-08'], + freq='M', name='idx') self._check(idx, f, exp) f = lambda x: np.add(x, np.array([4, -1, 1, 2])) - exp = PeriodIndex(['2011-05', '2011-01', 'NaT', - '2011-06'], freq='M', name='idx') + exp = PeriodIndex(['2011-05', '2011-01', 'NaT', '2011-06'], + freq='M', name='idx') self._check(idx, f, exp) f = lambda x: x - np.array([1, 2, 3, 4]) - exp = PeriodIndex(['2010-12', '2010-12', 'NaT', - '2010-12'], freq='M', name='idx') + exp = PeriodIndex(['2010-12', '2010-12', 'NaT', '2010-12'], + freq='M', name='idx') self._check(idx, f, exp) f = lambda x: np.subtract(x, np.array([3, 2, 3, -2])) - exp = PeriodIndex(['2010-10', '2010-12', 'NaT', - '2011-06'], freq='M', name='idx') + exp = PeriodIndex(['2010-10', '2010-12', 'NaT', '2011-06'], + freq='M', name='idx') self._check(idx, f, exp) def test_pi_ops_offset(self): @@ -648,29 +718,26 @@ def test_pi_ops_offset(self): def test_pi_offset_errors(self): idx = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01', '2011-04-01'], freq='D', name='idx') - s = pd.Series(idx) + ser = pd.Series(idx) # Series op is applied per Period instance, thus error is raised # from Period msg_idx = r"Input has different freq from PeriodIndex\(freq=D\)" msg_s = r"Input cannot be converted to Period\(freq=D\)" - for obj, msg in [(idx, msg_idx), (s, msg_s)]: - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): + for obj, msg in [(idx, msg_idx), (ser, msg_s)]: + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): obj + pd.offsets.Hour(2) - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): pd.offsets.Hour(2) + obj - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): obj - pd.offsets.Hour(2) def test_pi_sub_period(self): # GH 13071 - idx = PeriodIndex(['2011-01', '2011-02', '2011-03', - '2011-04'], freq='M', name='idx') + idx = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], + freq='M', name='idx') result = idx - pd.Period('2012-01', freq='M') exp = pd.Index([-12, -11, -10, -9], name='idx') @@ -695,16 +762,16 @@ def test_pi_sub_period(self): def test_pi_sub_pdnat(self): # GH 13071 - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', - '2011-04'], freq='M', name='idx') + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], + 
freq='M', name='idx') exp = pd.TimedeltaIndex([pd.NaT] * 4, name='idx') tm.assert_index_equal(pd.NaT - idx, exp) tm.assert_index_equal(idx - pd.NaT, exp) def test_pi_sub_period_nat(self): # GH 13071 - idx = PeriodIndex(['2011-01', 'NaT', '2011-03', - '2011-04'], freq='M', name='idx') + idx = PeriodIndex(['2011-01', 'NaT', '2011-03', '2011-04'], + freq='M', name='idx') result = idx - pd.Period('2012-01', freq='M') exp = pd.Index([-12, np.nan, -10, -9], name='idx') diff --git a/pandas/tests/indexes/period/test_astype.py b/pandas/tests/indexes/period/test_astype.py new file mode 100644 index 0000000000000..f2126487496c4 --- /dev/null +++ b/pandas/tests/indexes/period/test_astype.py @@ -0,0 +1,99 @@ +# -*- coding: utf-8 -*- + +import numpy as np +import pytest + +import pandas as pd +import pandas.util.testing as tm +from pandas import NaT, Period, PeriodIndex, Int64Index, Index, period_range + + +class TestPeriodIndexAsType(object): + @pytest.mark.parametrize('dtype', [ + float, 'timedelta64', 'timedelta64[ns]']) + def test_astype_raises(self, dtype): + # GH#13149, GH#13209 + idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D') + msg = 'Cannot cast PeriodIndex to dtype' + with tm.assert_raises_regex(TypeError, msg): + idx.astype(dtype) + + def test_astype_conversion(self): + # GH#13149, GH#13209 + idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D') + + result = idx.astype(object) + expected = Index([Period('2016-05-16', freq='D')] + + [Period(NaT, freq='D')] * 3, dtype='object') + tm.assert_index_equal(result, expected) + + result = idx.astype(int) + expected = Int64Index([16937] + [-9223372036854775808] * 3, + dtype=np.int64) + tm.assert_index_equal(result, expected) + + result = idx.astype(str) + expected = Index(str(x) for x in idx) + tm.assert_index_equal(result, expected) + + idx = period_range('1990', '2009', freq='A') + result = idx.astype('i8') + tm.assert_index_equal(result, Index(idx.asi8)) + tm.assert_numpy_array_equal(result.values, idx.asi8) + + def test_astype_object(self): + idx = pd.PeriodIndex([], freq='M') + + exp = np.array([], dtype=object) + tm.assert_numpy_array_equal(idx.astype(object).values, exp) + tm.assert_numpy_array_equal(idx._mpl_repr(), exp) + + idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') + + exp = np.array([pd.Period('2011-01', freq='M'), pd.NaT], dtype=object) + tm.assert_numpy_array_equal(idx.astype(object).values, exp) + tm.assert_numpy_array_equal(idx._mpl_repr(), exp) + + exp = np.array([pd.Period('2011-01-01', freq='D'), pd.NaT], + dtype=object) + idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') + tm.assert_numpy_array_equal(idx.astype(object).values, exp) + tm.assert_numpy_array_equal(idx._mpl_repr(), exp) + + # TODO: de-duplicate this version (from test_ops) with the one above + # (from test_period) + def test_astype_object2(self): + idx = pd.period_range(start='2013-01-01', periods=4, freq='M', + name='idx') + expected_list = [pd.Period('2013-01-31', freq='M'), + pd.Period('2013-02-28', freq='M'), + pd.Period('2013-03-31', freq='M'), + pd.Period('2013-04-30', freq='M')] + expected = pd.Index(expected_list, dtype=object, name='idx') + result = idx.astype(object) + assert isinstance(result, Index) + assert result.dtype == object + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert idx.tolist() == expected_list + + idx = PeriodIndex(['2013-01-01', '2013-01-02', 'NaT', + '2013-01-04'], freq='D', name='idx') + expected_list = [pd.Period('2013-01-01', freq='D'), + 
pd.Period('2013-01-02', freq='D'), + pd.Period('NaT', freq='D'), + pd.Period('2013-01-04', freq='D')] + expected = pd.Index(expected_list, dtype=object, name='idx') + result = idx.astype(object) + assert isinstance(result, Index) + assert result.dtype == object + tm.assert_index_equal(result, expected) + for i in [0, 1, 3]: + assert result[i] == expected[i] + assert result[2] is pd.NaT + assert result.name == expected.name + + result_list = idx.tolist() + for i in [0, 1, 3]: + assert result_list[i] == expected_list[i] + assert result_list[2] is pd.NaT diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 8745de0c2a7aa..6c272864e0026 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -25,42 +25,6 @@ def test_ops_properties(self): self.check_ops_properties(PeriodIndex._object_ops, f) self.check_ops_properties(PeriodIndex._bool_ops, f) - def test_astype_object(self): - idx = pd.period_range(start='2013-01-01', periods=4, freq='M', - name='idx') - expected_list = [pd.Period('2013-01-31', freq='M'), - pd.Period('2013-02-28', freq='M'), - pd.Period('2013-03-31', freq='M'), - pd.Period('2013-04-30', freq='M')] - expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.astype(object) - assert isinstance(result, Index) - assert result.dtype == object - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert idx.tolist() == expected_list - - idx = PeriodIndex(['2013-01-01', '2013-01-02', 'NaT', - '2013-01-04'], freq='D', name='idx') - expected_list = [pd.Period('2013-01-01', freq='D'), - pd.Period('2013-01-02', freq='D'), - pd.Period('NaT', freq='D'), - pd.Period('2013-01-04', freq='D')] - expected = pd.Index(expected_list, dtype=object, name='idx') - result = idx.astype(object) - assert isinstance(result, Index) - assert result.dtype == object - tm.assert_index_equal(result, expected) - for i in [0, 1, 3]: - assert result[i] == expected[i] - assert result[2] is pd.NaT - assert result.name == expected.name - - result_list = idx.tolist() - for i in [0, 1, 3]: - assert result_list[i] == expected_list[i] - assert result_list[2] is pd.NaT - def test_minmax(self): # monotonic diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index b3f059018493c..4c0c865928031 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -6,7 +6,7 @@ import pandas.util._test_decorators as td from pandas.util import testing as tm from pandas import (PeriodIndex, period_range, notna, DatetimeIndex, NaT, - Index, Period, Int64Index, Series, DataFrame, date_range, + Index, Period, Series, DataFrame, date_range, offsets) from ..datetimelike import DatetimeLike @@ -24,38 +24,6 @@ def setup_method(self, method): def create_index(self): return period_range('20130101', periods=5, freq='D') - def test_astype_conversion(self): - # GH 13149, GH 13209 - idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D') - - result = idx.astype(object) - expected = Index([Period('2016-05-16', freq='D')] + - [Period(NaT, freq='D')] * 3, dtype='object') - tm.assert_index_equal(result, expected) - - result = idx.astype(int) - expected = Int64Index([16937] + [-9223372036854775808] * 3, - dtype=np.int64) - tm.assert_index_equal(result, expected) - - result = idx.astype(str) - expected = Index(str(x) for x in idx) - tm.assert_index_equal(result, expected) - - idx = period_range('1990', '2009', freq='A') - 
result = idx.astype('i8') - tm.assert_index_equal(result, Index(idx.asi8)) - tm.assert_numpy_array_equal(result.values, idx.asi8) - - @pytest.mark.parametrize('dtype', [ - float, 'timedelta64', 'timedelta64[ns]']) - def test_astype_raises(self, dtype): - # GH 13149, GH 13209 - idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D') - msg = 'Cannot cast PeriodIndex to dtype' - with tm.assert_raises_regex(TypeError, msg): - idx.astype(dtype) - def test_pickle_compat_construction(self): pass @@ -384,25 +352,6 @@ def test_factorize(self): tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) - def test_astype_object(self): - idx = pd.PeriodIndex([], freq='M') - - exp = np.array([], dtype=object) - tm.assert_numpy_array_equal(idx.astype(object).values, exp) - tm.assert_numpy_array_equal(idx._mpl_repr(), exp) - - idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') - - exp = np.array([pd.Period('2011-01', freq='M'), pd.NaT], dtype=object) - tm.assert_numpy_array_equal(idx.astype(object).values, exp) - tm.assert_numpy_array_equal(idx._mpl_repr(), exp) - - exp = np.array([pd.Period('2011-01-01', freq='D'), pd.NaT], - dtype=object) - idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') - tm.assert_numpy_array_equal(idx.astype(object).values, exp) - tm.assert_numpy_array_equal(idx._mpl_repr(), exp) - def test_is_(self): create_index = lambda: PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') @@ -421,13 +370,6 @@ def test_is_(self): assert not index.is_(index - 2) assert not index.is_(index - 0) - def test_comp_period(self): - idx = period_range('2007-01', periods=20, freq='M') - - result = idx < idx[10] - exp = idx.values < idx.values[10] - tm.assert_numpy_array_equal(result, exp) - def test_contains(self): rng = period_range('2007-01', freq='M', periods=10) From 192bd46fd89189b7905d87db95842de6299285d8 Mon Sep 17 00:00:00 2001 From: Tommy <10076072+tommyod@users.noreply.github.com> Date: Sun, 18 Feb 2018 14:38:16 +0100 Subject: [PATCH 120/214] DOC: Updated tutorials with additional info, new version and added some video tutorials (#19748) --- doc/source/tutorials.rst | 51 +++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/doc/source/tutorials.rst b/doc/source/tutorials.rst index 710212bc237cd..db9385519bff2 100644 --- a/doc/source/tutorials.rst +++ b/doc/source/tutorials.rst @@ -9,52 +9,52 @@ This is a guide to many pandas tutorials, geared mainly for new users. Internal Guides --------------- -pandas own :ref:`10 Minutes to pandas<10min>` +pandas' own :ref:`10 Minutes to pandas<10min>`. -More complex recipes are in the :ref:`Cookbook` +More complex recipes are in the :ref:`Cookbook`. pandas Cookbook --------------- -The goal of this cookbook (by `Julia Evans `_) is to +The goal of this 2015 cookbook (by `Julia Evans `_) is to give you some concrete examples for getting started with pandas. These are examples with real-world data, and all the bugs and weirdness that entails. -Here are links to the v0.1 release. For an up-to-date table of contents, see the `pandas-cookbook GitHub +Here are links to the v0.2 release. For an up-to-date table of contents, see the `pandas-cookbook GitHub repository `_. To run the examples in this tutorial, you'll need to clone the GitHub repository and get IPython Notebook running. See `How to use this cookbook `_. -- `A quick tour of the IPython Notebook: `_ +- `A quick tour of the IPython Notebook: `_ Shows off IPython's awesome tab completion and magic functions. 
-- `Chapter 1: `_ +- `Chapter 1: `_ Reading your data into pandas is pretty much the easiest thing. Even when the encoding is wrong! -- `Chapter 2: `_ +- `Chapter 2: `_ It's not totally obvious how to select data from a pandas dataframe. Here we explain the basics (how to take slices and get columns) -- `Chapter 3: `_ +- `Chapter 3: `_ Here we get into serious slicing and dicing and learn how to filter dataframes in complicated ways, really fast. -- `Chapter 4: `_ +- `Chapter 4: `_ Groupby/aggregate is seriously my favorite thing about pandas and I use it all the time. You should probably read this. -- `Chapter 5: `_ +- `Chapter 5: `_ Here you get to find out if it's cold in Montreal in the winter (spoiler: yes). Web scraping with pandas is fun! Here we combine dataframes. -- `Chapter 6: `_ +- `Chapter 6: `_ Strings with pandas are great. It has all these vectorized string operations and they're the best. We will turn a bunch of strings containing "Snow" into vectors of numbers in a trice. -- `Chapter 7: `_ +- `Chapter 7: `_ Cleaning up messy data is never a joy, but with pandas it's easier. -- `Chapter 8: `_ +- `Chapter 8: `_ Parsing Unix timestamps is confusing at first but it turns out to be really easy. -Lessons for New pandas Users +Lessons for new pandas users ---------------------------- For more resources, please visit the main `repository `__. @@ -125,7 +125,7 @@ There are four sections covering selected topics as follows: .. _tutorial-exercises-new-users: -Exercises for New Users +Exercises for new users ----------------------- Practice your skills with real data sets and exercises. For more resources, please visit the main `repository `__. @@ -152,9 +152,14 @@ For more resources, please visit the main `repository `_. +The source may be found in the GitHub repository +`TomAugspurger/effective-pandas `_. 
+ - `Modern Pandas `_ - `Method Chaining `_ - `Indexes `_ @@ -168,6 +173,20 @@ Excel charts with pandas, vincent and xlsxwriter - `Using Pandas and XlsxWriter to create Excel charts `_ +Video Tutorials +--------------- + +- `Pandas From The Ground Up `_ + (2015) (2:24) + `GitHub repo `_ +- `Introduction Into Pandas `_ + (2016) (1:28) + `GitHub repo `_ +- `Pandas: .head() to .tail() `_ + (2016) (1:26) + `GitHub repo `_ + + Various Tutorials ----------------- From 6e37f8782fcc192cf21f4d989588ea665becb616 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 18 Feb 2018 08:09:44 -0800 Subject: [PATCH 121/214] collect index formatting tests (#19661) --- .../tests/indexes/datetimes/test_formats.py | 175 +++++++++++++++++- pandas/tests/indexes/datetimes/test_misc.py | 10 - pandas/tests/indexes/datetimes/test_ops.py | 155 ---------------- pandas/tests/indexes/period/test_formats.py | 161 ++++++++++++++++ pandas/tests/indexes/period/test_indexing.py | 3 - pandas/tests/indexes/period/test_ops.py | 158 ---------------- .../tests/indexes/timedeltas/test_formats.py | 96 ++++++++++ pandas/tests/indexes/timedeltas/test_ops.py | 88 --------- 8 files changed, 431 insertions(+), 415 deletions(-) create mode 100644 pandas/tests/indexes/timedeltas/test_formats.py diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py index ea2731f66f0ef..0d1a9e65ce6c6 100644 --- a/pandas/tests/indexes/datetimes/test_formats.py +++ b/pandas/tests/indexes/datetimes/test_formats.py @@ -1,6 +1,10 @@ -from pandas import DatetimeIndex +from datetime import datetime +from pandas import DatetimeIndex, Series import numpy as np +import dateutil.tz +import pytz +import pytest import pandas.util.testing as tm import pandas as pd @@ -45,3 +49,172 @@ def test_to_native_types(): result = index.to_native_types(na_rep='pandas') tm.assert_numpy_array_equal(result, expected) + + +class TestDatetimeIndexRendering(object): + def test_dti_repr_short(self): + dr = pd.date_range(start='1/1/2012', periods=1) + repr(dr) + + dr = pd.date_range(start='1/1/2012', periods=2) + repr(dr) + + dr = pd.date_range(start='1/1/2012', periods=3) + repr(dr) + + @pytest.mark.parametrize('method', ['__repr__', '__unicode__', '__str__']) + def test_dti_representation(self, method): + idxs = [] + idxs.append(DatetimeIndex([], freq='D')) + idxs.append(DatetimeIndex(['2011-01-01'], freq='D')) + idxs.append(DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D')) + idxs.append(DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], + freq='D')) + idxs.append(DatetimeIndex( + ['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00' + ], freq='H', tz='Asia/Tokyo')) + idxs.append(DatetimeIndex( + ['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], tz='US/Eastern')) + idxs.append(DatetimeIndex( + ['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], tz='UTC')) + + exp = [] + exp.append("""DatetimeIndex([], dtype='datetime64[ns]', freq='D')""") + exp.append("DatetimeIndex(['2011-01-01'], dtype='datetime64[ns]', " + "freq='D')") + exp.append("DatetimeIndex(['2011-01-01', '2011-01-02'], " + "dtype='datetime64[ns]', freq='D')") + exp.append("DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], " + "dtype='datetime64[ns]', freq='D')") + exp.append("DatetimeIndex(['2011-01-01 09:00:00+09:00', " + "'2011-01-01 10:00:00+09:00', '2011-01-01 11:00:00+09:00']" + ", dtype='datetime64[ns, Asia/Tokyo]', freq='H')") + exp.append("DatetimeIndex(['2011-01-01 09:00:00-05:00', " + "'2011-01-01 10:00:00-05:00', 'NaT'], " + 
"dtype='datetime64[ns, US/Eastern]', freq=None)") + exp.append("DatetimeIndex(['2011-01-01 09:00:00+00:00', " + "'2011-01-01 10:00:00+00:00', 'NaT'], " + "dtype='datetime64[ns, UTC]', freq=None)""") + + with pd.option_context('display.width', 300): + for indx, expected in zip(idxs, exp): + result = getattr(indx, method)() + assert result == expected + + def test_dti_representation_to_series(self): + idx1 = DatetimeIndex([], freq='D') + idx2 = DatetimeIndex(['2011-01-01'], freq='D') + idx3 = DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D') + idx4 = DatetimeIndex( + ['2011-01-01', '2011-01-02', '2011-01-03'], freq='D') + idx5 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', + '2011-01-01 11:00'], freq='H', tz='Asia/Tokyo') + idx6 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], + tz='US/Eastern') + idx7 = DatetimeIndex(['2011-01-01 09:00', '2011-01-02 10:15']) + + exp1 = """Series([], dtype: datetime64[ns])""" + + exp2 = ("0 2011-01-01\n" + "dtype: datetime64[ns]") + + exp3 = ("0 2011-01-01\n" + "1 2011-01-02\n" + "dtype: datetime64[ns]") + + exp4 = ("0 2011-01-01\n" + "1 2011-01-02\n" + "2 2011-01-03\n" + "dtype: datetime64[ns]") + + exp5 = ("0 2011-01-01 09:00:00+09:00\n" + "1 2011-01-01 10:00:00+09:00\n" + "2 2011-01-01 11:00:00+09:00\n" + "dtype: datetime64[ns, Asia/Tokyo]") + + exp6 = ("0 2011-01-01 09:00:00-05:00\n" + "1 2011-01-01 10:00:00-05:00\n" + "2 NaT\n" + "dtype: datetime64[ns, US/Eastern]") + + exp7 = ("0 2011-01-01 09:00:00\n" + "1 2011-01-02 10:15:00\n" + "dtype: datetime64[ns]") + + with pd.option_context('display.width', 300): + for idx, expected in zip([idx1, idx2, idx3, idx4, + idx5, idx6, idx7], + [exp1, exp2, exp3, exp4, + exp5, exp6, exp7]): + result = repr(Series(idx)) + assert result == expected + + def test_dti_summary(self): + # GH#9116 + idx1 = DatetimeIndex([], freq='D') + idx2 = DatetimeIndex(['2011-01-01'], freq='D') + idx3 = DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D') + idx4 = DatetimeIndex( + ['2011-01-01', '2011-01-02', '2011-01-03'], freq='D') + idx5 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', + '2011-01-01 11:00'], + freq='H', tz='Asia/Tokyo') + idx6 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], + tz='US/Eastern') + + exp1 = ("DatetimeIndex: 0 entries\n" + "Freq: D") + + exp2 = ("DatetimeIndex: 1 entries, 2011-01-01 to 2011-01-01\n" + "Freq: D") + + exp3 = ("DatetimeIndex: 2 entries, 2011-01-01 to 2011-01-02\n" + "Freq: D") + + exp4 = ("DatetimeIndex: 3 entries, 2011-01-01 to 2011-01-03\n" + "Freq: D") + + exp5 = ("DatetimeIndex: 3 entries, 2011-01-01 09:00:00+09:00 " + "to 2011-01-01 11:00:00+09:00\n" + "Freq: H") + + exp6 = """DatetimeIndex: 3 entries, 2011-01-01 09:00:00-05:00 to NaT""" + + for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, idx6], + [exp1, exp2, exp3, exp4, exp5, exp6]): + result = idx.summary() + assert result == expected + + def test_dti_business_repr(self): + # only really care that it works + repr(pd.bdate_range(datetime(2009, 1, 1), datetime(2010, 1, 1))) + + def test_dti_business_summary(self): + rng = pd.bdate_range(datetime(2009, 1, 1), datetime(2010, 1, 1)) + rng.summary() + rng[2:2].summary() + + def test_dti_business_summary_pytz(self): + pd.bdate_range('1/1/2005', '1/1/2009', tz=pytz.utc).summary() + + def test_dti_business_summary_dateutil(self): + pd.bdate_range('1/1/2005', '1/1/2009', + tz=dateutil.tz.tzutc()).summary() + + def test_dti_custom_business_repr(self): + # only really care that it works + repr(pd.bdate_range(datetime(2009, 1, 1), 
datetime(2010, 1, 1), + freq='C')) + + def test_dti_custom_business_summary(self): + rng = pd.bdate_range(datetime(2009, 1, 1), datetime(2010, 1, 1), + freq='C') + rng.summary() + rng[2:2].summary() + + def test_dti_custom_business_summary_pytz(self): + pd.bdate_range('1/1/2005', '1/1/2009', freq='C', tz=pytz.utc).summary() + + def test_dti_custom_business_summary_dateutil(self): + pd.bdate_range('1/1/2005', '1/1/2009', freq='C', + tz=dateutil.tz.tzutc()).summary() diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 4a46c3b04bbad..2013b5e6cd6dd 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -83,16 +83,6 @@ def test_range_edges(self): '1970-01-03', '1970-01-04']) tm.assert_index_equal(idx, exp) - def test_datetimeindex_repr_short(self): - dr = date_range(start='1/1/2012', periods=1) - repr(dr) - - dr = date_range(start='1/1/2012', periods=2) - repr(dr) - - dr = date_range(start='1/1/2012', periods=3) - repr(dr) - class TestDatetime64(object): diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index bc43b427fe0aa..b42cd454803b8 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -1,6 +1,4 @@ -import pytz import pytest -import dateutil import warnings import numpy as np from datetime import datetime @@ -153,130 +151,6 @@ def test_repeat(self): tm.assert_raises_regex(ValueError, msg, np.repeat, rng, reps, axis=1) - def test_representation(self): - - idx = [] - idx.append(DatetimeIndex([], freq='D')) - idx.append(DatetimeIndex(['2011-01-01'], freq='D')) - idx.append(DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D')) - idx.append(DatetimeIndex( - ['2011-01-01', '2011-01-02', '2011-01-03'], freq='D')) - idx.append(DatetimeIndex( - ['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00' - ], freq='H', tz='Asia/Tokyo')) - idx.append(DatetimeIndex( - ['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], tz='US/Eastern')) - idx.append(DatetimeIndex( - ['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], tz='UTC')) - - exp = [] - exp.append("""DatetimeIndex([], dtype='datetime64[ns]', freq='D')""") - exp.append("DatetimeIndex(['2011-01-01'], dtype='datetime64[ns]', " - "freq='D')") - exp.append("DatetimeIndex(['2011-01-01', '2011-01-02'], " - "dtype='datetime64[ns]', freq='D')") - exp.append("DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], " - "dtype='datetime64[ns]', freq='D')") - exp.append("DatetimeIndex(['2011-01-01 09:00:00+09:00', " - "'2011-01-01 10:00:00+09:00', '2011-01-01 11:00:00+09:00']" - ", dtype='datetime64[ns, Asia/Tokyo]', freq='H')") - exp.append("DatetimeIndex(['2011-01-01 09:00:00-05:00', " - "'2011-01-01 10:00:00-05:00', 'NaT'], " - "dtype='datetime64[ns, US/Eastern]', freq=None)") - exp.append("DatetimeIndex(['2011-01-01 09:00:00+00:00', " - "'2011-01-01 10:00:00+00:00', 'NaT'], " - "dtype='datetime64[ns, UTC]', freq=None)""") - - with pd.option_context('display.width', 300): - for indx, expected in zip(idx, exp): - for func in ['__repr__', '__unicode__', '__str__']: - result = getattr(indx, func)() - assert result == expected - - def test_representation_to_series(self): - idx1 = DatetimeIndex([], freq='D') - idx2 = DatetimeIndex(['2011-01-01'], freq='D') - idx3 = DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D') - idx4 = DatetimeIndex( - ['2011-01-01', '2011-01-02', '2011-01-03'], freq='D') - idx5 = DatetimeIndex(['2011-01-01 09:00', 
'2011-01-01 10:00', - '2011-01-01 11:00'], freq='H', tz='Asia/Tokyo') - idx6 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], - tz='US/Eastern') - idx7 = DatetimeIndex(['2011-01-01 09:00', '2011-01-02 10:15']) - - exp1 = """Series([], dtype: datetime64[ns])""" - - exp2 = ("0 2011-01-01\n" - "dtype: datetime64[ns]") - - exp3 = ("0 2011-01-01\n" - "1 2011-01-02\n" - "dtype: datetime64[ns]") - - exp4 = ("0 2011-01-01\n" - "1 2011-01-02\n" - "2 2011-01-03\n" - "dtype: datetime64[ns]") - - exp5 = ("0 2011-01-01 09:00:00+09:00\n" - "1 2011-01-01 10:00:00+09:00\n" - "2 2011-01-01 11:00:00+09:00\n" - "dtype: datetime64[ns, Asia/Tokyo]") - - exp6 = ("0 2011-01-01 09:00:00-05:00\n" - "1 2011-01-01 10:00:00-05:00\n" - "2 NaT\n" - "dtype: datetime64[ns, US/Eastern]") - - exp7 = ("0 2011-01-01 09:00:00\n" - "1 2011-01-02 10:15:00\n" - "dtype: datetime64[ns]") - - with pd.option_context('display.width', 300): - for idx, expected in zip([idx1, idx2, idx3, idx4, - idx5, idx6, idx7], - [exp1, exp2, exp3, exp4, - exp5, exp6, exp7]): - result = repr(Series(idx)) - assert result == expected - - def test_summary(self): - # GH9116 - idx1 = DatetimeIndex([], freq='D') - idx2 = DatetimeIndex(['2011-01-01'], freq='D') - idx3 = DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D') - idx4 = DatetimeIndex( - ['2011-01-01', '2011-01-02', '2011-01-03'], freq='D') - idx5 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', - '2011-01-01 11:00'], - freq='H', tz='Asia/Tokyo') - idx6 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', pd.NaT], - tz='US/Eastern') - - exp1 = ("DatetimeIndex: 0 entries\n" - "Freq: D") - - exp2 = ("DatetimeIndex: 1 entries, 2011-01-01 to 2011-01-01\n" - "Freq: D") - - exp3 = ("DatetimeIndex: 2 entries, 2011-01-01 to 2011-01-02\n" - "Freq: D") - - exp4 = ("DatetimeIndex: 3 entries, 2011-01-01 to 2011-01-03\n" - "Freq: D") - - exp5 = ("DatetimeIndex: 3 entries, 2011-01-01 09:00:00+09:00 " - "to 2011-01-01 11:00:00+09:00\n" - "Freq: H") - - exp6 = """DatetimeIndex: 3 entries, 2011-01-01 09:00:00-05:00 to NaT""" - - for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, idx6], - [exp1, exp2, exp3, exp4, exp5, exp6]): - result = idx.summary() - assert result == expected - def test_resolution(self): for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', 'T', 'S', 'L', 'U'], @@ -544,10 +418,6 @@ def test_copy(self): repr(cp) tm.assert_index_equal(cp, self.rng) - def test_repr(self): - # only really care that it works - repr(self.rng) - def test_shift(self): shifted = self.rng.shift(5) assert shifted[0] == self.rng[5] @@ -565,16 +435,6 @@ def test_shift(self): shifted = rng.shift(1, freq=BDay()) assert shifted[0] == rng[0] + BDay() - def test_summary(self): - self.rng.summary() - self.rng[2:2].summary() - - def test_summary_pytz(self): - bdate_range('1/1/2005', '1/1/2009', tz=pytz.utc).summary() - - def test_summary_dateutil(self): - bdate_range('1/1/2005', '1/1/2009', tz=dateutil.tz.tzutc()).summary() - def test_equals(self): assert not self.rng.equals(list(self.rng)) @@ -612,10 +472,6 @@ def test_copy(self): repr(cp) tm.assert_index_equal(cp, self.rng) - def test_repr(self): - # only really care that it works - repr(self.rng) - def test_shift(self): shifted = self.rng.shift(5) @@ -640,16 +496,5 @@ def test_pickle_unpickle(self): unpickled = tm.round_trip_pickle(self.rng) assert unpickled.offset is not None - def test_summary(self): - self.rng.summary() - self.rng[2:2].summary() - - def test_summary_pytz(self): - bdate_range('1/1/2005', '1/1/2009', freq='C', 
tz=pytz.utc).summary() - - def test_summary_dateutil(self): - bdate_range('1/1/2005', '1/1/2009', freq='C', - tz=dateutil.tz.tzutc()).summary() - def test_equals(self): assert not self.rng.equals(list(self.rng)) diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py index 533481ce051f7..b1a1060bf86c4 100644 --- a/pandas/tests/indexes/period/test_formats.py +++ b/pandas/tests/indexes/period/test_formats.py @@ -1,6 +1,7 @@ from pandas import PeriodIndex import numpy as np +import pytest import pandas.util.testing as tm import pandas as pd @@ -46,3 +47,163 @@ def test_to_native_types(): result = index.to_native_types(na_rep='pandas') tm.assert_numpy_array_equal(result, expected) + + +class TestPeriodIndexRendering(object): + @pytest.mark.parametrize('method', ['__repr__', '__unicode__', '__str__']) + def test_representation(self, method): + # GH#7601 + idx1 = PeriodIndex([], freq='D') + idx2 = PeriodIndex(['2011-01-01'], freq='D') + idx3 = PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') + idx4 = PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], + freq='D') + idx5 = PeriodIndex(['2011', '2012', '2013'], freq='A') + idx6 = PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], + freq='H') + idx7 = pd.period_range('2013Q1', periods=1, freq="Q") + idx8 = pd.period_range('2013Q1', periods=2, freq="Q") + idx9 = pd.period_range('2013Q1', periods=3, freq="Q") + idx10 = PeriodIndex(['2011-01-01', '2011-02-01'], freq='3D') + + exp1 = """PeriodIndex([], dtype='period[D]', freq='D')""" + + exp2 = """PeriodIndex(['2011-01-01'], dtype='period[D]', freq='D')""" + + exp3 = ("PeriodIndex(['2011-01-01', '2011-01-02'], dtype='period[D]', " + "freq='D')") + + exp4 = ("PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], " + "dtype='period[D]', freq='D')") + + exp5 = ("PeriodIndex(['2011', '2012', '2013'], dtype='period[A-DEC]', " + "freq='A-DEC')") + + exp6 = ("PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], " + "dtype='period[H]', freq='H')") + + exp7 = ("PeriodIndex(['2013Q1'], dtype='period[Q-DEC]', " + "freq='Q-DEC')") + + exp8 = ("PeriodIndex(['2013Q1', '2013Q2'], dtype='period[Q-DEC]', " + "freq='Q-DEC')") + + exp9 = ("PeriodIndex(['2013Q1', '2013Q2', '2013Q3'], " + "dtype='period[Q-DEC]', freq='Q-DEC')") + + exp10 = ("PeriodIndex(['2011-01-01', '2011-02-01'], " + "dtype='period[3D]', freq='3D')") + + for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, + idx6, idx7, idx8, idx9, idx10], + [exp1, exp2, exp3, exp4, exp5, + exp6, exp7, exp8, exp9, exp10]): + result = getattr(idx, method)() + assert result == expected + + def test_representation_to_series(self): + # GH#10971 + idx1 = PeriodIndex([], freq='D') + idx2 = PeriodIndex(['2011-01-01'], freq='D') + idx3 = PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') + idx4 = PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], + freq='D') + idx5 = PeriodIndex(['2011', '2012', '2013'], freq='A') + idx6 = PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], + freq='H') + + idx7 = pd.period_range('2013Q1', periods=1, freq="Q") + idx8 = pd.period_range('2013Q1', periods=2, freq="Q") + idx9 = pd.period_range('2013Q1', periods=3, freq="Q") + + exp1 = """Series([], dtype: object)""" + + exp2 = """0 2011-01-01 +dtype: object""" + + exp3 = """0 2011-01-01 +1 2011-01-02 +dtype: object""" + + exp4 = """0 2011-01-01 +1 2011-01-02 +2 2011-01-03 +dtype: object""" + + exp5 = """0 2011 +1 2012 +2 2013 +dtype: object""" + + exp6 = """0 2011-01-01 09:00 +1 2012-02-01 10:00 +2 NaT 
+dtype: object""" + + exp7 = """0 2013Q1 +dtype: object""" + + exp8 = """0 2013Q1 +1 2013Q2 +dtype: object""" + + exp9 = """0 2013Q1 +1 2013Q2 +2 2013Q3 +dtype: object""" + + for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, + idx6, idx7, idx8, idx9], + [exp1, exp2, exp3, exp4, exp5, + exp6, exp7, exp8, exp9]): + result = repr(pd.Series(idx)) + assert result == expected + + def test_summary(self): + # GH#9116 + idx1 = PeriodIndex([], freq='D') + idx2 = PeriodIndex(['2011-01-01'], freq='D') + idx3 = PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') + idx4 = PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], + freq='D') + idx5 = PeriodIndex(['2011', '2012', '2013'], freq='A') + idx6 = PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], + freq='H') + + idx7 = pd.period_range('2013Q1', periods=1, freq="Q") + idx8 = pd.period_range('2013Q1', periods=2, freq="Q") + idx9 = pd.period_range('2013Q1', periods=3, freq="Q") + + exp1 = """PeriodIndex: 0 entries +Freq: D""" + + exp2 = """PeriodIndex: 1 entries, 2011-01-01 to 2011-01-01 +Freq: D""" + + exp3 = """PeriodIndex: 2 entries, 2011-01-01 to 2011-01-02 +Freq: D""" + + exp4 = """PeriodIndex: 3 entries, 2011-01-01 to 2011-01-03 +Freq: D""" + + exp5 = """PeriodIndex: 3 entries, 2011 to 2013 +Freq: A-DEC""" + + exp6 = """PeriodIndex: 3 entries, 2011-01-01 09:00 to NaT +Freq: H""" + + exp7 = """PeriodIndex: 1 entries, 2013Q1 to 2013Q1 +Freq: Q-DEC""" + + exp8 = """PeriodIndex: 2 entries, 2013Q1 to 2013Q2 +Freq: Q-DEC""" + + exp9 = """PeriodIndex: 3 entries, 2013Q1 to 2013Q3 +Freq: Q-DEC""" + + for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, + idx6, idx7, idx8, idx9], + [exp1, exp2, exp3, exp4, exp5, + exp6, exp7, exp8, exp9]): + result = idx.summary() + assert result == expected diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 6cb4226dffc5a..b913934195260 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -14,9 +14,6 @@ class TestGetItem(object): - def setup_method(self, method): - pass - def test_getitem(self): idx1 = pd.period_range('2011-01-01', '2011-01-31', freq='D', name='idx') diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 6c272864e0026..3b6641bc7ad5c 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -79,164 +79,6 @@ def test_numpy_minmax(self): tm.assert_raises_regex( ValueError, errmsg, np.argmax, pr, out=0) - def test_representation(self): - # GH 7601 - idx1 = PeriodIndex([], freq='D') - idx2 = PeriodIndex(['2011-01-01'], freq='D') - idx3 = PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') - idx4 = PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], - freq='D') - idx5 = PeriodIndex(['2011', '2012', '2013'], freq='A') - idx6 = PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', - 'NaT'], freq='H') - idx7 = pd.period_range('2013Q1', periods=1, freq="Q") - idx8 = pd.period_range('2013Q1', periods=2, freq="Q") - idx9 = pd.period_range('2013Q1', periods=3, freq="Q") - idx10 = PeriodIndex(['2011-01-01', '2011-02-01'], freq='3D') - - exp1 = """PeriodIndex([], dtype='period[D]', freq='D')""" - - exp2 = """PeriodIndex(['2011-01-01'], dtype='period[D]', freq='D')""" - - exp3 = ("PeriodIndex(['2011-01-01', '2011-01-02'], dtype='period[D]', " - "freq='D')") - - exp4 = ("PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], " - "dtype='period[D]', freq='D')") - - exp5 = 
("PeriodIndex(['2011', '2012', '2013'], dtype='period[A-DEC]', " - "freq='A-DEC')") - - exp6 = ("PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], " - "dtype='period[H]', freq='H')") - - exp7 = ("PeriodIndex(['2013Q1'], dtype='period[Q-DEC]', " - "freq='Q-DEC')") - - exp8 = ("PeriodIndex(['2013Q1', '2013Q2'], dtype='period[Q-DEC]', " - "freq='Q-DEC')") - - exp9 = ("PeriodIndex(['2013Q1', '2013Q2', '2013Q3'], " - "dtype='period[Q-DEC]', freq='Q-DEC')") - - exp10 = ("PeriodIndex(['2011-01-01', '2011-02-01'], " - "dtype='period[3D]', freq='3D')") - - for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, - idx6, idx7, idx8, idx9, idx10], - [exp1, exp2, exp3, exp4, exp5, - exp6, exp7, exp8, exp9, exp10]): - for func in ['__repr__', '__unicode__', '__str__']: - result = getattr(idx, func)() - assert result == expected - - def test_representation_to_series(self): - # GH 10971 - idx1 = PeriodIndex([], freq='D') - idx2 = PeriodIndex(['2011-01-01'], freq='D') - idx3 = PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') - idx4 = PeriodIndex(['2011-01-01', '2011-01-02', - '2011-01-03'], freq='D') - idx5 = PeriodIndex(['2011', '2012', '2013'], freq='A') - idx6 = PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', - 'NaT'], freq='H') - - idx7 = pd.period_range('2013Q1', periods=1, freq="Q") - idx8 = pd.period_range('2013Q1', periods=2, freq="Q") - idx9 = pd.period_range('2013Q1', periods=3, freq="Q") - - exp1 = """Series([], dtype: object)""" - - exp2 = """0 2011-01-01 -dtype: object""" - - exp3 = """0 2011-01-01 -1 2011-01-02 -dtype: object""" - - exp4 = """0 2011-01-01 -1 2011-01-02 -2 2011-01-03 -dtype: object""" - - exp5 = """0 2011 -1 2012 -2 2013 -dtype: object""" - - exp6 = """0 2011-01-01 09:00 -1 2012-02-01 10:00 -2 NaT -dtype: object""" - - exp7 = """0 2013Q1 -dtype: object""" - - exp8 = """0 2013Q1 -1 2013Q2 -dtype: object""" - - exp9 = """0 2013Q1 -1 2013Q2 -2 2013Q3 -dtype: object""" - - for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, - idx6, idx7, idx8, idx9], - [exp1, exp2, exp3, exp4, exp5, - exp6, exp7, exp8, exp9]): - result = repr(pd.Series(idx)) - assert result == expected - - def test_summary(self): - # GH9116 - idx1 = PeriodIndex([], freq='D') - idx2 = PeriodIndex(['2011-01-01'], freq='D') - idx3 = PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') - idx4 = PeriodIndex( - ['2011-01-01', '2011-01-02', '2011-01-03'], freq='D') - idx5 = PeriodIndex(['2011', '2012', '2013'], freq='A') - idx6 = PeriodIndex( - ['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], freq='H') - - idx7 = pd.period_range('2013Q1', periods=1, freq="Q") - idx8 = pd.period_range('2013Q1', periods=2, freq="Q") - idx9 = pd.period_range('2013Q1', periods=3, freq="Q") - - exp1 = """PeriodIndex: 0 entries -Freq: D""" - - exp2 = """PeriodIndex: 1 entries, 2011-01-01 to 2011-01-01 -Freq: D""" - - exp3 = """PeriodIndex: 2 entries, 2011-01-01 to 2011-01-02 -Freq: D""" - - exp4 = """PeriodIndex: 3 entries, 2011-01-01 to 2011-01-03 -Freq: D""" - - exp5 = """PeriodIndex: 3 entries, 2011 to 2013 -Freq: A-DEC""" - - exp6 = """PeriodIndex: 3 entries, 2011-01-01 09:00 to NaT -Freq: H""" - - exp7 = """PeriodIndex: 1 entries, 2013Q1 to 2013Q1 -Freq: Q-DEC""" - - exp8 = """PeriodIndex: 2 entries, 2013Q1 to 2013Q2 -Freq: Q-DEC""" - - exp9 = """PeriodIndex: 3 entries, 2013Q1 to 2013Q3 -Freq: Q-DEC""" - - for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, - idx6, idx7, idx8, idx9], - [exp1, exp2, exp3, exp4, exp5, - exp6, exp7, exp8, exp9]): - result = idx.summary() - assert result == expected - def 
test_resolution(self): for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', 'T', 'S', 'L', 'U'], diff --git a/pandas/tests/indexes/timedeltas/test_formats.py b/pandas/tests/indexes/timedeltas/test_formats.py new file mode 100644 index 0000000000000..a8375459d74e4 --- /dev/null +++ b/pandas/tests/indexes/timedeltas/test_formats.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- + +import pytest + +import pandas as pd +from pandas import TimedeltaIndex + + +class TestTimedeltaIndexRendering(object): + @pytest.mark.parametrize('method', ['__repr__', '__unicode__', '__str__']) + def test_representation(self, method): + idx1 = TimedeltaIndex([], freq='D') + idx2 = TimedeltaIndex(['1 days'], freq='D') + idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D') + idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D') + idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days']) + + exp1 = """TimedeltaIndex([], dtype='timedelta64[ns]', freq='D')""" + + exp2 = ("TimedeltaIndex(['1 days'], dtype='timedelta64[ns]', " + "freq='D')") + + exp3 = ("TimedeltaIndex(['1 days', '2 days'], " + "dtype='timedelta64[ns]', freq='D')") + + exp4 = ("TimedeltaIndex(['1 days', '2 days', '3 days'], " + "dtype='timedelta64[ns]', freq='D')") + + exp5 = ("TimedeltaIndex(['1 days 00:00:01', '2 days 00:00:00', " + "'3 days 00:00:00'], dtype='timedelta64[ns]', freq=None)") + + with pd.option_context('display.width', 300): + for idx, expected in zip([idx1, idx2, idx3, idx4, idx5], + [exp1, exp2, exp3, exp4, exp5]): + result = getattr(idx, method)() + assert result == expected + + def test_representation_to_series(self): + idx1 = TimedeltaIndex([], freq='D') + idx2 = TimedeltaIndex(['1 days'], freq='D') + idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D') + idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D') + idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days']) + + exp1 = """Series([], dtype: timedelta64[ns])""" + + exp2 = ("0 1 days\n" + "dtype: timedelta64[ns]") + + exp3 = ("0 1 days\n" + "1 2 days\n" + "dtype: timedelta64[ns]") + + exp4 = ("0 1 days\n" + "1 2 days\n" + "2 3 days\n" + "dtype: timedelta64[ns]") + + exp5 = ("0 1 days 00:00:01\n" + "1 2 days 00:00:00\n" + "2 3 days 00:00:00\n" + "dtype: timedelta64[ns]") + + with pd.option_context('display.width', 300): + for idx, expected in zip([idx1, idx2, idx3, idx4, idx5], + [exp1, exp2, exp3, exp4, exp5]): + result = repr(pd.Series(idx)) + assert result == expected + + def test_summary(self): + # GH#9116 + idx1 = TimedeltaIndex([], freq='D') + idx2 = TimedeltaIndex(['1 days'], freq='D') + idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D') + idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D') + idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days']) + + exp1 = ("TimedeltaIndex: 0 entries\n" + "Freq: D") + + exp2 = ("TimedeltaIndex: 1 entries, 1 days to 1 days\n" + "Freq: D") + + exp3 = ("TimedeltaIndex: 2 entries, 1 days to 2 days\n" + "Freq: D") + + exp4 = ("TimedeltaIndex: 3 entries, 1 days to 3 days\n" + "Freq: D") + + exp5 = ("TimedeltaIndex: 3 entries, 1 days 00:00:01 to 3 days " + "00:00:00") + + for idx, expected in zip([idx1, idx2, idx3, idx4, idx5], + [exp1, exp2, exp3, exp4, exp5]): + result = idx.summary() + assert result == expected diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index d154aa2172ef7..690ba66b6f5ef 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -73,94 +73,6 @@ def 
test_numpy_minmax(self): tm.assert_raises_regex( ValueError, errmsg, np.argmax, td, out=0) - def test_representation(self): - idx1 = TimedeltaIndex([], freq='D') - idx2 = TimedeltaIndex(['1 days'], freq='D') - idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D') - idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D') - idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days']) - - exp1 = """TimedeltaIndex([], dtype='timedelta64[ns]', freq='D')""" - - exp2 = ("TimedeltaIndex(['1 days'], dtype='timedelta64[ns]', " - "freq='D')") - - exp3 = ("TimedeltaIndex(['1 days', '2 days'], " - "dtype='timedelta64[ns]', freq='D')") - - exp4 = ("TimedeltaIndex(['1 days', '2 days', '3 days'], " - "dtype='timedelta64[ns]', freq='D')") - - exp5 = ("TimedeltaIndex(['1 days 00:00:01', '2 days 00:00:00', " - "'3 days 00:00:00'], dtype='timedelta64[ns]', freq=None)") - - with pd.option_context('display.width', 300): - for idx, expected in zip([idx1, idx2, idx3, idx4, idx5], - [exp1, exp2, exp3, exp4, exp5]): - for func in ['__repr__', '__unicode__', '__str__']: - result = getattr(idx, func)() - assert result == expected - - def test_representation_to_series(self): - idx1 = TimedeltaIndex([], freq='D') - idx2 = TimedeltaIndex(['1 days'], freq='D') - idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D') - idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D') - idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days']) - - exp1 = """Series([], dtype: timedelta64[ns])""" - - exp2 = """0 1 days -dtype: timedelta64[ns]""" - - exp3 = """0 1 days -1 2 days -dtype: timedelta64[ns]""" - - exp4 = """0 1 days -1 2 days -2 3 days -dtype: timedelta64[ns]""" - - exp5 = """0 1 days 00:00:01 -1 2 days 00:00:00 -2 3 days 00:00:00 -dtype: timedelta64[ns]""" - - with pd.option_context('display.width', 300): - for idx, expected in zip([idx1, idx2, idx3, idx4, idx5], - [exp1, exp2, exp3, exp4, exp5]): - result = repr(pd.Series(idx)) - assert result == expected - - def test_summary(self): - # GH9116 - idx1 = TimedeltaIndex([], freq='D') - idx2 = TimedeltaIndex(['1 days'], freq='D') - idx3 = TimedeltaIndex(['1 days', '2 days'], freq='D') - idx4 = TimedeltaIndex(['1 days', '2 days', '3 days'], freq='D') - idx5 = TimedeltaIndex(['1 days 00:00:01', '2 days', '3 days']) - - exp1 = ("TimedeltaIndex: 0 entries\n" - "Freq: D") - - exp2 = ("TimedeltaIndex: 1 entries, 1 days to 1 days\n" - "Freq: D") - - exp3 = ("TimedeltaIndex: 2 entries, 1 days to 2 days\n" - "Freq: D") - - exp4 = ("TimedeltaIndex: 3 entries, 1 days to 3 days\n" - "Freq: D") - - exp5 = ("TimedeltaIndex: 3 entries, 1 days 00:00:01 to 3 days " - "00:00:00") - - for idx, expected in zip([idx1, idx2, idx3, idx4, idx5], - [exp1, exp2, exp3, exp4, exp5]): - result = idx.summary() - assert result == expected - def test_value_counts_unique(self): # GH 7735 From ad230e8f458c288afd769553d67bd39534d4a1ea Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 18 Feb 2018 08:22:16 -0800 Subject: [PATCH 122/214] finish off tests.tseries.test_timezones (#19739) --- .../tests/scalar/timestamp/test_unary_ops.py | 24 ++++ pandas/tests/tseries/test_timezones.py | 108 ------------------ pandas/tests/tslibs/test_timezones.py | 31 +++++ 3 files changed, 55 insertions(+), 108 deletions(-) delete mode 100644 pandas/tests/tseries/test_timezones.py diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index 8a6989c909cb2..994ff86e6fdf9 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ 
b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -4,11 +4,13 @@ import pytest import pytz from pytz import utc +from dateutil.tz import gettz import pandas.util.testing as tm import pandas.util._test_decorators as td from pandas.compat import PY3 +from pandas._libs import tslib from pandas._libs.tslibs.frequencies import _INVALID_FREQ_ERROR from pandas import Timestamp, NaT @@ -215,6 +217,28 @@ def test_replace_tzinfo(self): assert result_dt == result_pd assert result_dt == result_pd.to_pydatetime() + @pytest.mark.parametrize('tz, normalize', [ + (pytz.timezone('US/Eastern'), lambda x: x.tzinfo.normalize(x)), + (gettz('US/Eastern'), lambda x: x)]) + def test_replace_across_dst(self, tz, normalize): + # GH#18319 check that 1) timezone is correctly normalized and + # 2) that hour is not incorrectly changed by this normalization + ts_naive = Timestamp('2017-12-03 16:03:30') + ts_aware = tslib._localize_pydatetime(ts_naive, tz) + + # Preliminary sanity-check + assert ts_aware == normalize(ts_aware) + + # Replace across DST boundary + ts2 = ts_aware.replace(month=6) + + # Check that `replace` preserves hour literal + assert (ts2.hour, ts2.minute) == (ts_aware.hour, ts_aware.minute) + + # Check that post-replace object is appropriately normalized + ts2b = normalize(ts2) + assert ts2 == ts2b + # -------------------------------------------------------------- @td.skip_if_windows diff --git a/pandas/tests/tseries/test_timezones.py b/pandas/tests/tseries/test_timezones.py deleted file mode 100644 index 97326dc04a522..0000000000000 --- a/pandas/tests/tseries/test_timezones.py +++ /dev/null @@ -1,108 +0,0 @@ -# pylint: disable-msg=E1101,W0612 -import pytest - -import pytz - -from datetime import datetime - -from pandas._libs.tslibs import timezones -from pandas import Timestamp - - -class TestTimeZoneSupportPytz(object): - - def tz(self, tz): - # Construct a timezone object from a string. Overridden in subclass to - # parameterize tests. - return pytz.timezone(tz) - - def tzstr(self, tz): - # Construct a timezone string from a string. Overridden in subclass to - # parameterize tests. - return tz - - def localize(self, tz, x): - return tz.localize(x) - - def normalize(self, ts): - tzinfo = ts.tzinfo - return tzinfo.normalize(ts) - - def cmptz(self, tz1, tz2): - # Compare two timezones. Overridden in subclass to parameterize - # tests. 
- return tz1.zone == tz2.zone - - # test utility methods - def test_infer_tz(self): - eastern = self.tz('US/Eastern') - utc = pytz.utc - - _start = datetime(2001, 1, 1) - _end = datetime(2009, 1, 1) - - start = self.localize(eastern, _start) - end = self.localize(eastern, _end) - assert (timezones.infer_tzinfo(start, end) is - self.localize(eastern, _start).tzinfo) - assert (timezones.infer_tzinfo(start, None) is - self.localize(eastern, _start).tzinfo) - assert (timezones.infer_tzinfo(None, end) is - self.localize(eastern, _end).tzinfo) - - start = utc.localize(_start) - end = utc.localize(_end) - assert (timezones.infer_tzinfo(start, end) is utc) - - end = self.localize(eastern, _end) - pytest.raises(Exception, timezones.infer_tzinfo, start, end) - pytest.raises(Exception, timezones.infer_tzinfo, end, start) - - def test_replace_across_dst(self): - # GH#18319 check that 1) timezone is correctly normalized and - # 2) that hour is not incorrectly changed by this normalization - tz = self.tz('US/Eastern') - - ts_naive = Timestamp('2017-12-03 16:03:30') - ts_aware = self.localize(tz, ts_naive) - - # Preliminary sanity-check - assert ts_aware == self.normalize(ts_aware) - - # Replace across DST boundary - ts2 = ts_aware.replace(month=6) - - # Check that `replace` preserves hour literal - assert (ts2.hour, ts2.minute) == (ts_aware.hour, ts_aware.minute) - - # Check that post-replace object is appropriately normalized - ts2b = self.normalize(ts2) - assert ts2 == ts2b - - -class TestTimeZoneSupportDateutil(TestTimeZoneSupportPytz): - - def tz(self, tz): - """ - Construct a dateutil timezone. - Use tslib.maybe_get_tz so that we get the filename on the tz right - on windows. See #7337. - """ - return timezones.maybe_get_tz('dateutil/' + tz) - - def tzstr(self, tz): - """ Construct a timezone string from a string. Overridden in subclass - to parameterize tests. """ - return 'dateutil/' + tz - - def cmptz(self, tz1, tz2): - """ Compare two timezones. Overridden in subclass to parameterize - tests. 
""" - return tz1 == tz2 - - def localize(self, tz, x): - return x.replace(tzinfo=tz) - - def normalize(self, ts): - # no-op for dateutil - return ts diff --git a/pandas/tests/tslibs/test_timezones.py b/pandas/tests/tslibs/test_timezones.py index 603c5e3fea26f..1bb355f267938 100644 --- a/pandas/tests/tslibs/test_timezones.py +++ b/pandas/tests/tslibs/test_timezones.py @@ -5,6 +5,7 @@ import pytz import dateutil.tz +from pandas._libs import tslib from pandas._libs.tslibs import timezones from pandas import Timestamp @@ -35,3 +36,33 @@ def test_tzlocal(): offset = dateutil.tz.tzlocal().utcoffset(datetime(2011, 1, 1)) offset = offset.total_seconds() * 1000000000 assert ts.value + offset == Timestamp('2011-01-01').value + + +@pytest.mark.parametrize('eastern, localize', [ + (pytz.timezone('US/Eastern'), lambda tz, x: tz.localize(x)), + (dateutil.tz.gettz('US/Eastern'), lambda tz, x: x.replace(tzinfo=tz))]) +def test_infer_tz(eastern, localize): + utc = pytz.utc + + start_naive = datetime(2001, 1, 1) + end_naive = datetime(2009, 1, 1) + + start = localize(eastern, start_naive) + end = localize(eastern, end_naive) + + assert (timezones.infer_tzinfo(start, end) is + tslib._localize_pydatetime(start_naive, eastern).tzinfo) + assert (timezones.infer_tzinfo(start, None) is + tslib._localize_pydatetime(start_naive, eastern).tzinfo) + assert (timezones.infer_tzinfo(None, end) is + tslib._localize_pydatetime(end_naive, eastern).tzinfo) + + start = utc.localize(start_naive) + end = utc.localize(end_naive) + assert timezones.infer_tzinfo(start, end) is utc + + end = tslib._localize_pydatetime(end_naive, eastern) + with pytest.raises(Exception): + timezones.infer_tzinfo(start, end) + with pytest.raises(Exception): + timezones.infer_tzinfo(end, start) From 64e155c18af76fd449d63ea1f22a2593cef72240 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 18 Feb 2018 08:36:24 -0800 Subject: [PATCH 123/214] dispatch frame methods to series versions instead of re-implementing masking etc (#19611) --- pandas/core/frame.py | 78 +++++--------------- pandas/core/indexes/base.py | 49 ++++--------- pandas/core/ops.py | 102 ++++++++++++++++++++++---- pandas/tests/frame/test_arithmetic.py | 17 +++++ 4 files changed, 138 insertions(+), 108 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a001037b573d4..b96af6af3707f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3944,34 +3944,27 @@ def _combine_frame(self, other, func, fill_value=None, level=None): new_index, new_columns = this.index, this.columns def _arith_op(left, right): + # for the mixed_type case where we iterate over columns, + # _arith_op(left, right) is equivalent to + # left._binop(right, func, fill_value=fill_value) left, right = ops.fill_binop(left, right, fill_value) return func(left, right) if this._is_mixed_type or other._is_mixed_type: - - # unique + # iterate over columns if this.columns.is_unique: - - def f(col): - r = _arith_op(this[col].values, other[col].values) - return self._constructor_sliced(r, index=new_index, - dtype=r.dtype) - - result = {col: f(col) for col in this} - - # non-unique + # unique columns + result = {col: _arith_op(this[col], other[col]) + for col in this} + result = self._constructor(result, index=new_index, + columns=new_columns, copy=False) else: - - def f(i): - r = _arith_op(this.iloc[:, i].values, - other.iloc[:, i].values) - return self._constructor_sliced(r, index=new_index, - dtype=r.dtype) - - result = {i: f(i) for i, col in enumerate(this.columns)} + # non-unique columns + 
result = {i: _arith_op(this.iloc[:, i], other.iloc[:, i]) + for i, col in enumerate(this.columns)} result = self._constructor(result, index=new_index, copy=False) result.columns = new_columns - return result + return result else: result = _arith_op(this.values, other.values) @@ -3979,36 +3972,11 @@ def f(i): return self._constructor(result, index=new_index, columns=new_columns, copy=False) - def _combine_series(self, other, func, fill_value=None, axis=None, - level=None, try_cast=True): - if fill_value is not None: - raise NotImplementedError("fill_value {fill} not supported." - .format(fill=fill_value)) - - if axis is not None: - axis = self._get_axis_name(axis) - if axis == 'index': - return self._combine_match_index(other, func, level=level) - else: - return self._combine_match_columns(other, func, level=level, - try_cast=try_cast) - else: - if not len(other): - return self * np.nan - - if not len(self): - # Ambiguous case, use _series so works with DataFrame - return self._constructor(data=self._series, index=self.index, - columns=self.columns) - - # default axis is columns - return self._combine_match_columns(other, func, level=level, - try_cast=try_cast) - def _combine_match_index(self, other, func, level=None): left, right = self.align(other, join='outer', axis=0, level=level, copy=False) - return self._constructor(func(left.values.T, right.values).T, + new_data = func(left.values.T, right.values).T + return self._constructor(new_data, index=left.index, columns=self.columns, copy=False) @@ -4027,7 +3995,8 @@ def _combine_const(self, other, func, errors='raise', try_cast=True): try_cast=try_cast) return self._constructor(new_data) - def _compare_frame_evaluate(self, other, func, str_rep, try_cast=True): + def _compare_frame(self, other, func, str_rep, try_cast=True): + # compare_frame assumes self._indexed_same(other) import pandas.core.computation.expressions as expressions # unique @@ -4052,19 +4021,6 @@ def _compare(a, b): result.columns = self.columns return result - def _compare_frame(self, other, func, str_rep, try_cast=True): - if not self._indexed_same(other): - raise ValueError('Can only compare identically-labeled ' - 'DataFrame objects') - return self._compare_frame_evaluate(other, func, str_rep, - try_cast=try_cast) - - def _flex_compare_frame(self, other, func, str_rep, level, try_cast=True): - if not self._indexed_same(other): - self, other = self.align(other, 'outer', level=level, copy=False) - return self._compare_frame_evaluate(other, func, str_rep, - try_cast=try_cast) - def combine(self, other, func, fill_value=None, overwrite=True): """ Add two DataFrame objects and do not propagate NaN values, so if for a diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 02dd2dbc25703..7dfa34bd634ad 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -55,7 +55,7 @@ import pandas.core.algorithms as algos import pandas.core.sorting as sorting from pandas.io.formats.printing import pprint_thing -from pandas.core.ops import _comp_method_OBJECT_ARRAY +from pandas.core.ops import _comp_method_OBJECT_ARRAY, make_invalid_op from pandas.core.config import get_option from pandas.core.strings import StringMethods @@ -82,26 +82,6 @@ def _try_get_item(x): return x -def _make_invalid_op(name): - """ - Return a binary method that always raises a TypeError. 
- - Parameters - ---------- - name : str - - Returns - ------- - invalid_op : function - """ - def invalid_op(self, other=None): - raise TypeError("cannot perform {name} with this index type: " - "{typ}".format(name=name, typ=type(self))) - - invalid_op.__name__ = name - return invalid_op - - class InvalidIndexError(Exception): pass @@ -3994,22 +3974,23 @@ def _evaluate_compare(self, other): @classmethod def _add_numeric_methods_add_sub_disabled(cls): """ add in the numeric add/sub methods to disable """ - cls.__add__ = cls.__radd__ = __iadd__ = _make_invalid_op('__add__') # noqa - cls.__sub__ = __isub__ = _make_invalid_op('__sub__') # noqa + cls.__add__ = cls.__radd__ = __iadd__ = make_invalid_op('__add__') # noqa + cls.__sub__ = __isub__ = make_invalid_op('__sub__') # noqa @classmethod def _add_numeric_methods_disabled(cls): """ add in numeric methods to disable other than add/sub """ - cls.__pow__ = cls.__rpow__ = _make_invalid_op('__pow__') - cls.__mul__ = cls.__rmul__ = _make_invalid_op('__mul__') - cls.__floordiv__ = cls.__rfloordiv__ = _make_invalid_op('__floordiv__') - cls.__truediv__ = cls.__rtruediv__ = _make_invalid_op('__truediv__') + cls.__pow__ = make_invalid_op('__pow__') + cls.__rpow__ = make_invalid_op('__rpow__') + cls.__mul__ = cls.__rmul__ = make_invalid_op('__mul__') + cls.__floordiv__ = cls.__rfloordiv__ = make_invalid_op('__floordiv__') + cls.__truediv__ = cls.__rtruediv__ = make_invalid_op('__truediv__') if not compat.PY3: - cls.__div__ = cls.__rdiv__ = _make_invalid_op('__div__') - cls.__neg__ = _make_invalid_op('__neg__') - cls.__pos__ = _make_invalid_op('__pos__') - cls.__abs__ = _make_invalid_op('__abs__') - cls.__inv__ = _make_invalid_op('__inv__') + cls.__div__ = cls.__rdiv__ = make_invalid_op('__div__') + cls.__neg__ = make_invalid_op('__neg__') + cls.__pos__ = make_invalid_op('__pos__') + cls.__abs__ = make_invalid_op('__abs__') + cls.__inv__ = make_invalid_op('__inv__') def _maybe_update_attributes(self, attrs): """ Update Index attributes (e.g. freq) depending on op """ @@ -4207,8 +4188,8 @@ def logical_func(self, *args, **kwargs): @classmethod def _add_logical_methods_disabled(cls): """ add in logical methods to disable """ - cls.all = _make_invalid_op('all') - cls.any = _make_invalid_op('any') + cls.all = make_invalid_op('all') + cls.any = make_invalid_op('any') Index._add_numeric_methods_disabled() diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 4c234ccb4dd47..fd4fc5540fcec 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -96,6 +96,26 @@ def rxor(left, right): # ----------------------------------------------------------------------------- +def make_invalid_op(name): + """ + Return a binary method that always raises a TypeError. + + Parameters + ---------- + name : str + + Returns + ------- + invalid_op : function + """ + def invalid_op(self, other=None): + raise TypeError("cannot perform {name} with this index type: " + "{typ}".format(name=name, typ=type(self).__name__)) + + invalid_op.__name__ = name + return invalid_op + + def _gen_eval_kwargs(name): """ Find the keyword arguments to pass to numexpr for the given operation. 
@@ -1047,8 +1067,8 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): elif isinstance(other, (np.ndarray, list, tuple)): if len(other) != len(self): raise ValueError('Lengths must be equal') - return self._binop(self._constructor(other, self.index), op, - level=level, fill_value=fill_value) + other = self._constructor(other, self.index) + return self._binop(other, op, level=level, fill_value=fill_value) else: if fill_value is not None: self = self.fillna(fill_value) @@ -1071,6 +1091,51 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): # ----------------------------------------------------------------------------- # DataFrame +def _combine_series_frame(self, other, func, fill_value=None, axis=None, + level=None, try_cast=True): + """ + Apply binary operator `func` to self, other using alignment and fill + conventions determined by the fill_value, axis, level, and try_cast kwargs. + + Parameters + ---------- + self : DataFrame + other : Series + func : binary operator + fill_value : object, default None + axis : {0, 1, 'columns', 'index', None}, default None + level : int or None, default None + try_cast : bool, default True + + Returns + ------- + result : DataFrame + """ + if fill_value is not None: + raise NotImplementedError("fill_value {fill} not supported." + .format(fill=fill_value)) + + if axis is not None: + axis = self._get_axis_number(axis) + if axis == 0: + return self._combine_match_index(other, func, level=level) + else: + return self._combine_match_columns(other, func, level=level, + try_cast=try_cast) + else: + if not len(other): + return self * np.nan + + if not len(self): + # Ambiguous case, use _series so works with DataFrame + return self._constructor(data=self._series, index=self.index, + columns=self.columns) + + # default axis is columns + return self._combine_match_columns(other, func, level=level, + try_cast=try_cast) + + def _align_method_FRAME(left, right, axis): """ convert rhs to meet lhs dims if input is list, tuple or np.ndarray """ @@ -1179,8 +1244,9 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): if isinstance(other, ABCDataFrame): # Another DataFrame return self._combine_frame(other, na_op, fill_value, level) elif isinstance(other, ABCSeries): - return self._combine_series(other, na_op, fill_value, axis, level, - try_cast=True) + return _combine_series_frame(self, other, na_op, + fill_value=fill_value, axis=axis, + level=level, try_cast=True) else: if fill_value is not None: self = self.fillna(fill_value) @@ -1209,13 +1275,17 @@ def f(self, other, axis=default_axis, level=None): other = _align_method_FRAME(self, other, axis) - if isinstance(other, ABCDataFrame): # Another DataFrame - return self._flex_compare_frame(other, na_op, str_rep, level, - try_cast=False) + if isinstance(other, ABCDataFrame): + # Another DataFrame + if not self._indexed_same(other): + self, other = self.align(other, 'outer', + level=level, copy=False) + return self._compare_frame(other, na_op, str_rep, try_cast=False) elif isinstance(other, ABCSeries): - return self._combine_series(other, na_op, None, axis, level, - try_cast=False) + return _combine_series_frame(self, other, na_op, + fill_value=None, axis=axis, + level=level, try_cast=False) else: return self._combine_const(other, na_op, try_cast=False) @@ -1227,11 +1297,17 @@ def f(self, other, axis=default_axis, level=None): def _comp_method_FRAME(func, name, str_rep): @Appender('Wrapper for comparison method {name}'.format(name=name)) def f(self, other): - if 
isinstance(other, ABCDataFrame): # Another DataFrame - return self._compare_frame(other, func, str_rep) + if isinstance(other, ABCDataFrame): + # Another DataFrame + if not self._indexed_same(other): + raise ValueError('Can only compare identically-labeled ' + 'DataFrame objects') + return self._compare_frame(other, func, str_rep, try_cast=True) + elif isinstance(other, ABCSeries): - return self._combine_series(other, func, - axis=None, try_cast=False) + return _combine_series_frame(self, other, func, + fill_value=None, axis=None, + level=None, try_cast=False) else: # straight boolean comparisons we want to allow all columns diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index a3a799aed1c55..65afe85628f8e 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -72,6 +72,23 @@ def test_tz_aware_scalar_comparison(self, timestamps): # ------------------------------------------------------------------- # Arithmetic +class TestFrameFlexArithmetic(object): + def test_df_add_flex_filled_mixed_dtypes(self): + # GH#19611 + dti = pd.date_range('2016-01-01', periods=3) + ser = pd.Series(['1 Day', 'NaT', '2 Days'], dtype='timedelta64[ns]') + df = pd.DataFrame({'A': dti, 'B': ser}) + other = pd.DataFrame({'A': ser, 'B': ser}) + fill = pd.Timedelta(days=1).to_timedelta64() + result = df.add(other, fill_value=fill) + + expected = pd.DataFrame( + {'A': pd.Series(['2016-01-02', '2016-01-03', '2016-01-05'], + dtype='datetime64[ns]'), + 'B': ser * 2}) + tm.assert_frame_equal(result, expected) + + class TestFrameMulDiv(object): """Tests for DataFrame multiplication and division""" # ------------------------------------------------------------------ From c49cd54828c363b42ffd3cd2fad971a9f09bef9b Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sun, 18 Feb 2018 08:39:49 -0800 Subject: [PATCH 124/214] Removed if...else for K > 1 (#19734) --- pandas/_libs/groupby_helper.pxi.in | 176 ++++++++--------------------- 1 file changed, 47 insertions(+), 129 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index fe4d31516d839..93fbb4477e2d0 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -56,36 +56,19 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, with nogil: - if K > 1: - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - - else: - - for i in range(N): - lab = labels[i] - if lab < 0: - continue + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - val = values[i, 0] + counts[lab] += 1 + for j in range(K): + val = values[i, j] # not nan if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val + nobs[lab, j] += 1 + sumx[lab, j] += val for i in range(ncounts): for j in range(K): @@ -119,33 +102,19 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, N, K = ( values).shape with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - prodx[lab, j] *= val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - val = values[i, 0] + counts[lab] += 1 + for j in range(K): + val = values[i, j] # not nan if val == 
val: - nobs[lab, 0] += 1 - prodx[lab, 0] *= val + nobs[lab, j] += 1 + prodx[lab, j] *= val for i in range(ncounts): for j in range(K): @@ -231,31 +200,18 @@ def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, N, K = ( values).shape with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - val = values[i, 0] + counts[lab] += 1 + for j in range(K): + val = values[i, j] # not nan if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val + nobs[lab, j] += 1 + sumx[lab, j] += val for i in range(ncounts): for j in range(K): @@ -670,33 +626,14 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, N, K = ( values).shape with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - {{if name == 'int64'}} - if val != {{nan_val}}: - {{else}} - if val == val and val != {{nan_val}}: - {{endif}} - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - val = values[i, 0] + counts[lab] += 1 + for j in range(K): + val = values[i, j] # not nan {{if name == 'int64'}} @@ -704,9 +641,9 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, {{else}} if val == val and val != {{nan_val}}: {{endif}} - nobs[lab, 0] += 1 - if val > maxx[lab, 0]: - maxx[lab, 0] = val + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val for i in range(ncounts): for j in range(K): @@ -744,33 +681,14 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, N, K = ( values).shape with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - {{if name == 'int64'}} - if val != {{nan_val}}: - {{else}} - if val == val and val != {{nan_val}}: - {{endif}} - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + for i in range(N): + lab = labels[i] + if lab < 0: + continue - counts[lab] += 1 - val = values[i, 0] + counts[lab] += 1 + for j in range(K): + val = values[i, j] # not nan {{if name == 'int64'}} @@ -778,9 +696,9 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, {{else}} if val == val and val != {{nan_val}}: {{endif}} - nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val for i in range(ncounts): for j in range(K): From c4770c74198ccc76c5a7667c920dbeb7fb2af2b0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 18 Feb 2018 08:47:29 -0800 Subject: [PATCH 125/214] Dispatch categorical Series ops to Categorical (#19582) --- doc/source/whatsnew/v0.23.0.txt | 2 + pandas/core/arrays/categorical.py | 3 ++ pandas/core/indexes/category.py | 25 ++++++--- pandas/core/ops.py | 72 +++++++++++++++----------- pandas/tests/indexes/common.py | 1 + pandas/tests/series/test_arithmetic.py | 34 ++++++++++++ 6 files changed, 99 insertions(+), 38 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 
a2198d9103528..11c49995372f5 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -849,3 +849,5 @@ Other ^^^^^ - Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`) +- Comparisons between :class:`Series` and :class:`Index` would return a ``Series`` with an incorrect name, ignoring the ``Index``'s name attribute (:issue:`19582`) +- diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index bcf9cb7646704..7354115f8295e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -53,6 +53,9 @@ def f(self, other): # results depending whether categories are the same or not is kind of # insane, so be a bit stricter here and use the python3 idea of # comparing only things of equal type. + if isinstance(other, ABCSeries): + return NotImplemented + if not self.ordered: if op in ['__lt__', '__gt__', '__le__', '__ge__']: raise TypeError("Unordered Categoricals can only compare " diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index a4d0f787cc6ec..218851b1713f2 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -1,3 +1,5 @@ +import operator + import numpy as np from pandas._libs import index as libindex @@ -738,7 +740,9 @@ def _codes_for_groupby(self, sort): def _add_comparison_methods(cls): """ add in comparison methods """ - def _make_compare(opname): + def _make_compare(op): + opname = '__{op}__'.format(op=op.__name__) + def _evaluate_compare(self, other): # if we have a Categorical type, then must have the same @@ -761,16 +765,21 @@ def _evaluate_compare(self, other): "have the same categories and ordered " "attributes") - return getattr(self.values, opname)(other) + result = op(self.values, other) + if isinstance(result, ABCSeries): + # Dispatch to pd.Categorical returned NotImplemented + # and we got a Series back; down-cast to ndarray + result = result.values + return result return compat.set_function_name(_evaluate_compare, opname, cls) - cls.__eq__ = _make_compare('__eq__') - cls.__ne__ = _make_compare('__ne__') - cls.__lt__ = _make_compare('__lt__') - cls.__gt__ = _make_compare('__gt__') - cls.__le__ = _make_compare('__le__') - cls.__ge__ = _make_compare('__ge__') + cls.__eq__ = _make_compare(operator.eq) + cls.__ne__ = _make_compare(operator.ne) + cls.__lt__ = _make_compare(operator.lt) + cls.__gt__ = _make_compare(operator.gt) + cls.__le__ = _make_compare(operator.le) + cls.__ge__ = _make_compare(operator.ge) def _delegate_method(self, name, *args, **kwargs): """ method delegation to the ._values """ diff --git a/pandas/core/ops.py b/pandas/core/ops.py index fd4fc5540fcec..dff2b6844af94 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -819,7 +819,7 @@ def dispatch_to_index_op(op, left, right, index_class): # avoid accidentally allowing integer add/sub. For datetime64[tz] dtypes, # left_idx may inherit a freq from a cached DatetimeIndex. # See discussion in GH#19147. 
- if left_idx.freq is not None: + if getattr(left_idx, 'freq', None) is not None: left_idx = left_idx._shallow_copy(freq=None) try: result = op(left_idx, right) @@ -867,9 +867,8 @@ def na_op(x, y): # dispatch to the categorical if we have a categorical # in either operand - if is_categorical_dtype(x): - return op(x, y) - elif is_categorical_dtype(y) and not is_scalar(y): + if is_categorical_dtype(y) and not is_scalar(y): + # The `not is_scalar(y)` check excludes the string "category" return op(y, x) elif is_object_dtype(x.dtype): @@ -917,17 +916,36 @@ def wrapper(self, other, axis=None): if axis is not None: self._get_axis_number(axis) + res_name = _get_series_op_result_name(self, other) + if isinstance(other, ABCDataFrame): # pragma: no cover # Defer to DataFrame implementation; fail early return NotImplemented + elif isinstance(other, ABCSeries) and not self._indexed_same(other): + raise ValueError("Can only compare identically-labeled " + "Series objects") + + elif is_categorical_dtype(self): + # Dispatch to Categorical implementation; pd.CategoricalIndex + # behavior is non-canonical GH#19513 + res_values = dispatch_to_index_op(op, self, other, pd.Categorical) + return self._constructor(res_values, index=self.index, + name=res_name) + + elif is_timedelta64_dtype(self): + res_values = dispatch_to_index_op(op, self, other, + pd.TimedeltaIndex) + return self._constructor(res_values, index=self.index, + name=res_name) + elif isinstance(other, ABCSeries): - name = com._maybe_match_name(self, other) - if not self._indexed_same(other): - msg = 'Can only compare identically-labeled Series objects' - raise ValueError(msg) + # By this point we have checked that self._indexed_same(other) res_values = na_op(self.values, other.values) - return self._constructor(res_values, index=self.index, name=name) + # rename is needed in case res_name is None and res_values.name + # is not. + return self._constructor(res_values, index=self.index, + name=res_name).rename(res_name) elif isinstance(other, (np.ndarray, pd.Index)): # do not check length of zerodim array @@ -937,15 +955,17 @@ def wrapper(self, other, axis=None): raise ValueError('Lengths must match to compare') res_values = na_op(self.values, np.asarray(other)) - return self._constructor(res_values, - index=self.index).__finalize__(self) - - elif (isinstance(other, pd.Categorical) and - not is_categorical_dtype(self)): - raise TypeError("Cannot compare a Categorical for op {op} with " - "Series of dtype {typ}.\nIf you want to compare " - "values, use 'series np.asarray(other)'." - .format(op=op, typ=self.dtype)) + result = self._constructor(res_values, index=self.index) + # rename is needed in case res_name is None and self.name + # is not. + return result.__finalize__(self).rename(res_name) + + elif isinstance(other, pd.Categorical): + # ordering of checks matters; by this point we know + # that not is_categorical_dtype(self) + res_values = op(self.values, other) + return self._constructor(res_values, index=self.index, + name=res_name) elif is_scalar(other) and isna(other): # numpy does not like comparisons vs None @@ -956,16 +976,9 @@ def wrapper(self, other, axis=None): return self._constructor(res_values, index=self.index, name=self.name, dtype='bool') - if is_categorical_dtype(self): - # cats are a special case as get_values() would return an ndarray, - # which would then not take categories ordering into account - # we can go directly to op, as the na_op would just test again and - # dispatch to it. 
- with np.errstate(all='ignore'): - res = op(self.values, other) else: values = self.get_values() - if isinstance(other, (list, np.ndarray)): + if isinstance(other, list): other = np.asarray(other) with np.errstate(all='ignore'): @@ -975,10 +988,9 @@ def wrapper(self, other, axis=None): .format(typ=type(other))) # always return a full value series here - res = com._values_from_object(res) - - res = pd.Series(res, index=self.index, name=self.name, dtype='bool') - return res + res_values = com._values_from_object(res) + return pd.Series(res_values, index=self.index, + name=res_name, dtype='bool') return wrapper diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 2d8d70aa2ac84..1162662bf9a08 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -790,6 +790,7 @@ def test_equals_op(self): series_d = Series(array_d) with tm.assert_raises_regex(ValueError, "Lengths must match"): index_a == series_b + tm.assert_numpy_array_equal(index_a == series_a, expected1) tm.assert_numpy_array_equal(index_a == series_c, expected2) diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 94da97ef45301..f727edf8fb7d8 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -43,6 +43,40 @@ def test_ser_flex_cmp_return_dtypes_empty(self, opname): result = getattr(empty, opname)(const).get_dtype_counts() tm.assert_series_equal(result, Series([1], ['bool'])) + @pytest.mark.parametrize('op', [operator.eq, operator.ne, + operator.le, operator.lt, + operator.ge, operator.gt]) + @pytest.mark.parametrize('names', [(None, None, None), + ('foo', 'bar', None), + ('baz', 'baz', 'baz')]) + def test_ser_cmp_result_names(self, names, op): + # datetime64 dtype + dti = pd.date_range('1949-06-07 03:00:00', + freq='H', periods=5, name=names[0]) + ser = Series(dti).rename(names[1]) + result = op(ser, dti) + assert result.name == names[2] + + # datetime64tz dtype + dti = dti.tz_localize('US/Central') + ser = Series(dti).rename(names[1]) + result = op(ser, dti) + assert result.name == names[2] + + # timedelta64 dtype + tdi = dti - dti.shift(1) + ser = Series(tdi).rename(names[1]) + result = op(ser, tdi) + assert result.name == names[2] + + # categorical + if op in [operator.eq, operator.ne]: + # categorical dtype comparisons raise for inequalities + cidx = tdi.astype('category') + ser = Series(cidx).rename(names[1]) + result = op(ser, cidx) + assert result.name == names[2] + class TestTimestampSeriesComparison(object): def test_dt64ser_cmp_period_scalar(self): From c086a518fe728deda168a4f800e4224e497ad19a Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Sun, 18 Feb 2018 17:14:38 +0000 Subject: [PATCH 126/214] DOC/BLD: Pinning sphinx to 1.5, as 1.7 has been released and it's incompatible with vendored numpydoc (#19743) --- ci/requirements_dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt index 82f8de277c57b..a474658fa2922 100644 --- a/ci/requirements_dev.txt +++ b/ci/requirements_dev.txt @@ -7,4 +7,4 @@ pytest>=3.1 python-dateutil>=2.5.0 pytz setuptools>=3.3 -sphinx +sphinx=1.5* From 5bb5e33ffe01eb7c82e8eed327f8745cfacdaada Mon Sep 17 00:00:00 2001 From: ZhuBaohe Date: Mon, 19 Feb 2018 01:46:53 +0800 Subject: [PATCH 127/214] DOC: correct merge_asof example (#19737) --- pandas/core/reshape/merge.py | 4 +-- pandas/tests/reshape/merge/test_merge_asof.py | 29 +++++++++++++++---- 2 files changed, 26 insertions(+), 7 
deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 4b99b0407cfcc..7b1a0875bba59 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -457,8 +457,8 @@ def merge_asof(left, right, on=None, time ticker price quantity bid ask 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 - 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93 - 3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93 + 2 2016-05-25 13:30:00.048 GOOG 720.77 100 NaN NaN + 3 2016-05-25 13:30:00.048 GOOG 720.92 100 NaN NaN 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN See also diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 2f48aef1894a9..cebbcc41c3e17 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -92,11 +92,30 @@ def test_examples2(self): by='ticker', tolerance=pd.Timedelta('2ms')) - pd.merge_asof(trades, quotes, - on='time', - by='ticker', - tolerance=pd.Timedelta('10ms'), - allow_exact_matches=False) + expected = pd.DataFrame({ + 'time': pd.to_datetime(['20160525 13:30:00.023', + '20160525 13:30:00.038', + '20160525 13:30:00.048', + '20160525 13:30:00.048', + '20160525 13:30:00.048']), + 'ticker': ['MSFT', 'MSFT', 'GOOG', 'GOOG', 'AAPL'], + 'price': [51.95, 51.95, + 720.77, 720.92, 98.00], + 'quantity': [75, 155, + 100, 100, 100], + 'bid': [np.nan, 51.97, np.nan, + np.nan, np.nan], + 'ask': [np.nan, 51.98, np.nan, + np.nan, np.nan]}, + columns=['time', 'ticker', 'price', 'quantity', + 'bid', 'ask']) + + result = pd.merge_asof(trades, quotes, + on='time', + by='ticker', + tolerance=pd.Timedelta('10ms'), + allow_exact_matches=False) + assert_frame_equal(result, expected) def test_examples3(self): """ doc-string examples """ From 9df5ab781f903ce00407630aa9db661e3f739bcf Mon Sep 17 00:00:00 2001 From: Thomas A Caswell Date: Sun, 18 Feb 2018 18:32:33 -0500 Subject: [PATCH 128/214] FIX: const-correctness in numpy helpers (#19749) In python 3.7 the return type of PyUnicode_AsUTF8 changed from (char *) to (const char *). PyUnicode_FromString also takes (const char *) as input, also be explicit about that. 
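A minimal sketch of the const-correct usage (illustrative only; the helper name below is hypothetical, and the sketch assumes nothing beyond the documented CPython C-API signatures):

    #include <Python.h>

    /* On Python >= 3.7, PyUnicode_AsUTF8 returns (const char *); using a
       const-qualified local compiles cleanly on both old and new Pythons,
       since assigning (char *) to (const char *) is always allowed.
       PyUnicode_FromString already accepts (const char *), so no cast is
       needed to rebuild a str from the borrowed UTF-8 buffer. */
    static PyObject *roundtrip_utf8(PyObject *obj) {
        const char *data = PyUnicode_AsUTF8(obj);   /* buffer owned by obj */
        if (data == NULL) {
            return NULL;                            /* propagate encoding error */
        }
        return PyUnicode_FromString(data);          /* makes its own copy */
    }
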
https://bugs.python.org/issue28769 commit 2a404b63d48d73bbaa007d89efb7a01048475acd in cpython --- pandas/_libs/src/numpy_helper.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/numpy_helper.h b/pandas/_libs/src/numpy_helper.h index 844be9b292be3..5cfa51dc8a0be 100644 --- a/pandas/_libs/src/numpy_helper.h +++ b/pandas/_libs/src/numpy_helper.h @@ -32,7 +32,7 @@ PANDAS_INLINE PyObject* get_value_1d(PyArrayObject* ap, Py_ssize_t i) { // returns ASCII or UTF8 (py3) view on python str // python object owns memory, should not be freed -PANDAS_INLINE char* get_c_string(PyObject* obj) { +PANDAS_INLINE const char* get_c_string(PyObject* obj) { #if PY_VERSION_HEX >= 0x03000000 return PyUnicode_AsUTF8(obj); #else @@ -40,7 +40,7 @@ PANDAS_INLINE char* get_c_string(PyObject* obj) { #endif } -PANDAS_INLINE PyObject* char_to_string(char* data) { +PANDAS_INLINE PyObject* char_to_string(const char* data) { #if PY_VERSION_HEX >= 0x03000000 return PyUnicode_FromString(data); #else From 997638574d6a6bb5017f0e4f1cca4545e4be28d5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 19 Feb 2018 14:18:17 +0100 Subject: [PATCH 129/214] DOC/BLD: update vendored IPython.sphinxext version (#19765) updated to commit cc353b25b0fff58e4ed13899df9b3c8153df01d9 from ipython/ipython --- .../ipython_console_highlighting.py | 120 +------ .../ipython_sphinxext/ipython_directive.py | 339 +++++++++++------- 2 files changed, 225 insertions(+), 234 deletions(-) diff --git a/doc/sphinxext/ipython_sphinxext/ipython_console_highlighting.py b/doc/sphinxext/ipython_sphinxext/ipython_console_highlighting.py index c5ec26aefd442..b93a151fb3cb0 100644 --- a/doc/sphinxext/ipython_sphinxext/ipython_console_highlighting.py +++ b/doc/sphinxext/ipython_sphinxext/ipython_console_highlighting.py @@ -1,116 +1,28 @@ -"""reST directive for syntax-highlighting ipython interactive sessions. - -XXX - See what improvements can be made based on the new (as of Sept 2009) -'pycon' lexer for the python console. At the very least it will give better -highlighted tracebacks. """ +reST directive for syntax-highlighting ipython interactive sessions. -#----------------------------------------------------------------------------- -# Needed modules - -# Standard library -import re - -# Third party -from pygments.lexer import Lexer, do_insertions -from pygments.lexers.agile import (PythonConsoleLexer, PythonLexer, - PythonTracebackLexer) -from pygments.token import Comment, Generic +""" from sphinx import highlighting - -#----------------------------------------------------------------------------- -# Global constants -line_re = re.compile('.*?\n') - -#----------------------------------------------------------------------------- -# Code begins - classes and functions - - -class IPythonConsoleLexer(Lexer): - - """ - For IPython console output or doctests, such as: - - .. sourcecode:: ipython - - In [1]: a = 'foo' - - In [2]: a - Out[2]: 'foo' - - In [3]: print(a) - foo - - In [4]: 1 / 0 - - Notes: - - - Tracebacks are not currently supported. - - - It assumes the default IPython prompts, not customized ones. 
- """ - - name = 'IPython console session' - aliases = ['ipython'] - mimetypes = ['text/x-ipython-console'] - input_prompt = re.compile("(In \[[0-9]+\]: )|( \.\.\.+:)") - output_prompt = re.compile("(Out\[[0-9]+\]: )|( \.\.\.+:)") - continue_prompt = re.compile(" \.\.\.+:") - tb_start = re.compile("\-+") - - def get_tokens_unprocessed(self, text): - pylexer = PythonLexer(**self.options) - tblexer = PythonTracebackLexer(**self.options) - - curcode = '' - insertions = [] - for match in line_re.finditer(text): - line = match.group() - input_prompt = self.input_prompt.match(line) - continue_prompt = self.continue_prompt.match(line.rstrip()) - output_prompt = self.output_prompt.match(line) - if line.startswith("#"): - insertions.append((len(curcode), - [(0, Comment, line)])) - elif input_prompt is not None: - insertions.append((len(curcode), - [(0, Generic.Prompt, input_prompt.group())])) - curcode += line[input_prompt.end():] - elif continue_prompt is not None: - insertions.append((len(curcode), - [(0, Generic.Prompt, continue_prompt.group())])) - curcode += line[continue_prompt.end():] - elif output_prompt is not None: - # Use the 'error' token for output. We should probably make - # our own token, but error is typically in a bright color like - # red, so it works fine for our output prompts. - insertions.append((len(curcode), - [(0, Generic.Error, output_prompt.group())])) - curcode += line[output_prompt.end():] - else: - if curcode: - for item in do_insertions(insertions, - pylexer.get_tokens_unprocessed(curcode)): - yield item - curcode = '' - insertions = [] - yield match.start(), Generic.Output, line - if curcode: - for item in do_insertions(insertions, - pylexer.get_tokens_unprocessed(curcode)): - yield item - +from IPython.lib.lexers import IPyLexer def setup(app): """Setup as a sphinx extension.""" # This is only a lexer, so adding it below to pygments appears sufficient. - # But if somebody knows that the right API usage should be to do that via + # But if somebody knows what the right API usage should be to do that via # sphinx, by all means fix it here. At least having this setup.py # suppresses the sphinx warning we'd get without it. - pass + metadata = {'parallel_read_safe': True, 'parallel_write_safe': True} + return metadata + +# Register the extension as a valid pygments lexer. +# Alternatively, we could register the lexer with pygments instead. This would +# require using setuptools entrypoints: http://pygments.org/docs/plugins + +ipy2 = IPyLexer(python3=False) +ipy3 = IPyLexer(python3=True) -#----------------------------------------------------------------------------- -# Register the extension as a valid pygments lexer -highlighting.lexers['ipython'] = IPythonConsoleLexer() +highlighting.lexers['ipython'] = ipy2 +highlighting.lexers['ipython2'] = ipy2 +highlighting.lexers['ipython3'] = ipy3 diff --git a/doc/sphinxext/ipython_sphinxext/ipython_directive.py b/doc/sphinxext/ipython_sphinxext/ipython_directive.py index 5616d732eb1c6..a0e6728861b66 100644 --- a/doc/sphinxext/ipython_sphinxext/ipython_directive.py +++ b/doc/sphinxext/ipython_sphinxext/ipython_directive.py @@ -83,8 +83,29 @@ See http://matplotlib.org/sampledoc/ipython_directive.html for additional documentation. -ToDo ----- +Pseudo-Decorators +================= + +Note: Only one decorator is supported per input. If more than one decorator +is specified, then only the last one is used. + +In addition to the Pseudo-Decorators/options described at the above link, +several enhancements have been made. 
The directive will emit a message to the +console at build-time if code-execution resulted in an exception or warning. +You can suppress these on a per-block basis by specifying the :okexcept: +or :okwarning: options: + +.. code-block:: rst + + .. ipython:: + :okexcept: + :okwarning: + + In [1]: 1/0 + In [2]: # raise warning. + +To Do +----- - Turn the ad-hoc test() function into a real test suite. - Break up ipython-specific functionality from matplotlib stuff into better @@ -98,48 +119,31 @@ - VáclavŠmilauer : Prompt generalizations. - Skipper Seabold, refactoring, cleanups, pure python addition """ -from __future__ import print_function -from __future__ import unicode_literals #----------------------------------------------------------------------------- # Imports #----------------------------------------------------------------------------- # Stdlib +import atexit +import errno import os import re import sys import tempfile import ast -from pandas.compat import zip, range, map, lmap, u, text_type, cStringIO as StringIO import warnings - -# To keep compatibility with various python versions -try: - from hashlib import md5 -except ImportError: - from md5 import md5 +import shutil +from io import StringIO # Third-party -import sphinx from docutils.parsers.rst import directives -from docutils import nodes -from sphinx.util.compat import Directive +from docutils.parsers.rst import Directive # Our own -try: - from traitlets.config import Config -except ImportError: - from IPython import Config +from traitlets.config import Config from IPython import InteractiveShell from IPython.core.profiledir import ProfileDir -from IPython.utils import io -from IPython.utils.py3compat import PY3 - -if PY3: - from io import StringIO -else: - from StringIO import StringIO #----------------------------------------------------------------------------- # Globals @@ -191,8 +195,8 @@ def block_parser(part, rgxin, rgxout, fmtin, fmtout): continue if line_stripped.startswith('@'): - # we're assuming at most one decorator -- may need to - # rethink + # Here is where we assume there is, at most, one decorator. + # Might need to rethink this. decorator = line_stripped continue @@ -223,12 +227,17 @@ def block_parser(part, rgxin, rgxout, fmtin, fmtout): if matchout or nextline.startswith('#'): break elif nextline.startswith(continuation): + # The default ipython_rgx* treat the space following the colon as optional. + # However, If the space is there we must consume it or code + # employing the cython_magic extension will fail to execute. + # + # This works with the default ipython_rgx* patterns, + # If you modify them, YMMV. 
nextline = nextline[Nc:] if nextline and nextline[0] == ' ': nextline = nextline[1:] inputline += '\n' + nextline - else: rest.append(nextline) i+= 1 @@ -250,42 +259,19 @@ def block_parser(part, rgxin, rgxout, fmtin, fmtout): return block -class DecodingStringIO(StringIO, object): - def __init__(self,buf='',encodings=('utf8',), *args, **kwds): - super(DecodingStringIO, self).__init__(buf, *args, **kwds) - self.set_encodings(encodings) - - def set_encodings(self, encodings): - self.encodings = encodings - - def write(self,data): - if isinstance(data, text_type): - return super(DecodingStringIO, self).write(data) - else: - for enc in self.encodings: - try: - data = data.decode(enc) - return super(DecodingStringIO, self).write(data) - except : - pass - # default to brute utf8 if no encoding succeeded - return super(DecodingStringIO, self).write(data.decode('utf8', 'replace')) - - class EmbeddedSphinxShell(object): """An embedded IPython instance to run inside Sphinx""" - def __init__(self, exec_lines=None,state=None): + def __init__(self, exec_lines=None): - self.cout = DecodingStringIO(u'') + self.cout = StringIO() if exec_lines is None: exec_lines = [] - self.state = state - # Create config object for IPython config = Config() + config.HistoryManager.hist_file = ':memory:' config.InteractiveShell.autocall = False config.InteractiveShell.autoindent = False config.InteractiveShell.colors = 'NoColor' @@ -297,17 +283,9 @@ def __init__(self, exec_lines=None,state=None): profile = ProfileDir.create_profile_dir(pdir) # Create and initialize global ipython, but don't start its mainloop. - # This will persist across different EmbededSphinxShell instances. + # This will persist across different EmbeddedSphinxShell instances. IP = InteractiveShell.instance(config=config, profile_dir=profile) - - # io.stdout redirect must be done after instantiating InteractiveShell - io.stdout = self.cout - io.stderr = self.cout - - # For debugging, so we can see normal output, use this: - #from IPython.utils.io import Tee - #io.stdout = Tee(self.cout, channel='stdout') # dbg - #io.stderr = Tee(self.cout, channel='stderr') # dbg + atexit.register(self.cleanup) # Store a few parts of IPython we'll need. self.IP = IP @@ -316,12 +294,17 @@ def __init__(self, exec_lines=None,state=None): self.input = '' self.output = '' + self.tmp_profile_dir = tmp_profile_dir self.is_verbatim = False self.is_doctest = False self.is_suppress = False # Optionally, provide more detailed information to shell. + # this is assigned by the SetUp method of IPythonDirective + # to point at itself. 
+ # + # So, you can access handy things at self.directive.state self.directive = None # on the first call to the savefig decorator, we'll import @@ -332,6 +315,9 @@ def __init__(self, exec_lines=None,state=None): for line in exec_lines: self.process_input_line(line, store_history=False) + def cleanup(self): + shutil.rmtree(self.tmp_profile_dir, ignore_errors=True) + def clear_cout(self): self.cout.seek(0) self.cout.truncate(0) @@ -346,11 +332,7 @@ def process_input_line(self, line, store_history=True): splitter.push(line) more = splitter.push_accepts_more() if not more: - try: - source_raw = splitter.source_raw_reset()[1] - except: - # recent ipython #4504 - source_raw = splitter.raw_reset() + source_raw = splitter.raw_reset() self.IP.run_cell(source_raw, store_history=store_history) finally: sys.stdout = stdout @@ -368,9 +350,9 @@ def process_image(self, decorator): source_dir = self.source_dir saveargs = decorator.split(' ') filename = saveargs[1] - # insert relative path to image file in source - outfile = os.path.relpath(os.path.join(savefig_dir,filename), - source_dir) + # insert relative path to image file in source (as absolute path for Sphinx) + outfile = '/' + os.path.relpath(os.path.join(savefig_dir,filename), + source_dir) imagerows = ['.. image:: %s'%outfile] @@ -403,17 +385,10 @@ def process_input(self, data, input_prompt, lineno): is_savefig = decorator is not None and \ decorator.startswith('@savefig') - # set the encodings to be used by DecodingStringIO - # to convert the execution output into unicode if - # needed. this attrib is set by IpythonDirective.run() - # based on the specified block options, defaulting to ['ut - self.cout.set_encodings(self.output_encoding) - input_lines = input.split('\n') - if len(input_lines) > 1: - if input_lines[-1] != "": - input_lines.append('') # make sure there's a blank line + if input_lines[-1] != "": + input_lines.append('') # make sure there's a blank line # so splitter buffer gets reset continuation = ' %s:'%''.join(['.']*(len(str(lineno))+2)) @@ -456,30 +431,75 @@ def process_input(self, data, input_prompt, lineno): ret.append(formatted_line) if not is_suppress and len(rest.strip()) and is_verbatim: - # the "rest" is the standard output of the - # input, which needs to be added in - # verbatim mode + # The "rest" is the standard output of the input. This needs to be + # added when in verbatim mode. If there is no "rest", then we don't + # add it, as the new line will be added by the processed output. ret.append(rest) + # Fetch the processed output. (This is not the submitted output.) self.cout.seek(0) - output = self.cout.read() + processed_output = self.cout.read() if not is_suppress and not is_semicolon: - ret.append(output) - elif is_semicolon: # get spacing right + # + # In IPythonDirective.run, the elements of `ret` are eventually + # combined such that '' entries correspond to newlines. So if + # `processed_output` is equal to '', then the adding it to `ret` + # ensures that there is a blank line between consecutive inputs + # that have no outputs, as in: + # + # In [1]: x = 4 + # + # In [2]: x = 5 + # + # When there is processed output, it has a '\n' at the tail end. So + # adding the output to `ret` will provide the necessary spacing + # between consecutive input/output blocks, as in: + # + # In [1]: x + # Out[1]: 5 + # + # In [2]: x + # Out[2]: 5 + # + # When there is stdout from the input, it also has a '\n' at the + # tail end, and so this ensures proper spacing as well. 
E.g.: + # + # In [1]: print x + # 5 + # + # In [2]: x = 5 + # + # When in verbatim mode, `processed_output` is empty (because + # nothing was passed to IP. Sometimes the submitted code block has + # an Out[] portion and sometimes it does not. When it does not, we + # need to ensure proper spacing, so we have to add '' to `ret`. + # However, if there is an Out[] in the submitted code, then we do + # not want to add a newline as `process_output` has stuff to add. + # The difficulty is that `process_input` doesn't know if + # `process_output` will be called---so it doesn't know if there is + # Out[] in the code block. The requires that we include a hack in + # `process_block`. See the comments there. + # + ret.append(processed_output) + elif is_semicolon: + # Make sure there is a newline after the semicolon. ret.append('') # context information - filename = self.state.document.current_source - lineno = self.state.document.current_line + filename = "Unknown" + lineno = 0 + if self.directive.state: + filename = self.directive.state.document.current_source + lineno = self.directive.state.document.current_line # output any exceptions raised during execution to stdout # unless :okexcept: has been specified. - if not is_okexcept and "Traceback" in output: + if not is_okexcept and "Traceback" in processed_output: s = "\nException in %s at block ending on line %s\n" % (filename, lineno) s += "Specify :okexcept: as an option in the ipython:: block to suppress this message\n" sys.stdout.write('\n\n>>>' + ('-' * 73)) sys.stdout.write(s) - sys.stdout.write(output) + sys.stdout.write(processed_output) sys.stdout.write('<<<' + ('-' * 73) + '\n\n') # output any warning raised during execution to stdout @@ -490,28 +510,32 @@ def process_input(self, data, input_prompt, lineno): s += "Specify :okwarning: as an option in the ipython:: block to suppress this message\n" sys.stdout.write('\n\n>>>' + ('-' * 73)) sys.stdout.write(s) - sys.stdout.write('-' * 76 + '\n') + sys.stdout.write(('-' * 76) + '\n') s=warnings.formatwarning(w.message, w.category, w.filename, w.lineno, w.line) sys.stdout.write(s) sys.stdout.write('<<<' + ('-' * 73) + '\n') self.cout.truncate(0) - return (ret, input_lines, output, is_doctest, decorator, image_file, - image_directive) + + return (ret, input_lines, processed_output, + is_doctest, decorator, image_file, image_directive) - def process_output(self, data, output_prompt, - input_lines, output, is_doctest, decorator, image_file): + def process_output(self, data, output_prompt, input_lines, output, + is_doctest, decorator, image_file): """ Process data block for OUTPUT token. """ + # Recall: `data` is the submitted output, and `output` is the processed + # output from `input_lines`. + TAB = ' ' * 4 if is_doctest and output is not None: - found = output + found = output # This is the processed output found = found.strip() submitted = data.strip() @@ -522,7 +546,7 @@ def process_output(self, data, output_prompt, source = self.directive.state.document.current_source content = self.directive.content # Add tabs and join into a single string. - content = '\n'.join(TAB + line for line in content) + content = '\n'.join([TAB + line for line in content]) # Make sure the output contains the output prompt. ind = found.find(output_prompt) @@ -553,6 +577,31 @@ def process_output(self, data, output_prompt, else: self.custom_doctest(decorator, input_lines, found, submitted) + # When in verbatim mode, this holds additional submitted output + # to be written in the final Sphinx output. 
+ # https://github.com/ipython/ipython/issues/5776 + out_data = [] + + is_verbatim = decorator=='@verbatim' or self.is_verbatim + if is_verbatim and data.strip(): + # Note that `ret` in `process_block` has '' as its last element if + # the code block was in verbatim mode. So if there is no submitted + # output, then we will have proper spacing only if we do not add + # an additional '' to `out_data`. This is why we condition on + # `and data.strip()`. + + # The submitted output has no output prompt. If we want the + # prompt and the code to appear, we need to join them now + # instead of adding them separately---as this would create an + # undesired newline. How we do this ultimately depends on the + # format of the output regex. I'll do what works for the default + # prompt for now, and we might have to adjust if it doesn't work + # in other cases. Finally, the submitted output does not have + # a trailing newline, so we must add it manually. + out_data.append("{0} {1}\n".format(output_prompt, data)) + + return out_data + def process_comment(self, data): """Process data fPblock for COMMENT token.""" if not self.is_suppress: @@ -563,9 +612,7 @@ def save_image(self, image_file): Saves the image file to disk. """ self.ensure_pyplot() - command = ('plt.gcf().savefig("%s", bbox_inches="tight", ' - 'dpi=100)' % image_file) - + command = 'plt.gcf().savefig("%s")'%image_file #print 'SAVEFIG', command # dbg self.process_input_line('bookmark ipy_thisdir', store_history=False) self.process_input_line('cd -b ipy_savedir', store_history=False) @@ -588,18 +635,53 @@ def process_block(self, block): image_file = None image_directive = None + found_input = False for token, data in block: if token == COMMENT: out_data = self.process_comment(data) elif token == INPUT: - (out_data, input_lines, output, is_doctest, decorator, - image_file, image_directive) = \ + found_input = True + (out_data, input_lines, output, is_doctest, + decorator, image_file, image_directive) = \ self.process_input(data, input_prompt, lineno) elif token == OUTPUT: + if not found_input: + + TAB = ' ' * 4 + linenumber = 0 + source = 'Unavailable' + content = 'Unavailable' + if self.directive: + linenumber = self.directive.state.document.current_line + source = self.directive.state.document.current_source + content = self.directive.content + # Add tabs and join into a single string. + content = '\n'.join([TAB + line for line in content]) + + e = ('\n\nInvalid block: Block contains an output prompt ' + 'without an input prompt.\n\n' + 'Document source: {0}\n\n' + 'Content begins at line {1}: \n\n{2}\n\n' + 'Problematic block within content: \n\n{TAB}{3}\n\n') + e = e.format(source, linenumber, content, block, TAB=TAB) + + # Write, rather than include in exception, since Sphinx + # will truncate tracebacks. + sys.stdout.write(e) + raise RuntimeError('An invalid block was detected.') + out_data = \ - self.process_output(data, output_prompt, - input_lines, output, is_doctest, - decorator, image_file) + self.process_output(data, output_prompt, input_lines, + output, is_doctest, decorator, + image_file) + if out_data: + # Then there was user submitted output in verbatim mode. + # We need to remove the last element of `ret` that was + # added in `process_input`, as it is '' and would introduce + # an undesirable newline. 
+ assert(ret[-1] == '') + del ret[-1] + if out_data: ret.extend(out_data) @@ -740,8 +822,7 @@ class IPythonDirective(Directive): 'verbatim' : directives.flag, 'doctest' : directives.flag, 'okexcept': directives.flag, - 'okwarning': directives.flag, - 'output_encoding': directives.unchanged_required + 'okwarning': directives.flag } shell = None @@ -753,14 +834,9 @@ def get_config_options(self): config = self.state.document.settings.env.config # get config variables to set figure output directory - confdir = self.state.document.settings.env.app.confdir savefig_dir = config.ipython_savefig_dir - source_dir = os.path.dirname(self.state.document.current_source) - if savefig_dir is None: - savefig_dir = config.html_static_path - if isinstance(savefig_dir, list): - savefig_dir = savefig_dir[0] # safe to assume only one path? - savefig_dir = os.path.join(confdir, savefig_dir) + source_dir = self.state.document.settings.env.srcdir + savefig_dir = os.path.join(source_dir, savefig_dir) # get regex and prompt stuff rgxin = config.ipython_rgxin @@ -779,6 +855,12 @@ def setup(self): (savefig_dir, source_dir, rgxin, rgxout, promptin, promptout, mplbackend, exec_lines, hold_count) = self.get_config_options() + try: + os.makedirs(savefig_dir) + except OSError as e: + if e.errno != errno.EEXIST: + raise + if self.shell is None: # We will be here many times. However, when the # EmbeddedSphinxShell is created, its interactive shell member @@ -786,13 +868,11 @@ def setup(self): if mplbackend and 'matplotlib.backends' not in sys.modules: import matplotlib - # Repeated calls to use() will not hurt us since `mplbackend` - # is the same each time. matplotlib.use(mplbackend) # Must be called after (potentially) importing matplotlib and # setting its backend since exec_lines might import pylab. - self.shell = EmbeddedSphinxShell(exec_lines, self.state) + self.shell = EmbeddedSphinxShell(exec_lines) # Store IPython directive to enable better error messages self.shell.directive = self @@ -800,14 +880,9 @@ def setup(self): # reset the execution count if we haven't processed this doc #NOTE: this may be borked if there are multiple seen_doc tmp files #check time stamp? - if self.state.document.current_source not in self.seen_docs: + if not self.state.document.current_source in self.seen_docs: self.shell.IP.history_manager.reset() self.shell.IP.execution_count = 1 - try: - self.shell.IP.prompt_manager.width = 0 - except AttributeError: - # GH14003: class promptManager has removed after IPython 5.x - pass self.seen_docs.add(self.state.document.current_source) # and attach to shell so we don't have to pass them around @@ -846,13 +921,13 @@ def run(self): self.shell.is_okexcept = 'okexcept' in options self.shell.is_okwarning = 'okwarning' in options - self.shell.output_encoding = [options.get('output_encoding', 'utf8')] - # handle pure python code if 'python' in self.arguments: content = self.content self.content = self.shell.process_pure_python(content) + # parts consists of all text within the ipython-block. + # Each part is an input/output block. parts = '\n'.join(self.content).split('\n\n') lines = ['.. 
code-block:: ipython', ''] @@ -863,7 +938,8 @@ def run(self): if len(block): rows, figure = self.shell.process_block(block) for row in rows: - lines.extend([' %s'%line for line in row.split('\n')]) + lines.extend([' {0}'.format(line) + for line in row.split('\n')]) if figure is not None: figures.append(figure) @@ -873,7 +949,7 @@ def run(self): lines.extend(figure.split('\n')) lines.append('') - if len(lines)>2: + if len(lines) > 2: if debug: print('\n'.join(lines)) else: @@ -893,7 +969,7 @@ def setup(app): setup.app = app app.add_directive('ipython', IPythonDirective) - app.add_config_value('ipython_savefig_dir', None, 'env') + app.add_config_value('ipython_savefig_dir', 'savefig', 'env') app.add_config_value('ipython_rgxin', re.compile('In \[(\d+)\]:\s?(.*)\s*'), 'env') app.add_config_value('ipython_rgxout', @@ -914,6 +990,9 @@ def setup(app): app.add_config_value('ipython_holdcount', True, 'env') + metadata = {'parallel_read_safe': True, 'parallel_write_safe': True} + return metadata + # Simple smoke test, needs to be converted to a proper automatic test. def test(): @@ -1074,7 +1153,7 @@ def test(): #ipython_directive.DEBUG = True # dbg #options = dict(suppress=True) # dbg - options = dict() + options = {} for example in examples: content = example.split('\n') IPythonDirective('debug', arguments=None, options=options, From 02b59b63a5bb8bb03d5c6a0013399906d5f355a2 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Mon, 19 Feb 2018 13:50:56 +0000 Subject: [PATCH 130/214] add test for numpy ops, esp. nanmin/max bug for np<1.13 (#19753) --- pandas/tests/test_nanops.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index df3c49a73d227..dffb303af6ae1 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -13,6 +13,7 @@ import pandas.core.nanops as nanops import pandas.util.testing as tm import pandas.util._test_decorators as td +from pandas.compat.numpy import _np_version_under1p13 use_bn = nanops._USE_BOTTLENECK @@ -1015,3 +1016,34 @@ def test_use_bottleneck(): assert not pd.get_option('use_bottleneck') pd.set_option('use_bottleneck', use_bn) + + +@pytest.mark.parametrize("numpy_op, expected", [ + (np.sum, 10), + (np.nansum, 10), + (np.mean, 2.5), + (np.nanmean, 2.5), + (np.median, 2.5), + (np.nanmedian, 2.5), + (np.min, 1), + (np.max, 4), +]) +def test_numpy_ops(numpy_op, expected): + # GH8383 + result = numpy_op(pd.Series([1, 2, 3, 4])) + assert result == expected + + +@pytest.mark.parametrize("numpy_op, expected", [ + (np.nanmin, 1), + (np.nanmax, 4), +]) +def test_numpy_ops_np_version_under1p13(numpy_op, expected): + # GH8383 + result = numpy_op(pd.Series([1, 2, 3, 4])) + if _np_version_under1p13: + # bug for numpy < 1.13, where result is a series, should be a scalar + with pytest.raises(ValueError): + assert result == expected + else: + assert result == expected From 966035b6060e5b80fb8b3b0a36ed548cfc4a549c Mon Sep 17 00:00:00 2001 From: ZhuBaohe Date: Mon, 19 Feb 2018 21:52:13 +0800 Subject: [PATCH 131/214] DOC: correct Period.strftime exsample (#19758) --- pandas/_libs/tslibs/period.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index c11a8b149bc13..32ffe4e6d0453 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1403,7 +1403,7 @@ cdef class _Period(object): Examples -------- - >>> a = Period(freq='Q@JUL', year=2006, quarter=1) + >>> a = 
Period(freq='Q-JUL', year=2006, quarter=1) >>> a.strftime('%F-Q%q') '2006-Q1' >>> # Output the last month in the quarter of this date From c1f0c63191e318981b5a244a3f240090df63f1d0 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 19 Feb 2018 10:38:41 -0500 Subject: [PATCH 132/214] ENH: fake http proxy in case of --skip-network testing (#19757) --- ci/script_multi.sh | 5 +++++ ci/script_single.sh | 5 +++++ pandas/tests/test_downstream.py | 2 ++ 3 files changed, 12 insertions(+) diff --git a/ci/script_multi.sh b/ci/script_multi.sh index 6c354fc4cab0b..45c61ee3172fe 100755 --- a/ci/script_multi.sh +++ b/ci/script_multi.sh @@ -12,6 +12,11 @@ if [ -n "$LOCALE_OVERRIDE" ]; then python -c "$pycmd" fi +# Enforce absent network during testing by faking a proxy +if echo "$TEST_ARGS" | grep -e --skip-network -q; then + export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; +fi + # Workaround for pytest-xdist flaky collection order # https://github.com/pytest-dev/pytest/issues/920 # https://github.com/pytest-dev/pytest/issues/1075 diff --git a/ci/script_single.sh b/ci/script_single.sh index 74b0e897f1d73..021a5a7714fb5 100755 --- a/ci/script_single.sh +++ b/ci/script_single.sh @@ -16,6 +16,11 @@ if [ "$SLOW" ]; then TEST_ARGS="--only-slow --skip-network" fi +# Enforce absent network during testing by faking a proxy +if echo "$TEST_ARGS" | grep -e --skip-network -q; then + export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; +fi + if [ "$PIP_BUILD_TEST" ]; then echo "We are not running pytest as this is a build test." diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index b438d6a6137b0..a595d9f18d6b8 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -53,6 +53,7 @@ def test_xarray(df): assert df.to_xarray() is not None +@tm.network def test_statsmodels(): statsmodels = import_module('statsmodels') # noqa @@ -73,6 +74,7 @@ def test_scikit_learn(df): clf.predict(digits.data[-1:]) +@tm.network def test_seaborn(): seaborn = import_module('seaborn') From 73ecfe6cec0dee2b3497aa589b9cb1fbcf011572 Mon Sep 17 00:00:00 2001 From: ZhuBaohe Date: Tue, 20 Feb 2018 00:10:32 +0800 Subject: [PATCH 133/214] DOC: correct Panel.apply exsample (#19766) --- pandas/core/panel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 2cb80e938afb9..7f973992fb07f 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -1020,7 +1020,7 @@ def apply(self, func, axis='major', **kwargs): Equivalent to previous: - >>> p.apply(lambda x: x.sum(), axis='minor') + >>> p.apply(lambda x: x.sum(), axis='major') Return the shapes of each DataFrame over axis 2 (i.e the shapes of items x major), as a Series From 718d0679600fa025872d8b66c198a6781b312d70 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 19 Feb 2018 08:45:30 -0800 Subject: [PATCH 134/214] split off scalar tests to submodules (#19752) --- pandas/tests/scalar/interval/__init__.py | 0 .../scalar/{ => interval}/test_interval.py | 0 pandas/tests/scalar/period/__init__.py | 0 .../tests/scalar/{ => period}/test_period.py | 0 .../scalar/{ => period}/test_period_asfreq.py | 0 pandas/tests/scalar/timedelta/__init__.py | 0 .../tests/scalar/timedelta/test_arithmetic.py | 422 ++++++++++++++++++ .../scalar/{ => timedelta}/test_timedelta.py | 164 +------ .../scalar/{ => timestamp}/test_timestamp.py | 0 9 files changed, 423 insertions(+), 163 deletions(-) create mode 100644 pandas/tests/scalar/interval/__init__.py 
rename pandas/tests/scalar/{ => interval}/test_interval.py (100%) create mode 100644 pandas/tests/scalar/period/__init__.py rename pandas/tests/scalar/{ => period}/test_period.py (100%) rename pandas/tests/scalar/{ => period}/test_period_asfreq.py (100%) create mode 100644 pandas/tests/scalar/timedelta/__init__.py create mode 100644 pandas/tests/scalar/timedelta/test_arithmetic.py rename pandas/tests/scalar/{ => timedelta}/test_timedelta.py (83%) rename pandas/tests/scalar/{ => timestamp}/test_timestamp.py (100%) diff --git a/pandas/tests/scalar/interval/__init__.py b/pandas/tests/scalar/interval/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/scalar/test_interval.py b/pandas/tests/scalar/interval/test_interval.py similarity index 100% rename from pandas/tests/scalar/test_interval.py rename to pandas/tests/scalar/interval/test_interval.py diff --git a/pandas/tests/scalar/period/__init__.py b/pandas/tests/scalar/period/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/scalar/test_period.py b/pandas/tests/scalar/period/test_period.py similarity index 100% rename from pandas/tests/scalar/test_period.py rename to pandas/tests/scalar/period/test_period.py diff --git a/pandas/tests/scalar/test_period_asfreq.py b/pandas/tests/scalar/period/test_period_asfreq.py similarity index 100% rename from pandas/tests/scalar/test_period_asfreq.py rename to pandas/tests/scalar/period/test_period_asfreq.py diff --git a/pandas/tests/scalar/timedelta/__init__.py b/pandas/tests/scalar/timedelta/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py new file mode 100644 index 0000000000000..90c911c24f6a9 --- /dev/null +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -0,0 +1,422 @@ +# -*- coding: utf-8 -*- +""" +Tests for scalar Timedelta arithmetic ops +""" +from datetime import datetime, timedelta +import operator + +import numpy as np +import pytest + +import pandas as pd +import pandas.util.testing as tm +from pandas.core import ops +from pandas import Timedelta, Timestamp, NaT + + +class TestTimedeltaAdditionSubtraction(object): + """ + Tests for Timedelta methods: + + __add__, __radd__, + __sub__, __rsub__ + """ + @pytest.mark.parametrize('ten_seconds', [ + Timedelta(10, unit='s'), + timedelta(seconds=10), + np.timedelta64(10, 's'), + np.timedelta64(10000000000, 'ns'), + pd.offsets.Second(10)]) + def test_td_add_sub_ten_seconds(self, ten_seconds): + # GH#6808 + base = Timestamp('20130101 09:01:12.123456') + expected_add = Timestamp('20130101 09:01:22.123456') + expected_sub = Timestamp('20130101 09:01:02.123456') + + result = base + ten_seconds + assert result == expected_add + + result = base - ten_seconds + assert result == expected_sub + + @pytest.mark.parametrize('one_day_ten_secs', [ + Timedelta('1 day, 00:00:10'), + Timedelta('1 days, 00:00:10'), + timedelta(days=1, seconds=10), + np.timedelta64(1, 'D') + np.timedelta64(10, 's'), + pd.offsets.Day() + pd.offsets.Second(10)]) + def test_td_add_sub_one_day_ten_seconds(self, one_day_ten_secs): + # GH#6808 + base = Timestamp('20130102 09:01:12.123456') + expected_add = Timestamp('20130103 09:01:22.123456') + expected_sub = Timestamp('20130101 09:01:02.123456') + + result = base + one_day_ten_secs + assert result == expected_add + + result = base - one_day_ten_secs + assert result == expected_sub + + @pytest.mark.parametrize('op', 
[operator.add, ops.radd]) + def test_td_add_datetimelike_scalar(self, op): + # GH#19738 + td = Timedelta(10, unit='d') + + result = op(td, datetime(2016, 1, 1)) + if op is operator.add: + # datetime + Timedelta does _not_ call Timedelta.__radd__, + # so we get a datetime back instead of a Timestamp + assert isinstance(result, Timestamp) + assert result == Timestamp(2016, 1, 11) + + result = op(td, Timestamp('2018-01-12 18:09')) + assert isinstance(result, Timestamp) + assert result == Timestamp('2018-01-22 18:09') + + result = op(td, np.datetime64('2018-01-12')) + assert isinstance(result, Timestamp) + assert result == Timestamp('2018-01-22') + + result = op(td, NaT) + assert result is NaT + + with pytest.raises(TypeError): + op(td, 2) + with pytest.raises(TypeError): + op(td, 2.0) + + @pytest.mark.parametrize('op', [operator.add, ops.radd]) + def test_td_add_td(self, op): + td = Timedelta(10, unit='d') + + result = op(td, Timedelta(days=10)) + assert isinstance(result, Timedelta) + assert result == Timedelta(days=20) + + @pytest.mark.parametrize('op', [operator.add, ops.radd]) + def test_td_add_pytimedelta(self, op): + td = Timedelta(10, unit='d') + result = op(td, timedelta(days=9)) + assert isinstance(result, Timedelta) + assert result == Timedelta(days=19) + + @pytest.mark.xfail(reason='GH#19738 argument not converted to Timedelta') + @pytest.mark.parametrize('op', [operator.add, ops.radd]) + def test_td_add_timedelta64(self, op): + td = Timedelta(10, unit='d') + result = op(td, np.timedelta64(-4, 'D')) + assert isinstance(result, Timedelta) + assert result == Timedelta(days=6) + + @pytest.mark.parametrize('op', [operator.add, ops.radd]) + def test_td_add_offset(self, op): + td = Timedelta(10, unit='d') + + result = op(td, pd.offsets.Hour(6)) + assert isinstance(result, Timedelta) + assert result == Timedelta(days=10, hours=6) + + def test_td_sub_td(self): + td = Timedelta(10, unit='d') + expected = Timedelta(0, unit='ns') + result = td - td + assert isinstance(result, Timedelta) + assert result == expected + + def test_td_sub_pytimedelta(self): + td = Timedelta(10, unit='d') + expected = Timedelta(0, unit='ns') + result = td - td.to_pytimedelta() + assert isinstance(result, Timedelta) + assert result == expected + + @pytest.mark.xfail(reason='GH#19738 argument not converted to Timedelta') + def test_td_sub_timedelta64(self): + td = Timedelta(10, unit='d') + expected = Timedelta(0, unit='ns') + result = td - td.to_timedelta64() + assert isinstance(result, Timedelta) + # comparison fails even if we comment out the isinstance assertion + assert result == expected + + def test_td_sub_nat(self): + td = Timedelta(10, unit='d') + result = td - NaT + assert result is NaT + + @pytest.mark.xfail(reason='GH#19738 argument not converted to Timedelta') + def test_td_sub_td64_nat(self): + td = Timedelta(10, unit='d') + result = td - np.timedelta64('NaT') + assert result is NaT + + def test_td_sub_offset(self): + td = Timedelta(10, unit='d') + result = td - pd.offsets.Hour(1) + assert isinstance(result, Timedelta) + assert result == Timedelta(239, unit='h') + + def test_td_sub_numeric_raises(self): + td = td = Timedelta(10, unit='d') + with pytest.raises(TypeError): + td - 2 + with pytest.raises(TypeError): + td - 2.0 + + def test_td_rsub_pytimedelta(self): + td = Timedelta(10, unit='d') + expected = Timedelta(0, unit='ns') + + result = td.to_pytimedelta() - td + assert isinstance(result, Timedelta) + assert result == expected + + @pytest.mark.xfail(reason='GH#19738 argument not converted to 
Timedelta') + def test_td_rsub_timedelta64(self): + td = Timedelta(10, unit='d') + expected = Timedelta(0, unit='ns') + + result = td.to_timedelta64() - td + assert isinstance(result, Timedelta) + assert result == expected + + def test_td_rsub_nat(self): + td = Timedelta(10, unit='d') + result = NaT - td + assert result is NaT + + result = np.datetime64('NaT') - td + assert result is NaT + + @pytest.mark.xfail(reason='GH#19738 argument not converted to Timedelta') + def test_td_rsub_td64_nat(self): + td = Timedelta(10, unit='d') + result = np.timedelta64('NaT') - td + assert result is NaT + + def test_td_rsub_offset(self): + result = pd.offsets.Hour(1) - Timedelta(10, unit='d') + assert isinstance(result, Timedelta) + assert result == Timedelta(-239, unit='h') + + def test_td_rsub_numeric_raises(self): + td = td = Timedelta(10, unit='d') + with pytest.raises(TypeError): + 2 - td + with pytest.raises(TypeError): + 2.0 - td + + +class TestTimedeltaMultiplicationDivision(object): + """ + Tests for Timedelta methods: + + __mul__, __rmul__, + __div__, __rdiv__, + __truediv__, __rtruediv__, + __floordiv__, __rfloordiv__, + __mod__, __rmod__, + __divmod__, __rdivmod__ + """ + + # --------------------------------------------------------------- + # Timedelta.__mul__, __rmul__ + + @pytest.mark.parametrize('op', [operator.mul, ops.rmul]) + def test_td_mul_scalar(self, op): + # GH#19738 + td = Timedelta(minutes=3) + + result = op(td, 2) + assert result == Timedelta(minutes=6) + + result = op(td, 1.5) + assert result == Timedelta(minutes=4, seconds=30) + + assert op(td, np.nan) is NaT + + assert op(-1, td).value == -1 * td.value + assert op(-1.0, td).value == -1.0 * td.value + + with pytest.raises(TypeError): + # timedelta * datetime is gibberish + op(td, Timestamp(2016, 1, 2)) + + with pytest.raises(TypeError): + # invalid multiply with another timedelta + op(td, td) + + # --------------------------------------------------------------- + # Timedelta.__div__, __truediv__ + + def test_td_div_timedeltalike_scalar(self): + # GH#19738 + td = Timedelta(10, unit='d') + + result = td / pd.offsets.Hour(1) + assert result == 240 + + assert td / td == 1 + assert td / np.timedelta64(60, 'h') == 4 + + assert np.isnan(td / NaT) + + def test_td_div_numeric_scalar(self): + # GH#19738 + td = Timedelta(10, unit='d') + + result = td / 2 + assert isinstance(result, Timedelta) + assert result == Timedelta(days=5) + + result = td / 5.0 + assert isinstance(result, Timedelta) + assert result == Timedelta(days=2) + + # --------------------------------------------------------------- + # Timedelta.__rdiv__ + + def test_td_rdiv_timedeltalike_scalar(self): + # GH#19738 + td = Timedelta(10, unit='d') + result = pd.offsets.Hour(1) / td + assert result == 1 / 240.0 + + assert np.timedelta64(60, 'h') / td == 0.25 + + # --------------------------------------------------------------- + # Timedelta.__floordiv__ + + def test_td_floordiv_timedeltalike_scalar(self): + # GH#18846 + td = Timedelta(hours=3, minutes=4) + scalar = Timedelta(hours=3, minutes=3) + + assert td // scalar == 1 + assert -td // scalar.to_pytimedelta() == -2 + assert (2 * td) // scalar.to_timedelta64() == 2 + + def test_td_floordiv_null_scalar(self): + # GH#18846 + td = Timedelta(hours=3, minutes=4) + + assert td // np.nan is NaT + assert np.isnan(td // NaT) + assert np.isnan(td // np.timedelta64('NaT')) + + def test_td_floordiv_invalid_scalar(self): + # GH#18846 + td = Timedelta(hours=3, minutes=4) + + with pytest.raises(TypeError): + td // np.datetime64('2016-01-01', 
dtype='datetime64[us]') + + def test_td_floordiv_numeric_scalar(self): + # GH#18846 + td = Timedelta(hours=3, minutes=4) + + expected = Timedelta(hours=1, minutes=32) + assert td // 2 == expected + assert td // 2.0 == expected + assert td // np.float64(2.0) == expected + assert td // np.int32(2.0) == expected + assert td // np.uint8(2.0) == expected + + def test_floordiv_timedeltalike_array(self): + # GH#18846 + td = Timedelta(hours=3, minutes=4) + scalar = Timedelta(hours=3, minutes=3) + + # Array-like others + assert td // np.array(scalar.to_timedelta64()) == 1 + + res = (3 * td) // np.array([scalar.to_timedelta64()]) + expected = np.array([3], dtype=np.int64) + tm.assert_numpy_array_equal(res, expected) + + res = (10 * td) // np.array([scalar.to_timedelta64(), + np.timedelta64('NaT')]) + expected = np.array([10, np.nan]) + tm.assert_numpy_array_equal(res, expected) + + def test_td_floordiv_numeric_series(self): + # GH#18846 + td = Timedelta(hours=3, minutes=4) + ser = pd.Series([1], dtype=np.int64) + res = td // ser + assert res.dtype.kind == 'm' + + # --------------------------------------------------------------- + # Timedelta.__rfloordiv__ + + def test_td_rfloordiv_timedeltalike_scalar(self): + # GH#18846 + td = Timedelta(hours=3, minutes=3) + scalar = Timedelta(hours=3, minutes=4) + + # scalar others + # x // Timedelta is defined only for timedelta-like x. int-like, + # float-like, and date-like, in particular, should all either + # a) raise TypeError directly or + # b) return NotImplemented, following which the reversed + # operation will raise TypeError. + assert td.__rfloordiv__(scalar) == 1 + assert (-td).__rfloordiv__(scalar.to_pytimedelta()) == -2 + assert (2 * td).__rfloordiv__(scalar.to_timedelta64()) == 0 + + def test_td_rfloordiv_null_scalar(self): + # GH#18846 + td = Timedelta(hours=3, minutes=3) + + assert np.isnan(td.__rfloordiv__(NaT)) + assert np.isnan(td.__rfloordiv__(np.timedelta64('NaT'))) + + def test_td_rfloordiv_invalid_scalar(self): + # GH#18846 + td = Timedelta(hours=3, minutes=3) + + dt64 = np.datetime64('2016-01-01', dtype='datetime64[us]') + with pytest.raises(TypeError): + td.__rfloordiv__(dt64) + + def test_td_rfloordiv_numeric_scalar(self): + # GH#18846 + td = Timedelta(hours=3, minutes=3) + + assert td.__rfloordiv__(np.nan) is NotImplemented + assert td.__rfloordiv__(3.5) is NotImplemented + assert td.__rfloordiv__(2) is NotImplemented + + with pytest.raises(TypeError): + td.__rfloordiv__(np.float64(2.0)) + with pytest.raises(TypeError): + td.__rfloordiv__(np.int32(2.0)) + with pytest.raises(TypeError): + td.__rfloordiv__(np.uint8(9)) + + def test_td_rfloordiv_timedeltalike_array(self): + # GH#18846 + td = Timedelta(hours=3, minutes=3) + scalar = Timedelta(hours=3, minutes=4) + + # Array-like others + assert td.__rfloordiv__(np.array(scalar.to_timedelta64())) == 1 + + res = td.__rfloordiv__(np.array([(3 * scalar).to_timedelta64()])) + expected = np.array([3], dtype=np.int64) + tm.assert_numpy_array_equal(res, expected) + + arr = np.array([(10 * scalar).to_timedelta64(), + np.timedelta64('NaT')]) + res = td.__rfloordiv__(arr) + expected = np.array([10, np.nan]) + tm.assert_numpy_array_equal(res, expected) + + def test_td_rfloordiv_numeric_series(self): + # GH#18846 + td = Timedelta(hours=3, minutes=3) + ser = pd.Series([1], dtype=np.int64) + res = td.__rfloordiv__(ser) + assert res is NotImplemented + with pytest.raises(TypeError): + ser // td diff --git a/pandas/tests/scalar/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py similarity 
index 83% rename from pandas/tests/scalar/test_timedelta.py rename to pandas/tests/scalar/timedelta/test_timedelta.py index 667266be2a89b..420b66b4ce0dc 100644 --- a/pandas/tests/scalar/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -74,46 +74,6 @@ class Other: assert td.__mul__(other) is NotImplemented assert td.__floordiv__(other) is NotImplemented - def test_timedelta_ops_scalar(self): - # GH 6808 - base = pd.to_datetime('20130101 09:01:12.123456') - expected_add = pd.to_datetime('20130101 09:01:22.123456') - expected_sub = pd.to_datetime('20130101 09:01:02.123456') - - for offset in [pd.to_timedelta(10, unit='s'), timedelta(seconds=10), - np.timedelta64(10, 's'), - np.timedelta64(10000000000, 'ns'), - pd.offsets.Second(10)]: - result = base + offset - assert result == expected_add - - result = base - offset - assert result == expected_sub - - base = pd.to_datetime('20130102 09:01:12.123456') - expected_add = pd.to_datetime('20130103 09:01:22.123456') - expected_sub = pd.to_datetime('20130101 09:01:02.123456') - - for offset in [pd.to_timedelta('1 day, 00:00:10'), - pd.to_timedelta('1 days, 00:00:10'), - timedelta(days=1, seconds=10), - np.timedelta64(1, 'D') + np.timedelta64(10, 's'), - pd.offsets.Day() + pd.offsets.Second(10)]: - result = base + offset - assert result == expected_add - - result = base - offset - assert result == expected_sub - - def test_ops_offsets(self): - td = Timedelta(10, unit='d') - assert Timedelta(241, unit='h') == td + pd.offsets.Hour(1) - assert Timedelta(241, unit='h') == pd.offsets.Hour(1) + td - assert 240 == td / pd.offsets.Hour(1) - assert 1 / 240.0 == pd.offsets.Hour(1) / td - assert Timedelta(239, unit='h') == td - pd.offsets.Hour(1) - assert Timedelta(-239, unit='h') == pd.offsets.Hour(1) - td - def test_unary_ops(self): td = Timedelta(10, unit='d') @@ -129,130 +89,8 @@ def test_unary_ops(self): def test_binary_ops_nat(self): td = Timedelta(10, unit='d') - - assert (td - pd.NaT) is pd.NaT - assert (td + pd.NaT) is pd.NaT + # FIXME: The next test is wrong: td * NaT should raise assert (td * pd.NaT) is pd.NaT - assert (td / pd.NaT) is np.nan - assert (td // pd.NaT) is np.nan - assert (td // np.timedelta64('NaT')) is np.nan - - def test_binary_ops_integers(self): - td = Timedelta(10, unit='d') - - assert td * 2 == Timedelta(20, unit='d') - assert td / 2 == Timedelta(5, unit='d') - assert td // 2 == Timedelta(5, unit='d') - - # invert - assert td * -1 == Timedelta('-10d') - assert -1 * td == Timedelta('-10d') - - # can't operate with integers - pytest.raises(TypeError, lambda: td + 2) - pytest.raises(TypeError, lambda: td - 2) - - def test_binary_ops_with_timedelta(self): - td = Timedelta(10, unit='d') - - assert td - td == Timedelta(0, unit='ns') - assert td + td == Timedelta(20, unit='d') - assert td / td == 1 - - # invalid multiply with another timedelta - pytest.raises(TypeError, lambda: td * td) - - def test_floordiv(self): - # GH#18846 - td = Timedelta(hours=3, minutes=4) - scalar = Timedelta(hours=3, minutes=3) - - # scalar others - assert td // scalar == 1 - assert -td // scalar.to_pytimedelta() == -2 - assert (2 * td) // scalar.to_timedelta64() == 2 - - assert td // np.nan is pd.NaT - assert np.isnan(td // pd.NaT) - assert np.isnan(td // np.timedelta64('NaT')) - - with pytest.raises(TypeError): - td // np.datetime64('2016-01-01', dtype='datetime64[us]') - - expected = Timedelta(hours=1, minutes=32) - assert td // 2 == expected - assert td // 2.0 == expected - assert td // np.float64(2.0) == expected - assert td // 
np.int32(2.0) == expected - assert td // np.uint8(2.0) == expected - - # Array-like others - assert td // np.array(scalar.to_timedelta64()) == 1 - - res = (3 * td) // np.array([scalar.to_timedelta64()]) - expected = np.array([3], dtype=np.int64) - tm.assert_numpy_array_equal(res, expected) - - res = (10 * td) // np.array([scalar.to_timedelta64(), - np.timedelta64('NaT')]) - expected = np.array([10, np.nan]) - tm.assert_numpy_array_equal(res, expected) - - ser = pd.Series([1], dtype=np.int64) - res = td // ser - assert res.dtype.kind == 'm' - - def test_rfloordiv(self): - # GH#18846 - td = Timedelta(hours=3, minutes=3) - scalar = Timedelta(hours=3, minutes=4) - - # scalar others - # x // Timedelta is defined only for timedelta-like x. int-like, - # float-like, and date-like, in particular, should all either - # a) raise TypeError directly or - # b) return NotImplemented, following which the reversed - # operation will raise TypeError. - assert td.__rfloordiv__(scalar) == 1 - assert (-td).__rfloordiv__(scalar.to_pytimedelta()) == -2 - assert (2 * td).__rfloordiv__(scalar.to_timedelta64()) == 0 - - assert np.isnan(td.__rfloordiv__(pd.NaT)) - assert np.isnan(td.__rfloordiv__(np.timedelta64('NaT'))) - - dt64 = np.datetime64('2016-01-01', dtype='datetime64[us]') - with pytest.raises(TypeError): - td.__rfloordiv__(dt64) - - assert td.__rfloordiv__(np.nan) is NotImplemented - assert td.__rfloordiv__(3.5) is NotImplemented - assert td.__rfloordiv__(2) is NotImplemented - - with pytest.raises(TypeError): - td.__rfloordiv__(np.float64(2.0)) - with pytest.raises(TypeError): - td.__rfloordiv__(np.int32(2.0)) - with pytest.raises(TypeError): - td.__rfloordiv__(np.uint8(9)) - - # Array-like others - assert td.__rfloordiv__(np.array(scalar.to_timedelta64())) == 1 - - res = td.__rfloordiv__(np.array([(3 * scalar).to_timedelta64()])) - expected = np.array([3], dtype=np.int64) - tm.assert_numpy_array_equal(res, expected) - - arr = np.array([(10 * scalar).to_timedelta64(), - np.timedelta64('NaT')]) - res = td.__rfloordiv__(arr) - expected = np.array([10, np.nan]) - tm.assert_numpy_array_equal(res, expected) - - ser = pd.Series([1], dtype=np.int64) - res = td.__rfloordiv__(ser) - assert res is NotImplemented - with pytest.raises(TypeError): - ser // td class TestTimedeltaComparison(object): diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py similarity index 100% rename from pandas/tests/scalar/test_timestamp.py rename to pandas/tests/scalar/timestamp/test_timestamp.py From a76e5b48c56e5429377aee07e8d7119d264fdbae Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 19 Feb 2018 18:05:14 -0500 Subject: [PATCH 135/214] CI: remove PIP & old conda build in favor of pandas-ci buildsx (#19775) --- .travis.yml | 17 ++------------ ci/install_travis.sh | 28 ++---------------------- ci/requirements-3.6_PIP_BUILD_TEST.build | 6 ----- ci/requirements-3.6_PIP_BUILD_TEST.pip | 6 ----- ci/requirements-3.6_PIP_BUILD_TEST.sh | 7 ------ ci/script_multi.sh | 18 +-------------- ci/script_single.sh | 5 +---- 7 files changed, 6 insertions(+), 81 deletions(-) delete mode 100644 ci/requirements-3.6_PIP_BUILD_TEST.build delete mode 100644 ci/requirements-3.6_PIP_BUILD_TEST.pip delete mode 100644 ci/requirements-3.6_PIP_BUILD_TEST.sh diff --git a/.travis.yml b/.travis.yml index b1168f18315c3..22ef6c819c6d4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -50,9 +50,6 @@ matrix: packages: - python-gtk2 # In allow_failures - - dist: trusty - env: - - JOB="3.5" TEST_ARGS="--skip-slow 
--skip-network" - dist: trusty env: - JOB="3.6" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" CONDA_FORGE=true COVERAGE=true @@ -63,36 +60,26 @@ matrix: # In allow_failures - dist: trusty env: - - JOB="3.6_PIP_BUILD_TEST" TEST_ARGS="--skip-slow" PIP_BUILD_TEST=true + - JOB="3.6_NUMPY_DEV" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" addons: apt: packages: - xsel # In allow_failures - - dist: trusty - env: - - JOB="3.6_NUMPY_DEV" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" - # In allow_failures - dist: trusty env: - JOB="3.6_DOC" DOC=true allow_failures: - - dist: trusty - env: - - JOB="3.5" TEST_ARGS="--skip-slow --skip-network" - dist: trusty env: - JOB="2.7_SLOW" SLOW=true - dist: trusty env: - - JOB="3.6_PIP_BUILD_TEST" TEST_ARGS="--skip-slow" PIP_BUILD_TEST=true + - JOB="3.6_NUMPY_DEV" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" addons: apt: packages: - xsel - - dist: trusty - env: - - JOB="3.6_NUMPY_DEV" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" - dist: trusty env: - JOB="3.6_DOC" DOC=true diff --git a/ci/install_travis.sh b/ci/install_travis.sh index 458ff083b65eb..9ccb4baf25505 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -115,15 +115,6 @@ if [ "$COVERAGE" ]; then pip install coverage pytest-cov fi -echo -if [ -z "$PIP_BUILD_TEST" ] ; then - - # build but don't install - echo "[build em]" - time python setup.py build_ext --inplace || exit 1 - -fi - # we may have run installations echo echo "[conda installs]" @@ -161,23 +152,8 @@ conda list pandas pip list --format columns |grep pandas # build and install -echo - -if [ "$PIP_BUILD_TEST" ]; then - - # build & install testing - echo "[building release]" - time bash scripts/build_dist_for_release.sh || exit 1 - conda uninstall -y cython - time pip install dist/*tar.gz || exit 1 - -else - - # install our pandas - echo "[running setup.py develop]" - python setup.py develop || exit 1 - -fi +echo "[running setup.py develop]" +python setup.py develop || exit 1 echo echo "[show pandas]" diff --git a/ci/requirements-3.6_PIP_BUILD_TEST.build b/ci/requirements-3.6_PIP_BUILD_TEST.build deleted file mode 100644 index 1c4b46aea3865..0000000000000 --- a/ci/requirements-3.6_PIP_BUILD_TEST.build +++ /dev/null @@ -1,6 +0,0 @@ -python=3.6* -python-dateutil -pytz -nomkl -numpy -cython diff --git a/ci/requirements-3.6_PIP_BUILD_TEST.pip b/ci/requirements-3.6_PIP_BUILD_TEST.pip deleted file mode 100644 index f4617133cad5b..0000000000000 --- a/ci/requirements-3.6_PIP_BUILD_TEST.pip +++ /dev/null @@ -1,6 +0,0 @@ -xarray -geopandas -seaborn -pandas_datareader -statsmodels -scikit-learn diff --git a/ci/requirements-3.6_PIP_BUILD_TEST.sh b/ci/requirements-3.6_PIP_BUILD_TEST.sh deleted file mode 100644 index 3a8cf673b32f2..0000000000000 --- a/ci/requirements-3.6_PIP_BUILD_TEST.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -source activate pandas - -echo "install 36 PIP_BUILD_TEST" - -conda install -n pandas -c conda-forge pyarrow dask pyqt qtpy diff --git a/ci/script_multi.sh b/ci/script_multi.sh index 45c61ee3172fe..2b2d4d5488b91 100755 --- a/ci/script_multi.sh +++ b/ci/script_multi.sh @@ -23,23 +23,7 @@ fi export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') echo PYTHONHASHSEED=$PYTHONHASHSEED -if [ "$PIP_BUILD_TEST" ] ; then - echo "[build-test]" - - echo "[env]" - pip list --format columns |grep pandas - - echo "[running]" - cd /tmp - unset PYTHONPATH - - echo 
"[build-test: single]" - python -c 'import pandas; pandas.test(["--skip-slow", "--skip-network", "-r xX", "-m single"])' - - echo "[build-test: not single]" - python -c 'import pandas; pandas.test(["-n 2", "--skip-slow", "--skip-network", "-r xX", "-m not single"])' - -elif [ "$DOC" ]; then +if [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" elif [ "$COVERAGE" ]; then diff --git a/ci/script_single.sh b/ci/script_single.sh index 021a5a7714fb5..f376c920ac71b 100755 --- a/ci/script_single.sh +++ b/ci/script_single.sh @@ -21,10 +21,7 @@ if echo "$TEST_ARGS" | grep -e --skip-network -q; then export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; fi -if [ "$PIP_BUILD_TEST" ]; then - echo "We are not running pytest as this is a build test." - -elif [ "$DOC" ]; then +if [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" elif [ "$COVERAGE" ]; then From e0f6beab9f94640e7a8686af33f96d37a9d94de9 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 19 Feb 2018 15:16:04 -0800 Subject: [PATCH 136/214] ENH: implement Timedelta.__mod__ and __divmod__ (#19755) --- doc/source/timedeltas.rst | 14 ++ doc/source/whatsnew/v0.23.0.txt | 14 ++ pandas/_libs/tslibs/timedeltas.pyx | 18 ++ .../tests/scalar/timedelta/test_arithmetic.py | 186 ++++++++++++++++++ 4 files changed, 232 insertions(+) diff --git a/doc/source/timedeltas.rst b/doc/source/timedeltas.rst index 50cff4c7bbdfb..5f3a01f0725d4 100644 --- a/doc/source/timedeltas.rst +++ b/doc/source/timedeltas.rst @@ -283,6 +283,20 @@ Rounded division (floor-division) of a ``timedelta64[ns]`` Series by a scalar td // pd.Timedelta(days=3, hours=4) pd.Timedelta(days=3, hours=4) // td +.. _timedeltas.mod_divmod: + +The mod (%) and divmod operations are defined for ``Timedelta`` when operating with another timedelta-like or with a numeric argument. + +.. ipython:: python + + pd.Timedelta(hours=37) % datetime.timedelta(hours=2) + + # divmod against a timedelta-like returns a pair (int, Timedelta) + divmod(datetime.timedelta(hours=2), pd.Timedelta(minutes=11)) + + # divmod against a numeric returns a pair (Timedelta, Timedelta) + divmod(pd.Timedelta(hours=25), 86400000000000) + Attributes ---------- diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 11c49995372f5..aa1e434aae6e9 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -117,6 +117,20 @@ resetting indexes. See the :ref:`Sorting by Indexes and Values # Sort by 'second' (index) and 'A' (column) df_multi.sort_values(by=['second', 'A']) +.. _whatsnew_0230.enhancements.timedelta_mod + +Timedelta mod method +^^^^^^^^^^^^^^^^^^^^ + +``mod`` (%) and ``divmod`` operations are now defined on ``Timedelta`` objects +when operating with either timedelta-like or with numeric arguments. +See the :ref:`documentation here `. (:issue:`19365`) + +.. ipython:: python + + td = pd.Timedelta(hours=37) + td % pd.Timedelta(minutes=45) + .. 
_whatsnew_0230.enhancements.ran_inf: ``.rank()`` handles ``inf`` values when ``NaN`` are present diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 37693068e0974..f10175fddd00b 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1149,6 +1149,24 @@ class Timedelta(_Timedelta): return np.nan return other.value // self.value + def __mod__(self, other): + # Naive implementation, room for optimization + return self.__divmod__(other)[1] + + def __rmod__(self, other): + # Naive implementation, room for optimization + return self.__rdivmod__(other)[1] + + def __divmod__(self, other): + # Naive implementation, room for optimization + div = self // other + return div, self - div * other + + def __rdivmod__(self, other): + # Naive implementation, room for optimization + div = other // self + return div, other - div * self + cdef _floordiv(int64_t value, right): return value // right diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index 90c911c24f6a9..43e9491b9de0b 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -420,3 +420,189 @@ def test_td_rfloordiv_numeric_series(self): assert res is NotImplemented with pytest.raises(TypeError): ser // td + + def test_mod_timedeltalike(self): + # GH#19365 + td = Timedelta(hours=37) + + # Timedelta-like others + result = td % Timedelta(hours=6) + assert isinstance(result, Timedelta) + assert result == Timedelta(hours=1) + + result = td % timedelta(minutes=60) + assert isinstance(result, Timedelta) + assert result == Timedelta(0) + + result = td % NaT + assert result is NaT + + @pytest.mark.xfail(reason='GH#19378 floordiv td64 returns td64') + def test_mod_timedelta64_nat(self): + # GH#19365 + td = Timedelta(hours=37) + + result = td % np.timedelta64('NaT', 'ns') + assert result is NaT + + @pytest.mark.xfail(reason='GH#19378 floordiv td64 returns td64') + def test_mod_timedelta64(self): + # GH#19365 + td = Timedelta(hours=37) + + result = td % np.timedelta64(2, 'h') + assert isinstance(result, Timedelta) + assert result == Timedelta(hours=1) + + @pytest.mark.xfail(reason='GH#19378 floordiv by Tick not implemented') + def test_mod_offset(self): + # GH#19365 + td = Timedelta(hours=37) + + result = td % pd.offsets.Hour(5) + assert isinstance(result, Timedelta) + assert result == Timedelta(hours=2) + + # ---------------------------------------------------------------- + # Timedelta.__mod__, __rmod__ + + def test_mod_numeric(self): + # GH#19365 + td = Timedelta(hours=37) + + # Numeric Others + result = td % 2 + assert isinstance(result, Timedelta) + assert result == Timedelta(0) + + result = td % 1e12 + assert isinstance(result, Timedelta) + assert result == Timedelta(minutes=3, seconds=20) + + result = td % int(1e12) + assert isinstance(result, Timedelta) + assert result == Timedelta(minutes=3, seconds=20) + + def test_mod_invalid(self): + # GH#19365 + td = Timedelta(hours=37) + + with pytest.raises(TypeError): + td % pd.Timestamp('2018-01-22') + + with pytest.raises(TypeError): + td % [] + + def test_rmod_pytimedelta(self): + # GH#19365 + td = Timedelta(minutes=3) + + result = timedelta(minutes=4) % td + assert isinstance(result, Timedelta) + assert result == Timedelta(minutes=1) + + @pytest.mark.xfail(reason='GH#19378 floordiv by Tick not implemented') + def test_rmod_timedelta64(self): + # GH#19365 + td = Timedelta(minutes=3) + result = 
np.timedelta64(5, 'm') % td + assert isinstance(result, Timedelta) + assert result == Timedelta(minutes=2) + + def test_rmod_invalid(self): + # GH#19365 + td = Timedelta(minutes=3) + + with pytest.raises(TypeError): + pd.Timestamp('2018-01-22') % td + + with pytest.raises(TypeError): + 15 % td + + with pytest.raises(TypeError): + 16.0 % td + + with pytest.raises(TypeError): + np.array([22, 24]) % td + + # ---------------------------------------------------------------- + # Timedelta.__divmod__, __rdivmod__ + + def test_divmod_numeric(self): + # GH#19365 + td = Timedelta(days=2, hours=6) + + result = divmod(td, 53 * 3600 * 1e9) + assert result[0] == Timedelta(1, unit='ns') + assert isinstance(result[1], Timedelta) + assert result[1] == Timedelta(hours=1) + + assert result + result = divmod(td, np.nan) + assert result[0] is pd.NaT + assert result[1] is pd.NaT + + def test_divmod(self): + # GH#19365 + td = Timedelta(days=2, hours=6) + + result = divmod(td, timedelta(days=1)) + assert result[0] == 2 + assert isinstance(result[1], Timedelta) + assert result[1] == Timedelta(hours=6) + + result = divmod(td, 54) + assert result[0] == Timedelta(hours=1) + assert isinstance(result[1], Timedelta) + assert result[1] == Timedelta(0) + + result = divmod(td, pd.NaT) + assert np.isnan(result[0]) + assert result[1] is pd.NaT + + @pytest.mark.xfail(reason='GH#19378 floordiv by Tick not implemented') + def test_divmod_offset(self): + # GH#19365 + td = Timedelta(days=2, hours=6) + + result = divmod(td, pd.offsets.Hour(-4)) + assert result[0] == -14 + assert isinstance(result[1], Timedelta) + assert result[1] == Timedelta(hours=-2) + + def test_divmod_invalid(self): + # GH#19365 + td = Timedelta(days=2, hours=6) + + with pytest.raises(TypeError): + divmod(td, pd.Timestamp('2018-01-22')) + + def test_rdivmod_pytimedelta(self): + # GH#19365 + result = divmod(timedelta(days=2, hours=6), Timedelta(days=1)) + assert result[0] == 2 + assert isinstance(result[1], Timedelta) + assert result[1] == Timedelta(hours=6) + + @pytest.mark.xfail(reason='GH#19378 floordiv by Tick not implemented') + def test_rdivmod_offset(self): + result = divmod(pd.offsets.Hour(54), Timedelta(hours=-4)) + assert result[0] == -14 + assert isinstance(result[1], Timedelta) + assert result[1] == Timedelta(hours=-2) + + def test_rdivmod_invalid(self): + # GH#19365 + td = Timedelta(minutes=3) + + with pytest.raises(TypeError): + divmod(pd.Timestamp('2018-01-22'), td) + + with pytest.raises(TypeError): + divmod(15, td) + + with pytest.raises(TypeError): + divmod(16.0, td) + + with pytest.raises(TypeError): + divmod(np.array([22, 24]), td) From 21dbe7ac61b2ec5b5248051c80937d59f226c57e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 19 Feb 2018 15:16:36 -0800 Subject: [PATCH 137/214] Fix Timedelta floordiv, rfloordiv with offset, fix td64 return types (#19770) --- doc/source/whatsnew/v0.23.0.txt | 2 ++ pandas/_libs/tslibs/timedeltas.pyx | 16 +++++++++++++++- .../tests/scalar/timedelta/test_arithmetic.py | 18 +++++++++++------- 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index aa1e434aae6e9..2f820043d7b6f 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -730,6 +730,8 @@ Datetimelike - Bug in :class:`Timestamp` and :func:`to_datetime` where a string representing a barely out-of-bounds timestamp would be incorrectly rounded down instead of raising ``OutOfBoundsDatetime`` (:issue:`19382`) - Bug in :func:`Timestamp.floor` 
:func:`DatetimeIndex.floor` where time stamps far in the future and past were not rounded correctly (:issue:`19206`) - Bug in :func:`to_datetime` where passing an out-of-bounds datetime with ``errors='coerce'`` and ``utc=True`` would raise ``OutOfBoundsDatetime`` instead of parsing to ``NaT`` (:issue:`19612`) +- Bug in :func:`Timedelta.__add__`, :func:`Timedelta.__sub__` where adding or subtracting a ``np.timedelta64`` object would return another ``np.timedelta64`` instead of a ``Timedelta`` (:issue:`19738`) +- Bug in :func:`Timedelta.__floordiv__`, :func:`Timedelta.__rfloordiv__` where operating with a ``Tick`` object would raise a ``TypeError`` instead of returning a numeric value (:issue:`19738`) - Timezones diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index f10175fddd00b..4483225e1801d 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -478,11 +478,16 @@ def _binary_op_method_timedeltalike(op, name): elif other is NaT: return NaT + elif is_timedelta64_object(other): + # convert to Timedelta below; avoid catching this in + # has-dtype check before then + pass + elif is_datetime64_object(other) or PyDateTime_CheckExact(other): # the PyDateTime_CheckExact case is for a datetime object that # is specifically *not* a Timestamp, as the Timestamp case will be # handled after `_validate_ops_compat` returns False below - from ..tslib import Timestamp + from timestamps import Timestamp return op(self, Timestamp(other)) # We are implicitly requiring the canonical behavior to be # defined by Timestamp methods. @@ -503,6 +508,9 @@ def _binary_op_method_timedeltalike(op, name): # failed to parse as timedelta return NotImplemented + if other is NaT: + # e.g. if original other was timedelta64('NaT') + return NaT return Timedelta(op(self.value, other.value), unit='ns') f.__name__ = name @@ -1096,6 +1104,9 @@ class Timedelta(_Timedelta): # just defer if hasattr(other, '_typ'): # Series, DataFrame, ... + if other._typ == 'dateoffset' and hasattr(other, 'delta'): + # Tick offset + return self // other.delta return NotImplemented if hasattr(other, 'dtype'): @@ -1128,6 +1139,9 @@ class Timedelta(_Timedelta): # just defer if hasattr(other, '_typ'): # Series, DataFrame, ... 
+ if other._typ == 'dateoffset' and hasattr(other, 'delta'): + # Tick offset + return other.delta // self return NotImplemented if hasattr(other, 'dtype'): diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index 43e9491b9de0b..48da23f3575ab 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -100,7 +100,6 @@ def test_td_add_pytimedelta(self, op): assert isinstance(result, Timedelta) assert result == Timedelta(days=19) - @pytest.mark.xfail(reason='GH#19738 argument not converted to Timedelta') @pytest.mark.parametrize('op', [operator.add, ops.radd]) def test_td_add_timedelta64(self, op): td = Timedelta(10, unit='d') @@ -130,13 +129,11 @@ def test_td_sub_pytimedelta(self): assert isinstance(result, Timedelta) assert result == expected - @pytest.mark.xfail(reason='GH#19738 argument not converted to Timedelta') def test_td_sub_timedelta64(self): td = Timedelta(10, unit='d') expected = Timedelta(0, unit='ns') result = td - td.to_timedelta64() assert isinstance(result, Timedelta) - # comparison fails even if we comment out the isinstance assertion assert result == expected def test_td_sub_nat(self): @@ -144,7 +141,6 @@ def test_td_sub_nat(self): result = td - NaT assert result is NaT - @pytest.mark.xfail(reason='GH#19738 argument not converted to Timedelta') def test_td_sub_td64_nat(self): td = Timedelta(10, unit='d') result = td - np.timedelta64('NaT') @@ -171,7 +167,6 @@ def test_td_rsub_pytimedelta(self): assert isinstance(result, Timedelta) assert result == expected - @pytest.mark.xfail(reason='GH#19738 argument not converted to Timedelta') def test_td_rsub_timedelta64(self): td = Timedelta(10, unit='d') expected = Timedelta(0, unit='ns') @@ -188,7 +183,6 @@ def test_td_rsub_nat(self): result = np.datetime64('NaT') - td assert result is NaT - @pytest.mark.xfail(reason='GH#19738 argument not converted to Timedelta') def test_td_rsub_td64_nat(self): td = Timedelta(10, unit='d') result = np.timedelta64('NaT') - td @@ -304,6 +298,12 @@ def test_td_floordiv_null_scalar(self): assert np.isnan(td // NaT) assert np.isnan(td // np.timedelta64('NaT')) + def test_td_floordiv_offsets(self): + # GH#19738 + td = Timedelta(hours=3, minutes=4) + assert td // pd.offsets.Hour(1) == 3 + assert td // pd.offsets.Minute(2) == 92 + def test_td_floordiv_invalid_scalar(self): # GH#18846 td = Timedelta(hours=3, minutes=4) @@ -322,7 +322,7 @@ def test_td_floordiv_numeric_scalar(self): assert td // np.int32(2.0) == expected assert td // np.uint8(2.0) == expected - def test_floordiv_timedeltalike_array(self): + def test_td_floordiv_timedeltalike_array(self): # GH#18846 td = Timedelta(hours=3, minutes=4) scalar = Timedelta(hours=3, minutes=3) @@ -371,6 +371,10 @@ def test_td_rfloordiv_null_scalar(self): assert np.isnan(td.__rfloordiv__(NaT)) assert np.isnan(td.__rfloordiv__(np.timedelta64('NaT'))) + def test_td_rfloordiv_offsets(self): + # GH#19738 + assert pd.offsets.Hour(1) // Timedelta(minutes=25) == 2 + def test_td_rfloordiv_invalid_scalar(self): # GH#18846 td = Timedelta(hours=3, minutes=3) From 8aa55a9d6e5e208a19939b27475f534bd51a997d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 19 Feb 2018 15:19:02 -0800 Subject: [PATCH 138/214] Reduce redirection in ops (#19649) --- pandas/core/ops.py | 228 ++++++++++++++++++++++++----------- pandas/core/panel.py | 3 +- pandas/core/sparse/array.py | 4 +- pandas/core/sparse/series.py | 4 +- 4 files changed, 159 insertions(+), 80 deletions(-) 
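The key change in the diff below is the factory signature: instead of being handed a pre-computed method name and numexpr string, each arithmetic/comparison/bool method factory is now called as ``f(cls, op, special)`` and derives the name and ``str_rep`` itself through the new ``_get_op_name`` and ``_get_opstr`` helpers. A minimal, self-contained sketch of the naming helper, mirroring the function added to ``pandas/core/ops.py`` below:

    import operator

    def _get_op_name(op, special):
        # strip underscores from the operator name (e.g. 'and_' -> 'and')
        # and wrap it in dunders when building special methods, exactly as
        # the helper added in pandas/core/ops.py does
        opname = op.__name__.strip('_')
        if special:
            opname = '__{opname}__'.format(opname=opname)
        return opname

    _get_op_name(operator.add, special=True)    # '__add__'
    _get_op_name(operator.and_, special=True)   # '__and__'
    _get_op_name(operator.sub, special=False)   # 'sub'
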
diff --git a/pandas/core/ops.py b/pandas/core/ops.py index dff2b6844af94..da65f1f31ed2a 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -208,6 +208,78 @@ def _get_frame_op_default_axis(name): return 'columns' +def _get_opstr(op, cls): + """ + Find the operation string, if any, to pass to numexpr for this + operation. + + Parameters + ---------- + op : binary operator + cls : class + + Returns + ------- + op_str : string or None + """ + # numexpr is available for non-sparse classes + subtyp = getattr(cls, '_subtyp', '') + use_numexpr = 'sparse' not in subtyp + + if not use_numexpr: + # if we're not using numexpr, then don't pass a str_rep + return None + + return {operator.add: '+', + radd: '+', + operator.mul: '*', + rmul: '*', + operator.sub: '-', + rsub: '-', + operator.truediv: '/', + rtruediv: '/', + operator.floordiv: '//', + rfloordiv: '//', + operator.mod: None, # TODO: Why None for mod but '%' for rmod? + rmod: '%', + operator.pow: '**', + rpow: '**', + operator.eq: '==', + operator.ne: '!=', + operator.le: '<=', + operator.lt: '<', + operator.ge: '>=', + operator.gt: '>', + operator.and_: '&', + rand_: '&', + operator.or_: '|', + ror_: '|', + operator.xor: '^', + rxor: '^', + divmod: None, + rdivmod: None}[op] + + +def _get_op_name(op, special): + """ + Find the name to attach to this method according to conventions + for special and non-special methods. + + Parameters + ---------- + op : binary operator + special : bool + + Returns + ------- + op_name : str + """ + opname = op.__name__.strip('_') + if special: + opname = '__{opname}__'.format(opname=opname) + return opname + + # ----------------------------------------------------------------------------- # Docstring Generation and Templates @@ -501,48 +573,29 @@ def _create_methods(cls, arith_method, comp_method, bool_method, # creates actual methods based upon arithmetic, comp and bool method # constructors. 
- # numexpr is available for non-sparse classes - subtyp = getattr(cls, '_subtyp', '') - use_numexpr = 'sparse' not in subtyp - have_divmod = issubclass(cls, ABCSeries) # divmod is available for Series and SparseSeries - # if we're not using numexpr, then don't pass a str_rep - if use_numexpr: - op = lambda x: x - else: - op = lambda x: None - if special: - - def names(x): - if x[-1] == "_": - return "__{name}_".format(name=x) - else: - return "__{name}__".format(name=x) - else: - names = lambda x: x - # yapf: disable new_methods = dict( - add=arith_method(operator.add, names('add'), op('+')), - radd=arith_method(radd, names('radd'), op('+')), - sub=arith_method(operator.sub, names('sub'), op('-')), - mul=arith_method(operator.mul, names('mul'), op('*')), - truediv=arith_method(operator.truediv, names('truediv'), op('/')), - floordiv=arith_method(operator.floordiv, names('floordiv'), op('//')), + add=arith_method(cls, operator.add, special), + radd=arith_method(cls, radd, special), + sub=arith_method(cls, operator.sub, special), + mul=arith_method(cls, operator.mul, special), + truediv=arith_method(cls, operator.truediv, special), + floordiv=arith_method(cls, operator.floordiv, special), # Causes a floating point exception in the tests when numexpr enabled, # so for now no speedup - mod=arith_method(operator.mod, names('mod'), None), - pow=arith_method(operator.pow, names('pow'), op('**')), + mod=arith_method(cls, operator.mod, special), + pow=arith_method(cls, operator.pow, special), # not entirely sure why this is necessary, but previously was included # so it's here to maintain compatibility - rmul=arith_method(operator.mul, names('rmul'), op('*')), - rsub=arith_method(rsub, names('rsub'), op('-')), - rtruediv=arith_method(rtruediv, names('rtruediv'), op('/')), - rfloordiv=arith_method(rfloordiv, names('rfloordiv'), op('//')), - rpow=arith_method(rpow, names('rpow'), op('**')), - rmod=arith_method(rmod, names('rmod'), op('%'))) + rmul=arith_method(cls, rmul, special), + rsub=arith_method(cls, rsub, special), + rtruediv=arith_method(cls, rtruediv, special), + rfloordiv=arith_method(cls, rfloordiv, special), + rpow=arith_method(cls, rpow, special), + rmod=arith_method(cls, rmod, special)) # yapf: enable new_methods['div'] = new_methods['truediv'] new_methods['rdiv'] = new_methods['rtruediv'] @@ -550,26 +603,30 @@ def names(x): # Comp methods never had a default axis set if comp_method: new_methods.update(dict( - eq=comp_method(operator.eq, names('eq'), op('==')), - ne=comp_method(operator.ne, names('ne'), op('!=')), - lt=comp_method(operator.lt, names('lt'), op('<')), - gt=comp_method(operator.gt, names('gt'), op('>')), - le=comp_method(operator.le, names('le'), op('<=')), - ge=comp_method(operator.ge, names('ge'), op('>=')))) + eq=comp_method(cls, operator.eq, special), + ne=comp_method(cls, operator.ne, special), + lt=comp_method(cls, operator.lt, special), + gt=comp_method(cls, operator.gt, special), + le=comp_method(cls, operator.le, special), + ge=comp_method(cls, operator.ge, special))) if bool_method: new_methods.update( - dict(and_=bool_method(operator.and_, names('and_'), op('&')), - or_=bool_method(operator.or_, names('or_'), op('|')), + dict(and_=bool_method(cls, operator.and_, special), + or_=bool_method(cls, operator.or_, special), # For some reason ``^`` wasn't used in original. 
- xor=bool_method(operator.xor, names('xor'), op('^')), - rand_=bool_method(rand_, names('rand_'), op('&')), - ror_=bool_method(ror_, names('ror_'), op('|')), - rxor=bool_method(rxor, names('rxor'), op('^')))) + xor=bool_method(cls, operator.xor, special), + rand_=bool_method(cls, rand_, special), + ror_=bool_method(cls, ror_, special), + rxor=bool_method(cls, rxor, special))) if have_divmod: # divmod doesn't have an op that is supported by numexpr - new_methods['divmod'] = arith_method(divmod, names('divmod'), None) + new_methods['divmod'] = arith_method(cls, divmod, special) - new_methods = {names(k): v for k, v in new_methods.items()} + if special: + dunderize = lambda x: '__{name}__'.format(name=x.strip('_')) + else: + dunderize = lambda x: x + new_methods = {dunderize(k): v for k, v in new_methods.items()} return new_methods @@ -596,16 +653,15 @@ def add_special_arithmetic_methods(cls, arith_method=None, Parameters ---------- arith_method : function (optional) - factory for special arithmetic methods, with op string: - f(op, name, str_rep) + factory for special arithmetic methods: + f(cls, op, special) comp_method : function (optional) - factory for rich comparison - signature: f(op, name, str_rep) + factory for rich comparison - signature: f(cls, op, special) bool_method : function (optional) - factory for boolean methods - signature: f(op, name, str_rep) + factory for boolean methods - signature: f(cls, op, special) """ new_methods = _create_methods(cls, arith_method, comp_method, bool_method, special=True) - # inplace operators (I feel like these should get passed an `inplace=True` # or just be removed @@ -645,8 +701,7 @@ def f(self, other): add_methods(cls, new_methods=new_methods) -def add_flex_arithmetic_methods(cls, flex_arith_method, - flex_comp_method=None, flex_bool_method=None): +def add_flex_arithmetic_methods(cls, flex_arith_method, flex_comp_method=None): """ Adds the full suite of flex arithmetic methods (``pow``, ``mul``, ``add``) to the class. @@ -654,13 +709,13 @@ def add_flex_arithmetic_methods(cls, flex_arith_method, Parameters ---------- flex_arith_method : function - factory for flex arithmetic methods, with op string: - f(op, name, str_rep) + factory for flex arithmetic methods: + f(cls, op, special) flex_comp_method : function, optional, - factory for rich comparison - signature: f(op, name, str_rep) + factory for rich comparison - signature: f(cls, op, special) """ new_methods = _create_methods(cls, flex_arith_method, - flex_comp_method, flex_bool_method, + flex_comp_method, bool_method=None, special=False) new_methods.update(dict(multiply=new_methods['mul'], subtract=new_methods['sub'], @@ -719,11 +774,13 @@ def _construct_divmod_result(left, result, index, name, dtype): ) -def _arith_method_SERIES(op, name, str_rep): +def _arith_method_SERIES(cls, op, special): """ Wrapper function for Series arithmetic operations, to avoid code duplication. """ + str_rep = _get_opstr(op, cls) + name = _get_op_name(op, special) eval_kwargs = _gen_eval_kwargs(name) fill_zeros = _gen_fill_zeros(name) construct_result = (_construct_divmod_result @@ -856,11 +913,12 @@ def _comp_method_OBJECT_ARRAY(op, x, y): return result -def _comp_method_SERIES(op, name, str_rep): +def _comp_method_SERIES(cls, op, special): """ Wrapper function for Series arithmetic operations, to avoid code duplication. 
""" + name = _get_op_name(op, special) masker = _gen_eval_kwargs(name).get('masker', False) def na_op(x, y): @@ -995,7 +1053,7 @@ def wrapper(self, other, axis=None): return wrapper -def _bool_method_SERIES(op, name, str_rep): +def _bool_method_SERIES(cls, op, special): """ Wrapper function for Series arithmetic operations, to avoid code duplication. @@ -1066,7 +1124,8 @@ def wrapper(self, other): return wrapper -def _flex_method_SERIES(op, name, str_rep): +def _flex_method_SERIES(cls, op, special): + name = _get_op_name(op, special) doc = _make_flex_doc(name, 'series') @Appender(doc) @@ -1192,7 +1251,9 @@ def to_series(right): return right -def _arith_method_FRAME(op, name, str_rep=None): +def _arith_method_FRAME(cls, op, special): + str_rep = _get_opstr(op, cls) + name = _get_op_name(op, special) eval_kwargs = _gen_eval_kwargs(name) fill_zeros = _gen_fill_zeros(name) default_axis = _get_frame_op_default_axis(name) @@ -1270,7 +1331,9 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): return f -def _flex_comp_method_FRAME(op, name, str_rep=None): +def _flex_comp_method_FRAME(cls, op, special): + str_rep = _get_opstr(op, cls) + name = _get_op_name(op, special) default_axis = _get_frame_op_default_axis(name) def na_op(x, y): @@ -1306,7 +1369,10 @@ def f(self, other, axis=default_axis, level=None): return f -def _comp_method_FRAME(func, name, str_rep): +def _comp_method_FRAME(cls, func, special): + str_rep = _get_opstr(func, cls) + name = _get_op_name(func, special) + @Appender('Wrapper for comparison method {name}'.format(name=name)) def f(self, other): if isinstance(other, ABCDataFrame): @@ -1345,8 +1411,10 @@ def f(self, other): # ----------------------------------------------------------------------------- # Panel -def _arith_method_PANEL(op, name, str_rep=None): +def _arith_method_PANEL(cls, op, special): # work only for scalars + name = _get_op_name(op, special) + def f(self, other): if not is_scalar(other): raise ValueError('Simple arithmetic with {name} can only be ' @@ -1359,7 +1427,10 @@ def f(self, other): return f -def _comp_method_PANEL(op, name, str_rep=None): +def _comp_method_PANEL(cls, op, special): + str_rep = _get_opstr(op, cls) + name = _get_op_name(op, special) + def na_op(x, y): import pandas.core.computation.expressions as expressions @@ -1389,7 +1460,9 @@ def f(self, other, axis=None): return f -def _flex_method_PANEL(op, name, str_rep=None): +def _flex_method_PANEL(cls, op, special): + str_rep = _get_opstr(op, cls) + name = _get_op_name(op, special) eval_kwargs = _gen_eval_kwargs(name) fill_zeros = _gen_fill_zeros(name) @@ -1427,18 +1500,19 @@ def f(self, other, axis=0): comp_method=_comp_method_PANEL, bool_method=_arith_method_PANEL) +panel_flex_funcs = dict(flex_arith_method=_flex_method_PANEL, + flex_comp_method=_comp_method_PANEL) # ----------------------------------------------------------------------------- # Sparse -def _arith_method_SPARSE_SERIES(op, name, str_rep=None): +def _arith_method_SPARSE_SERIES(cls, op, special): """ Wrapper function for Series arithmetic operations, to avoid code duplication. - - str_rep is not used, but is present for compatibility. 
""" + name = _get_op_name(op, special) def wrapper(self, other): if isinstance(other, ABCDataFrame): @@ -1476,11 +1550,12 @@ def _sparse_series_op(left, right, op, name): return left._constructor(result, index=new_index, name=new_name) -def _arith_method_SPARSE_ARRAY(op, name, str_rep=None): +def _arith_method_SPARSE_ARRAY(cls, op, special): """ Wrapper function for Series arithmetic operations, to avoid code duplication. """ + name = _get_op_name(op, special) def wrapper(self, other): from pandas.core.sparse.array import ( @@ -1508,3 +1583,12 @@ def wrapper(self, other): name = name[2:-2] wrapper.__name__ = name return wrapper + + +sparse_array_special_funcs = dict(arith_method=_arith_method_SPARSE_ARRAY, + comp_method=_arith_method_SPARSE_ARRAY, + bool_method=_arith_method_SPARSE_ARRAY) + +sparse_series_special_funcs = dict(arith_method=_arith_method_SPARSE_SERIES, + comp_method=_arith_method_SPARSE_SERIES, + bool_method=None) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 7f973992fb07f..3be1e3ef8734d 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -1528,8 +1528,7 @@ def _extract_axis(self, data, axis=0, intersect=False): 'minor_axis': 'columns'}) ops.add_special_arithmetic_methods(Panel, **ops.panel_special_funcs) -ops.add_flex_arithmetic_methods(Panel, ops._flex_method_PANEL, - flex_comp_method=ops._comp_method_PANEL) +ops.add_flex_arithmetic_methods(Panel, **ops.panel_flex_funcs) Panel._add_numeric_operations() diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index 3cbae717d0e07..4f7152666f7bf 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -844,6 +844,4 @@ def _make_index(length, indices, kind): ops.add_special_arithmetic_methods(SparseArray, - arith_method=ops._arith_method_SPARSE_ARRAY, - comp_method=ops._arith_method_SPARSE_ARRAY, - bool_method=ops._arith_method_SPARSE_ARRAY) + **ops.sparse_array_special_funcs) diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 62a467bec2683..335a4c80adc63 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -817,6 +817,4 @@ def from_coo(cls, A, dense_index=False): # overwrite basic arithmetic to use SparseSeries version # force methods to overwrite previous definitions. 
ops.add_special_arithmetic_methods(SparseSeries, - ops._arith_method_SPARSE_SERIES, - comp_method=ops._arith_method_SPARSE_SERIES, - bool_method=None) + **ops.sparse_series_special_funcs) From e022d9a0886b245afe4af4cc43904a112cd00882 Mon Sep 17 00:00:00 2001 From: Matt Kirk Date: Tue, 20 Feb 2018 06:38:42 +0700 Subject: [PATCH 139/214] Fix the non cython build for cpp extensions (#19707) --- setup.py | 96 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 50 insertions(+), 46 deletions(-) diff --git a/setup.py b/setup.py index c66979dd19ef0..c7784260d79ca 100755 --- a/setup.py +++ b/setup.py @@ -311,7 +311,6 @@ class CheckSDist(sdist_class): 'pandas/_libs/missing.pyx', 'pandas/_libs/reduction.pyx', 'pandas/_libs/testing.pyx', - 'pandas/_libs/window.pyx', 'pandas/_libs/skiplist.pyx', 'pandas/_libs/sparse.pyx', 'pandas/_libs/parsers.pyx', @@ -331,6 +330,10 @@ class CheckSDist(sdist_class): 'pandas/_libs/writers.pyx', 'pandas/io/sas/sas.pyx'] + _cpp_pyxfiles = ['pandas/_libs/window.pyx', + 'pandas/io/msgpack/_packer.pyx', + 'pandas/io/msgpack/_unpacker.pyx'] + def initialize_options(self): sdist_class.initialize_options(self) @@ -338,12 +341,17 @@ def run(self): if 'cython' in cmdclass: self.run_command('cython') else: - for pyxfile in self._pyxfiles: - cfile = pyxfile[:-3] + 'c' - msg = ("C-source file '{source}' not found.\n" - "Run 'setup.py cython' before sdist.".format( - source=cfile)) - assert os.path.isfile(cfile), msg + # If we are not running cython then + # compile the extensions correctly + pyx_files = [(self._pyxfiles, 'c'), (self._cpp_pyxfiles, 'cpp')] + + for pyxfiles, extension in pyx_files: + for pyxfile in pyxfiles: + sourcefile = pyxfile[:-3] + extension + msg = ("{extension}-source file '{source}' not found.\n" + "Run 'setup.py cython' before sdist.".format( + source=sourcefile, extension=extension)) + assert os.path.isfile(sourcefile), msg sdist_class.run(self) @@ -417,6 +425,11 @@ def get_tag(self): cmdclass['build_src'] = DummyBuildSrc cmdclass['build_ext'] = CheckingBuildExt +if sys.byteorder == 'big': + endian_macro = [('__BIG_ENDIAN__', '1')] +else: + endian_macro = [('__LITTLE_ENDIAN__', '1')] + lib_depends = ['inference'] @@ -453,6 +466,7 @@ def pxd(name): 'pandas/_libs/src/datetime/np_datetime_strings.h'] np_datetime_sources = ['pandas/_libs/src/datetime/np_datetime.c', 'pandas/_libs/src/datetime/np_datetime_strings.c'] + tseries_depends = np_datetime_headers + ['pandas/_libs/tslibs/np_datetime.pxd'] # some linux distros require it @@ -618,17 +632,42 @@ def pxd(name): '_libs.window': { 'pyxfile': '_libs/window', 'pxdfiles': ['_libs/skiplist', '_libs/src/util'], - 'language': 'c++'}, + 'language': 'c++', + 'suffix': '.cpp'}, '_libs.writers': { 'pyxfile': '_libs/writers', 'pxdfiles': ['_libs/src/util']}, 'io.sas._sas': { - 'pyxfile': 'io/sas/sas'}} + 'pyxfile': 'io/sas/sas'}, + 'io.msgpack._packer': { + 'macros': endian_macro, + 'depends': ['pandas/_libs/src/msgpack/pack.h', + 'pandas/_libs/src/msgpack/pack_template.h'], + 'include': ['pandas/_libs/src/msgpack'] + common_include, + 'language': 'c++', + 'suffix': '.cpp', + 'pyxfile': 'io/msgpack/_packer', + 'subdir': 'io/msgpack'}, + 'io.msgpack._unpacker': { + 'depends': ['pandas/_libs/src/msgpack/unpack.h', + 'pandas/_libs/src/msgpack/unpack_define.h', + 'pandas/_libs/src/msgpack/unpack_template.h'], + 'macros': endian_macro, + 'include': ['pandas/_libs/src/msgpack'] + common_include, + 'language': 'c++', + 'suffix': '.cpp', + 'pyxfile': 'io/msgpack/_unpacker', + 'subdir': 'io/msgpack' + } +} 
extensions = [] for name, data in ext_data.items(): - sources = [srcpath(data['pyxfile'], suffix=suffix, subdir='')] + source_suffix = suffix if suffix == '.pyx' else data.get('suffix', '.c') + + sources = [srcpath(data['pyxfile'], suffix=source_suffix, subdir='')] + pxds = [pxd(x) for x in data.get('pxdfiles', [])] if suffix == '.pyx' and pxds: sources.extend(pxds) @@ -642,46 +681,11 @@ def pxd(name): depends=data.get('depends', []), include_dirs=include, language=data.get('language', 'c'), + define_macros=data.get('macros', []), extra_compile_args=extra_compile_args) extensions.append(obj) -# ---------------------------------------------------------------------- -# msgpack - -if sys.byteorder == 'big': - macros = [('__BIG_ENDIAN__', '1')] -else: - macros = [('__LITTLE_ENDIAN__', '1')] - -msgpack_include = ['pandas/_libs/src/msgpack'] + common_include -msgpack_suffix = suffix if suffix == '.pyx' else '.cpp' -unpacker_depends = ['pandas/_libs/src/msgpack/unpack.h', - 'pandas/_libs/src/msgpack/unpack_define.h', - 'pandas/_libs/src/msgpack/unpack_template.h'] - -packer_ext = Extension('pandas.io.msgpack._packer', - depends=['pandas/_libs/src/msgpack/pack.h', - 'pandas/_libs/src/msgpack/pack_template.h'], - sources=[srcpath('_packer', - suffix=msgpack_suffix, - subdir='io/msgpack')], - language='c++', - include_dirs=msgpack_include, - define_macros=macros, - extra_compile_args=extra_compile_args) -unpacker_ext = Extension('pandas.io.msgpack._unpacker', - depends=unpacker_depends, - sources=[srcpath('_unpacker', - suffix=msgpack_suffix, - subdir='io/msgpack')], - language='c++', - include_dirs=msgpack_include, - define_macros=macros, - extra_compile_args=extra_compile_args) -extensions.append(packer_ext) -extensions.append(unpacker_ext) - # ---------------------------------------------------------------------- # ujson From af5e8ecc800493c0f2578daa25354ae32e08b544 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 19 Feb 2018 20:21:33 -0500 Subject: [PATCH 140/214] DOC: whatsnew typo cleanup --- doc/source/whatsnew/v0.23.0.txt | 53 ++++++++++++++++----------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 2f820043d7b6f..7bd47c7172671 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -16,10 +16,10 @@ New features .. _whatsnew_0210.enhancements.limit_area: ``DataFrame.interpolate`` has gained the ``limit_area`` kwarg -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :meth:`DataFrame.interpolate` has gained a ``limit_area`` parameter to allow further control of which ``NaN`` s are replaced. -Use `limit_area='inside'` to fill only NaNs surrounded by valid values or use `limit_area='outside'` to fill only ``NaN`` s +Use ``limit_area='inside'`` to fill only NaNs surrounded by valid values or use ``limit_area='outside'`` to fill only ``NaN`` s outside the existing valid values while preserving those inside. (:issue:`16284`) See the :ref:`full documentation here `. @@ -352,13 +352,13 @@ Dependencies have increased minimum versions We have updated our minimum supported versions of dependencies (:issue:`15184`). 
If installed, we now require: - +-----------------+-----------------+----------+ - | Package | Minimum Version | Required | - +=================+=================+==========+ - | python-dateutil | 2.5.0 | X | - +-----------------+-----------------+----------+ - | openpyxl | 2.4.0 | | - +-----------------+-----------------+----------+ ++-----------------+-----------------+----------+ +| Package | Minimum Version | Required | ++=================+=================+==========+ +| python-dateutil | 2.5.0 | X | ++-----------------+-----------------+----------+ +| openpyxl | 2.4.0 | | ++-----------------+-----------------+----------+ .. _whatsnew_0230.api_breaking.deprecate_panel: @@ -391,7 +391,7 @@ Convert to an xarray DataArray .. _whatsnew_0230.api_breaking.apply: Changes to make output of ``DataFrame.apply`` consistent -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :func:`DataFrame.apply` was inconsistent when applying an arbitrary user-defined-function that returned a list-like with ``axis=1``. Several bugs and inconsistencies are resolved. If the applied function returns a Series, then pandas will return a DataFrame; otherwise a Series will be returned, this includes the case @@ -454,7 +454,7 @@ Returning a ``Series`` allows one to control the exact return structure and colu .. ipython:: python - df.apply(lambda x: Series([1, 2, 3], index=['D', 'E', 'F']]), axis=1) + df.apply(lambda x: Series([1, 2, 3], index=['D', 'E', 'F']), axis=1) .. _whatsnew_0230.api_breaking.build_changes: @@ -555,7 +555,7 @@ Other API Changes - ``pandas.tseries.frequencies.get_freq_group()`` and ``pandas.tseries.frequencies.DAYS`` are removed from the public API (:issue:`18034`) - :func:`Series.truncate` and :func:`DataFrame.truncate` will raise a ``ValueError`` if the index is not sorted instead of an unhelpful ``KeyError`` (:issue:`17935`) - :func:`Index.map` can now accept ``Series`` and dictionary input objects (:issue:`12756`, :issue:`18482`, :issue:`18509`). -- :func:`Dataframe.unstack` will now default to filling with ``np.nan`` for ``object`` columns. (:issue:`12815`) +- :func:`DataFrame.unstack` will now default to filling with ``np.nan`` for ``object`` columns. (:issue:`12815`) - :class:`IntervalIndex` constructor will raise if the ``closed`` parameter conflicts with how the input data is inferred to be closed (:issue:`18421`) - Inserting missing values into indexes will work for all types of indexes and automatically insert the correct type of missing value (``NaN``, ``NaT``, etc.) regardless of the type passed in (:issue:`18295`) - Restricted ``DateOffset`` keyword arguments. Previously, ``DateOffset`` subclasses allowed arbitrary keyword arguments which could lead to unexpected behavior. Now, only valid arguments will be accepted. (:issue:`17176`, :issue:`18226`). @@ -620,7 +620,7 @@ Removal of prior version deprecations/changes - The ``pandas.io.wb`` and ``pandas.io.data`` stub modules have been removed (:issue:`13735`) - ``Categorical.from_array`` has been removed (:issue:`13854`) - The ``freq`` and ``how`` parameters have been removed from the ``rolling``/``expanding``/``ewm`` methods of DataFrame - and Series (deprecated since v0.18). Instead, resample before calling the methods. (:issue:18601 & :issue:18668) + and Series (deprecated since v0.18). Instead, resample before calling the methods. 
(:issue:`18601` & :issue:`18668`) - ``DatetimeIndex.to_datetime``, ``Timestamp.to_datetime``, ``PeriodIndex.to_datetime``, and ``Index.to_datetime`` have been removed (:issue:`8254`, :issue:`14096`, :issue:`14113`) - :func:`read_csv` has dropped the ``skip_footer`` parameter (:issue:`13386`) - :func:`read_csv` has dropped the ``as_recarray`` parameter (:issue:`13373`) @@ -631,7 +631,7 @@ Removal of prior version deprecations/changes - ``pandas.tseries.frequencies.get_standard_freq`` has been removed in favor of ``pandas.tseries.frequencies.to_offset(freq).rule_code`` (:issue:`13874`) - The ``freqstr`` keyword has been removed from ``pandas.tseries.frequencies.to_offset`` in favor of ``freq`` (:issue:`13874`) - The ``Panel4D`` and ``PanelND`` classes have been removed (:issue:`13776`) -- The ``Panel``class has dropped the ``to_long``and ``toLong`` methods (:issue:`19077`) +- The ``Panel`` class has dropped the ``to_long``and ``toLong`` methods (:issue:`19077`) - The options ``display.line_with`` and ``display.height`` are removed in favor of ``display.width`` and ``display.max_rows`` respectively (:issue:`4391`, :issue:`19107`) - The ``labels`` attribute of the ``Categorical`` class has been removed in favor of :attribute:`Categorical.codes` (:issue:`7768`) - The ``flavor`` parameter have been removed from func:`to_sql` method (:issue:`13611`) @@ -672,7 +672,7 @@ Documentation Changes Rewrote some sentences for greater clarity, added more dynamic references to functions, methods and classes. (:issue:`18941`, :issue:`18948`, :issue:`18973`, :issue:`19017`) -- + .. _whatsnew_0230.bug_fixes: @@ -704,7 +704,7 @@ Categorical ``self`` but in a different order (:issue:`19551`) - Bug in :meth:`Index.astype` with a categorical dtype where the resultant index is not converted to a :class:`CategoricalIndex` for all types of index (:issue:`18630`) - Bug in :meth:`Series.astype` and ``Categorical.astype()`` where an existing categorical data does not get updated (:issue:`10696`, :issue:`18593`) -- Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19032`) +- Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (:issue:`19032`) Datetimelike ^^^^^^^^^^^^ @@ -718,7 +718,7 @@ Datetimelike - Bug in :class:`DatetimeIndex` and :class:`TimedeltaIndex` where adding or subtracting an array-like of ``DateOffset`` objects either raised (``np.array``, ``pd.Index``) or broadcast incorrectly (``pd.Series``) (:issue:`18849`) - Bug in :class:`Series` floor-division where operating on a scalar ``timedelta`` raises an exception (:issue:`18846`) - Bug in :class:`Series`` with ``dtype='timedelta64[ns]`` where addition or subtraction of ``TimedeltaIndex`` had results cast to ``dtype='int64'`` (:issue:`17250`) -- Bug in :class:`TimedeltaIndex` where division by a ``Series`` would return a ``TimedeltaIndex`` instead of a ``Series`` (issue:`19042`) +- Bug in :class:`TimedeltaIndex` where division by a ``Series`` would return a ``TimedeltaIndex`` instead of a ``Series`` (:issue:`19042`) - Bug in :class:`Series` with ``dtype='timedelta64[ns]`` where addition or subtraction of ``TimedeltaIndex`` could return a ``Series`` with an incorrect name (:issue:`19043`) - Bug in :class:`DatetimeIndex` where the repr was not showing high-precision time values at the end of a day (e.g., 23:59:59.999999999) (:issue:`19030`) - Bug where dividing a scalar timedelta-like object with 
:class:`TimedeltaIndex` performed the reciprocal operation (:issue:`19125`) @@ -732,7 +732,7 @@ Datetimelike - Bug in :func:`to_datetime` where passing an out-of-bounds datetime with ``errors='coerce'`` and ``utc=True`` would raise ``OutOfBoundsDatetime`` instead of parsing to ``NaT`` (:issue:`19612`) - Bug in :func:`Timedelta.__add__`, :func:`Timedelta.__sub__` where adding or subtracting a ``np.timedelta64`` object would return another ``np.timedelta64`` instead of a ``Timedelta`` (:issue:`19738`) - Bug in :func:`Timedelta.__floordiv__`, :func:`Timedelta.__rfloordiv__` where operating with a ``Tick`` object would raise a ``TypeError`` instead of returning a numeric value (:issue:`19738`) -- + Timezones ^^^^^^^^^ @@ -791,11 +791,11 @@ MultiIndex - Bug in :func:`MultiIndex.set_labels` which would cause casting (and potentially clipping) of the new labels if the ``level`` argument is not 0 or a list like [0, 1, ... ] (:issue:`19057`) - Bug in :func:`MultiIndex.get_level_values` which would return an invalid index on level of ints with missing values (:issue:`17924`) - Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`) -- Bug in :func:`MultiIndex.from_tuples`` which would fail to take zipped tuples in python3 (:issue:`18434`) -- Bug in :func:`MultiIndex.get_loc`` which would fail to automatically cast values between float and int (:issue:`18818`, :issue:`15994`) -- Bug in :func:`MultiIndex.get_loc`` which would cast boolean to integer labels (:issue:`19086`) -- Bug in :func:`MultiIndex.get_loc`` which would fail to locate keys containing ``NaN`` (:issue:`18485`) -- Bug in :func:`MultiIndex.get_loc`` in large :class:`MultiIndex`, would fail when levels had different dtypes (:issue:`18520`) +- Bug in :func:`MultiIndex.from_tuples` which would fail to take zipped tuples in python3 (:issue:`18434`) +- Bug in :func:`MultiIndex.get_loc` which would fail to automatically cast values between float and int (:issue:`18818`, :issue:`15994`) +- Bug in :func:`MultiIndex.get_loc` which would cast boolean to integer labels (:issue:`19086`) +- Bug in :func:`MultiIndex.get_loc` which would fail to locate keys containing ``NaN`` (:issue:`18485`) +- Bug in :func:`MultiIndex.get_loc` in large :class:`MultiIndex`, would fail when levels had different dtypes (:issue:`18520`) I/O @@ -817,10 +817,10 @@ I/O Plotting ^^^^^^^^ -- :func: `DataFrame.plot` now raises a ``ValueError`` when the ``x`` or ``y`` argument is improperly formed (:issue:`18671`) +- :func:`DataFrame.plot` now raises a ``ValueError`` when the ``x`` or ``y`` argument is improperly formed (:issue:`18671`) - Bug in formatting tick labels with ``datetime.time()`` and fractional seconds (:issue:`18478`). - :meth:`Series.plot.kde` has exposed the args ``ind`` and ``bw_method`` in the docstring (:issue:`18461`). The argument ``ind`` may now also be an integer (number of sample points). 
-- + Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -860,10 +860,9 @@ Reshaping - Improved error message for :func:`DataFrame.merge` when there is no common merge key (:issue:`19427`) - Bug in :func:`DataFrame.join` which does an *outer* instead of a *left* join when being called with multiple DataFrames and some have non-unique indices (:issue:`19624`) - :func:`Series.rename` now accepts ``axis`` as a kwarg (:issue:`18589`) +- Comparisons between :class:`Series` and :class:`Index` would return a ``Series`` with an incorrect name, ignoring the ``Index``'s name attribute (:issue:`19582`) Other ^^^^^ - Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`) -- Comparisons between :class:`Series` and :class:`Index` would return a ``Series`` with an incorrect name, ignoring the ``Index``'s name attribute (:issue:`19582`) -- From c05f3c17d5a02884f5d9a32c1fb29947f7bc3d06 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 20 Feb 2018 10:20:55 +0100 Subject: [PATCH 141/214] DOC: fix various warnings and errors in the docs (from deprecations/api changes) (#19763) --- doc/source/advanced.rst | 2 +- doc/source/dsintro.rst | 2 +- doc/source/io.rst | 1 + doc/source/whatsnew/v0.10.0.txt | 26 +++++++++++++++-------- doc/source/whatsnew/v0.13.1.txt | 37 +++++++++++++++++++++++++-------- doc/source/whatsnew/v0.15.0.txt | 2 +- doc/source/whatsnew/v0.21.0.txt | 13 +++++------- doc/source/whatsnew/v0.23.0.txt | 1 + doc/source/whatsnew/v0.8.0.txt | 4 ++-- pandas/core/frame.py | 4 ++-- 10 files changed, 59 insertions(+), 33 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index c455fbb8d0687..c81842d3d9212 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -274,7 +274,7 @@ Passing a list of labels or tuples works similar to reindexing: df.loc[[('bar', 'two'), ('qux', 'one')]] -.. info:: +.. note:: It is important to note that tuples and lists are not treated identically in pandas when it comes to indexing. Whereas a tuple is interpreted as one diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index 78e2fdb46f659..582750b16f40d 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -506,7 +506,7 @@ to be inserted (for example, a ``Series`` or NumPy array), or a function of one argument to be called on the ``DataFrame``. A *copy* of the original DataFrame is returned, with the new values inserted. -.. versionmodified:: 0.23.0 +.. versionchanged:: 0.23.0 Starting with Python 3.6 the order of ``**kwargs`` is preserved. This allows for *dependent* assignment, where an expression later in ``**kwargs`` can refer diff --git a/doc/source/io.rst b/doc/source/io.rst index 7bb34e4d232dd..6120f7d25a0c3 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2262,6 +2262,7 @@ is not round-trippable, nor are any names beginning with 'level_' within a indicate missing values and the subsequent read cannot distinguish the intent. .. ipython:: python + :okwarning: df.index.name = 'index' df.to_json('test.json', orient='table') diff --git a/doc/source/whatsnew/v0.10.0.txt b/doc/source/whatsnew/v0.10.0.txt index a0c4a3e0073f9..222a2da23865c 100644 --- a/doc/source/whatsnew/v0.10.0.txt +++ b/doc/source/whatsnew/v0.10.0.txt @@ -411,15 +411,23 @@ N Dimensional Panels (Experimental) Adding experimental support for Panel4D and factory functions to create n-dimensional named panels. :ref:`Docs ` for NDim. Here is a taste of what to expect. - .. 
ipython:: python - :okwarning: - - p4d = Panel4D(randn(2, 2, 5, 4), - labels=['Label1','Label2'], - items=['Item1', 'Item2'], - major_axis=date_range('1/1/2000', periods=5), - minor_axis=['A', 'B', 'C', 'D']) - p4d +.. code-block:: ipython + + In [58]: p4d = Panel4D(randn(2, 2, 5, 4), + ....: labels=['Label1','Label2'], + ....: items=['Item1', 'Item2'], + ....: major_axis=date_range('1/1/2000', periods=5), + ....: minor_axis=['A', 'B', 'C', 'D']) + ....: + + In [59]: p4d + Out[59]: + + Dimensions: 2 (labels) x 2 (items) x 5 (major_axis) x 4 (minor_axis) + Labels axis: Label1 to Label2 + Items axis: Item1 to Item2 + Major_axis axis: 2000-01-01 00:00:00 to 2000-01-05 00:00:00 + Minor_axis axis: A to D diff --git a/doc/source/whatsnew/v0.13.1.txt b/doc/source/whatsnew/v0.13.1.txt index 5e5653945fefa..51ca6116d42ce 100644 --- a/doc/source/whatsnew/v0.13.1.txt +++ b/doc/source/whatsnew/v0.13.1.txt @@ -140,14 +140,21 @@ API changes applied would be called with an empty ``Series`` to guess whether a ``Series`` or ``DataFrame`` should be returned: - .. ipython:: python + .. code-block:: ipython + + In [32]: def applied_func(col): + ....: print("Apply function being called with: ", col) + ....: return col.sum() + ....: - def applied_func(col): - print("Apply function being called with: ", col) - return col.sum() + In [33]: empty = DataFrame(columns=['a', 'b']) - empty = DataFrame(columns=['a', 'b']) - empty.apply(applied_func) + In [34]: empty.apply(applied_func) + Apply function being called with: Series([], Length: 0, dtype: float64) + Out[34]: + a NaN + b NaN + Length: 2, dtype: float64 Now, when ``apply`` is called on an empty ``DataFrame``: if the ``reduce`` argument is ``True`` a ``Series`` will returned, if it is ``False`` a @@ -155,10 +162,22 @@ API changes function being applied will be called with an empty series to try and guess the return type. - .. ipython:: python + .. code-block:: ipython + + In [35]: empty.apply(applied_func, reduce=True) + Out[35]: + a NaN + b NaN + Length: 2, dtype: float64 + + In [36]: empty.apply(applied_func, reduce=False) + Out[36]: + Empty DataFrame + Columns: [a, b] + Index: [] + + [0 rows x 2 columns] - empty.apply(applied_func, reduce=True) - empty.apply(applied_func, reduce=False) Prior Version Deprecations/Changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.15.0.txt b/doc/source/whatsnew/v0.15.0.txt index ef17904d5ab1a..c5ef6c8c9d74a 100644 --- a/doc/source/whatsnew/v0.15.0.txt +++ b/doc/source/whatsnew/v0.15.0.txt @@ -1044,7 +1044,7 @@ Other: idx = MultiIndex.from_product([['a'], range(3), list("pqr")], names=['foo', 'bar', 'baz']) idx.set_names('qux', level=0) - idx.set_names(['qux','baz'], level=[0,1]) + idx.set_names(['qux','corge'], level=[0,1]) idx.set_levels(['a','b','c'], level='bar') idx.set_levels([['a','b','c'],[1,2,3]], level=[1,2]) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 3e673bd4cbc28..0c2e494f29bc1 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -894,17 +894,14 @@ imported. Matplotlib plot methods (``plt.plot``, ``ax.plot``, ...), will not nicely format the x-axis for ``DatetimeIndex`` or ``PeriodIndex`` values. You must explicitly register these methods: -.. 
ipython:: python - - from pandas.tseries import converter - converter.register() - - fig, ax = plt.subplots() - plt.plot(pd.date_range('2017', periods=6), range(6)) - Pandas built-in ``Series.plot`` and ``DataFrame.plot`` *will* register these converters on first-use (:issue:17710). +.. note:: + + This change has been temporarily reverted in pandas 0.21.1, + for more details see :ref:`here `. + .. _whatsnew_0210.api: Other API Changes diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 7bd47c7172671..f31d0a5a0667c 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -211,6 +211,7 @@ A ``DataFrame`` can now be written to and subsequently read back via JSON while Please note that the string `index` is not supported with the round trip format, as it is used by default in ``write_json`` to indicate a missing index name. .. ipython:: python + :okwarning: df.index.name = 'index' df.to_json('test.json', orient='table') diff --git a/doc/source/whatsnew/v0.8.0.txt b/doc/source/whatsnew/v0.8.0.txt index b9cece752981e..b2d1d16e86990 100644 --- a/doc/source/whatsnew/v0.8.0.txt +++ b/doc/source/whatsnew/v0.8.0.txt @@ -217,12 +217,12 @@ nanosecond support (the ``nanosecond`` field store the nanosecond value between ``DatetimeIndex`` to regular NumPy arrays. If you have code that requires an array of ``datetime.datetime`` objects, you -have a couple of options. First, the ``asobject`` property of ``DatetimeIndex`` +have a couple of options. First, the ``astype(object)`` method of ``DatetimeIndex`` produces an array of ``Timestamp`` objects: .. ipython:: python - stamp_array = rng.asobject + stamp_array = rng.astype(object) stamp_array stamp_array[5] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b96af6af3707f..0b315a7c6f031 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -114,7 +114,7 @@ - if `axis` is 1 or `'columns'` then `by` may contain column levels and/or index labels - .. versionmodified:: 0.23.0 + .. versionchanged:: 0.23.0 Allow specifying index or column level names.""", versionadded_to_excel='', optional_labels="""labels : array-like, optional @@ -2696,7 +2696,7 @@ def assign(self, **kwargs): or modified columns. All items are computed first, and then assigned in alphabetical order. - .. versionmodified :: 0.23.0 + .. versionchanged :: 0.23.0 Keyword argument order is maintained for Python 3.6 and later. 
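The ``versionchanged`` note above describes dependent assignment: because ``**kwargs`` order is preserved on Python 3.6 and later, a later keyword passed to ``DataFrame.assign`` can refer to a column created by an earlier one in the same call. A small sketch, assuming pandas 0.23+ running on Python 3.6+:

    import pandas as pd

    df = pd.DataFrame({'A': [1, 2, 3]})
    # 'C' can reference the freshly created 'B' because keyword order is
    # preserved; on Python < 3.6 this ordering is not guaranteed
    result = df.assign(B=lambda d: d['A'] * 2,
                       C=lambda d: d['B'] + 1)
    # result has columns A, B, C with C == B + 1
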
From 1cf0c3a56a972b7b35ff8b4af82aa18de30696b6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 20 Feb 2018 03:16:29 -0800 Subject: [PATCH 142/214] Split+Parametrize Timedelta tests (#19736) --- .../scalar/timedelta/test_construction.py | 222 +++++++++++++++ pandas/tests/scalar/timedelta/test_formats.py | 48 ++++ .../tests/scalar/timedelta/test_timedelta.py | 254 ------------------ 3 files changed, 270 insertions(+), 254 deletions(-) create mode 100644 pandas/tests/scalar/timedelta/test_construction.py create mode 100644 pandas/tests/scalar/timedelta/test_formats.py diff --git a/pandas/tests/scalar/timedelta/test_construction.py b/pandas/tests/scalar/timedelta/test_construction.py new file mode 100644 index 0000000000000..5ccad9e6b4e3c --- /dev/null +++ b/pandas/tests/scalar/timedelta/test_construction.py @@ -0,0 +1,222 @@ +# -*- coding: utf-8 -*- +from datetime import timedelta + +import pytest +import numpy as np + +import pandas as pd +import pandas.util.testing as tm +from pandas import Timedelta + + +def test_construction(): + expected = np.timedelta64(10, 'D').astype('m8[ns]').view('i8') + assert Timedelta(10, unit='d').value == expected + assert Timedelta(10.0, unit='d').value == expected + assert Timedelta('10 days').value == expected + assert Timedelta(days=10).value == expected + assert Timedelta(days=10.0).value == expected + + expected += np.timedelta64(10, 's').astype('m8[ns]').view('i8') + assert Timedelta('10 days 00:00:10').value == expected + assert Timedelta(days=10, seconds=10).value == expected + assert Timedelta(days=10, milliseconds=10 * 1000).value == expected + assert Timedelta(days=10, + microseconds=10 * 1000 * 1000).value == expected + + # rounding cases + assert Timedelta(82739999850000).value == 82739999850000 + assert ('0 days 22:58:59.999850' in str(Timedelta(82739999850000))) + assert Timedelta(123072001000000).value == 123072001000000 + assert ('1 days 10:11:12.001' in str(Timedelta(123072001000000))) + + # string conversion with/without leading zero + # GH#9570 + assert Timedelta('0:00:00') == timedelta(hours=0) + assert Timedelta('00:00:00') == timedelta(hours=0) + assert Timedelta('-1:00:00') == -timedelta(hours=1) + assert Timedelta('-01:00:00') == -timedelta(hours=1) + + # more strings & abbrevs + # GH#8190 + assert Timedelta('1 h') == timedelta(hours=1) + assert Timedelta('1 hour') == timedelta(hours=1) + assert Timedelta('1 hr') == timedelta(hours=1) + assert Timedelta('1 hours') == timedelta(hours=1) + assert Timedelta('-1 hours') == -timedelta(hours=1) + assert Timedelta('1 m') == timedelta(minutes=1) + assert Timedelta('1.5 m') == timedelta(seconds=90) + assert Timedelta('1 minute') == timedelta(minutes=1) + assert Timedelta('1 minutes') == timedelta(minutes=1) + assert Timedelta('1 s') == timedelta(seconds=1) + assert Timedelta('1 second') == timedelta(seconds=1) + assert Timedelta('1 seconds') == timedelta(seconds=1) + assert Timedelta('1 ms') == timedelta(milliseconds=1) + assert Timedelta('1 milli') == timedelta(milliseconds=1) + assert Timedelta('1 millisecond') == timedelta(milliseconds=1) + assert Timedelta('1 us') == timedelta(microseconds=1) + assert Timedelta('1 micros') == timedelta(microseconds=1) + assert Timedelta('1 microsecond') == timedelta(microseconds=1) + assert Timedelta('1.5 microsecond') == Timedelta('00:00:00.000001500') + assert Timedelta('1 ns') == Timedelta('00:00:00.000000001') + assert Timedelta('1 nano') == Timedelta('00:00:00.000000001') + assert Timedelta('1 nanosecond') == Timedelta('00:00:00.000000001') + + # 
combos + assert Timedelta('10 days 1 hour') == timedelta(days=10, hours=1) + assert Timedelta('10 days 1 h') == timedelta(days=10, hours=1) + assert Timedelta('10 days 1 h 1m 1s') == timedelta( + days=10, hours=1, minutes=1, seconds=1) + assert Timedelta('-10 days 1 h 1m 1s') == -timedelta( + days=10, hours=1, minutes=1, seconds=1) + assert Timedelta('-10 days 1 h 1m 1s') == -timedelta( + days=10, hours=1, minutes=1, seconds=1) + assert Timedelta('-10 days 1 h 1m 1s 3us') == -timedelta( + days=10, hours=1, minutes=1, seconds=1, microseconds=3) + assert Timedelta('-10 days 1 h 1.5m 1s 3us') == -timedelta( + days=10, hours=1, minutes=1, seconds=31, microseconds=3) + + # Currently invalid as it has a - on the hh:mm:dd part + # (only allowed on the days) + with pytest.raises(ValueError): + Timedelta('-10 days -1 h 1.5m 1s 3us') + + # only leading neg signs are allowed + with pytest.raises(ValueError): + Timedelta('10 days -1 h 1.5m 1s 3us') + + # no units specified + with pytest.raises(ValueError): + Timedelta('3.1415') + + # invalid construction + tm.assert_raises_regex(ValueError, "cannot construct a Timedelta", + lambda: Timedelta()) + tm.assert_raises_regex(ValueError, + "unit abbreviation w/o a number", + lambda: Timedelta('foo')) + tm.assert_raises_regex(ValueError, + "cannot construct a Timedelta from the " + "passed arguments, allowed keywords are ", + lambda: Timedelta(day=10)) + + # floats + expected = np.timedelta64( + 10, 's').astype('m8[ns]').view('i8') + np.timedelta64( + 500, 'ms').astype('m8[ns]').view('i8') + assert Timedelta(10.5, unit='s').value == expected + + # offset + assert pd.to_timedelta(pd.offsets.Hour(2)) == Timedelta(hours=2) + assert Timedelta(pd.offsets.Hour(2)) == Timedelta(hours=2) + assert Timedelta(pd.offsets.Second(2)) == Timedelta(seconds=2) + + # GH#11995: unicode + expected = Timedelta('1H') + result = pd.Timedelta(u'1H') + assert result == expected + assert (pd.to_timedelta(pd.offsets.Hour(2)) == + Timedelta(u'0 days, 02:00:00')) + + with pytest.raises(ValueError): + Timedelta(u'foo bar') + + +@pytest.mark.parametrize('item', list({'days': 'D', + 'seconds': 's', + 'microseconds': 'us', + 'milliseconds': 'ms', + 'minutes': 'm', + 'hours': 'h', + 'weeks': 'W'}.items())) +@pytest.mark.parametrize('npdtype', [np.int64, np.int32, np.int16, + np.float64, np.float32, np.float16]) +def test_td_construction_with_np_dtypes(npdtype, item): + # GH#8757: test construction with np dtypes + pykwarg, npkwarg = item + expected = np.timedelta64(1, npkwarg).astype('m8[ns]').view('i8') + assert Timedelta(**{pykwarg: npdtype(1)}).value == expected + + +@pytest.mark.parametrize('val', [ + '1s', '-1s', '1us', '-1us', '1 day', '-1 day', + '-23:59:59.999999', '-1 days +23:59:59.999999', '-1ns', + '1ns', '-23:59:59.999999999']) +def test_td_from_repr_roundtrip(val): + # round-trip both for string and value + td = Timedelta(val) + assert Timedelta(td.value) == td + + # str does not normally display nanos + if not td.nanoseconds: + assert Timedelta(str(td)) == td + assert Timedelta(td._repr_base(format='all')) == td + + +def test_overflow_on_construction(): + # xref https://github.com/statsmodels/statsmodels/issues/3374 + value = pd.Timedelta('1day').value * 20169940 + with pytest.raises(OverflowError): + pd.Timedelta(value) + + # xref GH#17637 + with pytest.raises(OverflowError): + pd.Timedelta(7 * 19999, unit='D') + + with pytest.raises(OverflowError): + pd.Timedelta(timedelta(days=13 * 19999)) + + +@pytest.mark.parametrize('fmt,exp', [ + ('P6DT0H50M3.010010012S', 
Timedelta(days=6, minutes=50, seconds=3, + milliseconds=10, microseconds=10, + nanoseconds=12)), + ('P-6DT0H50M3.010010012S', Timedelta(days=-6, minutes=50, seconds=3, + milliseconds=10, microseconds=10, + nanoseconds=12)), + ('P4DT12H30M5S', Timedelta(days=4, hours=12, minutes=30, seconds=5)), + ('P0DT0H0M0.000000123S', Timedelta(nanoseconds=123)), + ('P0DT0H0M0.00001S', Timedelta(microseconds=10)), + ('P0DT0H0M0.001S', Timedelta(milliseconds=1)), + ('P0DT0H1M0S', Timedelta(minutes=1)), + ('P1DT25H61M61S', Timedelta(days=1, hours=25, minutes=61, seconds=61)) +]) +def test_iso_constructor(fmt, exp): + assert Timedelta(fmt) == exp + + +@pytest.mark.parametrize('fmt', [ + 'PPPPPPPPPPPP', 'PDTHMS', 'P0DT999H999M999S', + 'P1DT0H0M0.0000000000000S', 'P1DT0H0M00000000000S', + 'P1DT0H0M0.S']) +def test_iso_constructor_raises(fmt): + with tm.assert_raises_regex(ValueError, 'Invalid ISO 8601 Duration ' + 'format - {}'.format(fmt)): + Timedelta(fmt) + + +def test_td_constructor_on_nanoseconds(): + # GH#9273 + result = Timedelta(nanoseconds=100) + expected = Timedelta('100ns') + assert result == expected + + result = Timedelta(days=1, hours=1, minutes=1, weeks=1, seconds=1, + milliseconds=1, microseconds=1, nanoseconds=1) + expected = Timedelta(694861001001001) + assert result == expected + + result = Timedelta(microseconds=1) + Timedelta(nanoseconds=1) + expected = Timedelta('1us1ns') + assert result == expected + + result = Timedelta(microseconds=1) - Timedelta(nanoseconds=1) + expected = Timedelta('999ns') + assert result == expected + + result = Timedelta(microseconds=1) + 5 * Timedelta(nanoseconds=-2) + expected = Timedelta('990ns') + assert result == expected + + with pytest.raises(TypeError): + Timedelta(nanoseconds='abc') diff --git a/pandas/tests/scalar/timedelta/test_formats.py b/pandas/tests/scalar/timedelta/test_formats.py new file mode 100644 index 0000000000000..8a877c7d1c0fa --- /dev/null +++ b/pandas/tests/scalar/timedelta/test_formats.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +from pandas import Timedelta + + +def test_repr(): + assert (repr(Timedelta(10, unit='d')) == + "Timedelta('10 days 00:00:00')") + assert (repr(Timedelta(10, unit='s')) == + "Timedelta('0 days 00:00:10')") + assert (repr(Timedelta(10, unit='ms')) == + "Timedelta('0 days 00:00:00.010000')") + assert (repr(Timedelta(-10, unit='ms')) == + "Timedelta('-1 days +23:59:59.990000')") + + +def test_isoformat(): + td = Timedelta(days=6, minutes=50, seconds=3, + milliseconds=10, microseconds=10, nanoseconds=12) + expected = 'P6DT0H50M3.010010012S' + result = td.isoformat() + assert result == expected + + td = Timedelta(days=4, hours=12, minutes=30, seconds=5) + result = td.isoformat() + expected = 'P4DT12H30M5S' + assert result == expected + + td = Timedelta(nanoseconds=123) + result = td.isoformat() + expected = 'P0DT0H0M0.000000123S' + assert result == expected + + # trim nano + td = Timedelta(microseconds=10) + result = td.isoformat() + expected = 'P0DT0H0M0.00001S' + assert result == expected + + # trim micro + td = Timedelta(milliseconds=1) + result = td.isoformat() + expected = 'P0DT0H0M0.001S' + assert result == expected + + # don't strip every 0 + result = Timedelta(minutes=1).isoformat() + expected = 'P0DT0H1M0S' + assert result == expected diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 420b66b4ce0dc..0f7fb84c6520b 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -36,31 
+36,6 @@ def test_ops_error_str(self): assert not left == right assert left != right - def test_to_timedelta_on_nanoseconds(self): - # GH 9273 - result = Timedelta(nanoseconds=100) - expected = Timedelta('100ns') - assert result == expected - - result = Timedelta(days=1, hours=1, minutes=1, weeks=1, seconds=1, - milliseconds=1, microseconds=1, nanoseconds=1) - expected = Timedelta(694861001001001) - assert result == expected - - result = Timedelta(microseconds=1) + Timedelta(nanoseconds=1) - expected = Timedelta('1us1ns') - assert result == expected - - result = Timedelta(microseconds=1) - Timedelta(nanoseconds=1) - expected = Timedelta('999ns') - assert result == expected - - result = Timedelta(microseconds=1) + 5 * Timedelta(nanoseconds=-2) - expected = Timedelta('990ns') - assert result == expected - - pytest.raises(TypeError, lambda: Timedelta(nanoseconds='abc')) - def test_ops_notimplemented(self): class Other: pass @@ -124,164 +99,6 @@ def test_compare_timedelta_ndarray(self): class TestTimedeltas(object): - def setup_method(self, method): - pass - - def test_construction(self): - - expected = np.timedelta64(10, 'D').astype('m8[ns]').view('i8') - assert Timedelta(10, unit='d').value == expected - assert Timedelta(10.0, unit='d').value == expected - assert Timedelta('10 days').value == expected - assert Timedelta(days=10).value == expected - assert Timedelta(days=10.0).value == expected - - expected += np.timedelta64(10, 's').astype('m8[ns]').view('i8') - assert Timedelta('10 days 00:00:10').value == expected - assert Timedelta(days=10, seconds=10).value == expected - assert Timedelta(days=10, milliseconds=10 * 1000).value == expected - assert (Timedelta(days=10, microseconds=10 * 1000 * 1000) - .value == expected) - - # gh-8757: test construction with np dtypes - timedelta_kwargs = {'days': 'D', - 'seconds': 's', - 'microseconds': 'us', - 'milliseconds': 'ms', - 'minutes': 'm', - 'hours': 'h', - 'weeks': 'W'} - npdtypes = [np.int64, np.int32, np.int16, np.float64, np.float32, - np.float16] - for npdtype in npdtypes: - for pykwarg, npkwarg in timedelta_kwargs.items(): - expected = np.timedelta64(1, npkwarg).astype( - 'm8[ns]').view('i8') - assert Timedelta(**{pykwarg: npdtype(1)}).value == expected - - # rounding cases - assert Timedelta(82739999850000).value == 82739999850000 - assert ('0 days 22:58:59.999850' in str(Timedelta(82739999850000))) - assert Timedelta(123072001000000).value == 123072001000000 - assert ('1 days 10:11:12.001' in str(Timedelta(123072001000000))) - - # string conversion with/without leading zero - # GH 9570 - assert Timedelta('0:00:00') == timedelta(hours=0) - assert Timedelta('00:00:00') == timedelta(hours=0) - assert Timedelta('-1:00:00') == -timedelta(hours=1) - assert Timedelta('-01:00:00') == -timedelta(hours=1) - - # more strings & abbrevs - # GH 8190 - assert Timedelta('1 h') == timedelta(hours=1) - assert Timedelta('1 hour') == timedelta(hours=1) - assert Timedelta('1 hr') == timedelta(hours=1) - assert Timedelta('1 hours') == timedelta(hours=1) - assert Timedelta('-1 hours') == -timedelta(hours=1) - assert Timedelta('1 m') == timedelta(minutes=1) - assert Timedelta('1.5 m') == timedelta(seconds=90) - assert Timedelta('1 minute') == timedelta(minutes=1) - assert Timedelta('1 minutes') == timedelta(minutes=1) - assert Timedelta('1 s') == timedelta(seconds=1) - assert Timedelta('1 second') == timedelta(seconds=1) - assert Timedelta('1 seconds') == timedelta(seconds=1) - assert Timedelta('1 ms') == timedelta(milliseconds=1) - assert Timedelta('1 milli') 
== timedelta(milliseconds=1) - assert Timedelta('1 millisecond') == timedelta(milliseconds=1) - assert Timedelta('1 us') == timedelta(microseconds=1) - assert Timedelta('1 micros') == timedelta(microseconds=1) - assert Timedelta('1 microsecond') == timedelta(microseconds=1) - assert Timedelta('1.5 microsecond') == Timedelta('00:00:00.000001500') - assert Timedelta('1 ns') == Timedelta('00:00:00.000000001') - assert Timedelta('1 nano') == Timedelta('00:00:00.000000001') - assert Timedelta('1 nanosecond') == Timedelta('00:00:00.000000001') - - # combos - assert Timedelta('10 days 1 hour') == timedelta(days=10, hours=1) - assert Timedelta('10 days 1 h') == timedelta(days=10, hours=1) - assert Timedelta('10 days 1 h 1m 1s') == timedelta( - days=10, hours=1, minutes=1, seconds=1) - assert Timedelta('-10 days 1 h 1m 1s') == -timedelta( - days=10, hours=1, minutes=1, seconds=1) - assert Timedelta('-10 days 1 h 1m 1s') == -timedelta( - days=10, hours=1, minutes=1, seconds=1) - assert Timedelta('-10 days 1 h 1m 1s 3us') == -timedelta( - days=10, hours=1, minutes=1, seconds=1, microseconds=3) - assert Timedelta('-10 days 1 h 1.5m 1s 3us'), -timedelta( - days=10, hours=1, minutes=1, seconds=31, microseconds=3) - - # Currently invalid as it has a - on the hh:mm:dd part - # (only allowed on the days) - pytest.raises(ValueError, - lambda: Timedelta('-10 days -1 h 1.5m 1s 3us')) - - # only leading neg signs are allowed - pytest.raises(ValueError, - lambda: Timedelta('10 days -1 h 1.5m 1s 3us')) - - # no units specified - pytest.raises(ValueError, lambda: Timedelta('3.1415')) - - # invalid construction - tm.assert_raises_regex(ValueError, "cannot construct a Timedelta", - lambda: Timedelta()) - tm.assert_raises_regex(ValueError, - "unit abbreviation w/o a number", - lambda: Timedelta('foo')) - tm.assert_raises_regex(ValueError, - "cannot construct a Timedelta from the " - "passed arguments, allowed keywords are ", - lambda: Timedelta(day=10)) - - # round-trip both for string and value - for v in ['1s', '-1s', '1us', '-1us', '1 day', '-1 day', - '-23:59:59.999999', '-1 days +23:59:59.999999', '-1ns', - '1ns', '-23:59:59.999999999']: - - td = Timedelta(v) - assert Timedelta(td.value) == td - - # str does not normally display nanos - if not td.nanoseconds: - assert Timedelta(str(td)) == td - assert Timedelta(td._repr_base(format='all')) == td - - # floats - expected = np.timedelta64( - 10, 's').astype('m8[ns]').view('i8') + np.timedelta64( - 500, 'ms').astype('m8[ns]').view('i8') - assert Timedelta(10.5, unit='s').value == expected - - # offset - assert (to_timedelta(pd.offsets.Hour(2)) == - Timedelta('0 days, 02:00:00')) - assert (Timedelta(pd.offsets.Hour(2)) == - Timedelta('0 days, 02:00:00')) - assert (Timedelta(pd.offsets.Second(2)) == - Timedelta('0 days, 00:00:02')) - - # gh-11995: unicode - expected = Timedelta('1H') - result = pd.Timedelta(u'1H') - assert result == expected - assert (to_timedelta(pd.offsets.Hour(2)) == - Timedelta(u'0 days, 02:00:00')) - - pytest.raises(ValueError, lambda: Timedelta(u'foo bar')) - - def test_overflow_on_construction(self): - # xref https://github.com/statsmodels/statsmodels/issues/3374 - value = pd.Timedelta('1day').value * 20169940 - pytest.raises(OverflowError, pd.Timedelta, value) - - # xref gh-17637 - with pytest.raises(OverflowError): - pd.Timedelta(7 * 19999, unit='D') - - with pytest.raises(OverflowError): - pd.Timedelta(timedelta(days=13 * 19999)) - def test_total_seconds_scalar(self): # see gh-10939 rng = Timedelta('1 days, 10:11:12.100123456') @@ 
-291,17 +108,6 @@ def test_total_seconds_scalar(self): rng = Timedelta(np.nan) assert np.isnan(rng.total_seconds()) - def test_repr(self): - - assert (repr(Timedelta(10, unit='d')) == - "Timedelta('10 days 00:00:00')") - assert (repr(Timedelta(10, unit='s')) == - "Timedelta('0 days 00:00:10')") - assert (repr(Timedelta(10, unit='ms')) == - "Timedelta('0 days 00:00:00.010000')") - assert (repr(Timedelta(-10, unit='ms')) == - "Timedelta('-1 days +23:59:59.990000')") - def test_conversion(self): for td in [Timedelta(10, unit='d'), @@ -756,63 +562,3 @@ def test_components(self): result = s.dt.components assert not result.iloc[0].isna().all() assert result.iloc[1].isna().all() - - def test_isoformat(self): - td = Timedelta(days=6, minutes=50, seconds=3, - milliseconds=10, microseconds=10, nanoseconds=12) - expected = 'P6DT0H50M3.010010012S' - result = td.isoformat() - assert result == expected - - td = Timedelta(days=4, hours=12, minutes=30, seconds=5) - result = td.isoformat() - expected = 'P4DT12H30M5S' - assert result == expected - - td = Timedelta(nanoseconds=123) - result = td.isoformat() - expected = 'P0DT0H0M0.000000123S' - assert result == expected - - # trim nano - td = Timedelta(microseconds=10) - result = td.isoformat() - expected = 'P0DT0H0M0.00001S' - assert result == expected - - # trim micro - td = Timedelta(milliseconds=1) - result = td.isoformat() - expected = 'P0DT0H0M0.001S' - assert result == expected - - # don't strip every 0 - result = Timedelta(minutes=1).isoformat() - expected = 'P0DT0H1M0S' - assert result == expected - - @pytest.mark.parametrize('fmt,exp', [ - ('P6DT0H50M3.010010012S', Timedelta(days=6, minutes=50, seconds=3, - milliseconds=10, microseconds=10, - nanoseconds=12)), - ('P-6DT0H50M3.010010012S', Timedelta(days=-6, minutes=50, seconds=3, - milliseconds=10, microseconds=10, - nanoseconds=12)), - ('P4DT12H30M5S', Timedelta(days=4, hours=12, minutes=30, seconds=5)), - ('P0DT0H0M0.000000123S', Timedelta(nanoseconds=123)), - ('P0DT0H0M0.00001S', Timedelta(microseconds=10)), - ('P0DT0H0M0.001S', Timedelta(milliseconds=1)), - ('P0DT0H1M0S', Timedelta(minutes=1)), - ('P1DT25H61M61S', Timedelta(days=1, hours=25, minutes=61, seconds=61)) - ]) - def test_iso_constructor(self, fmt, exp): - assert Timedelta(fmt) == exp - - @pytest.mark.parametrize('fmt', [ - 'PPPPPPPPPPPP', 'PDTHMS', 'P0DT999H999M999S', - 'P1DT0H0M0.0000000000000S', 'P1DT0H0M00000000000S', - 'P1DT0H0M0.S']) - def test_iso_constructor_raises(self, fmt): - with tm.assert_raises_regex(ValueError, 'Invalid ISO 8601 Duration ' - 'format - {}'.format(fmt)): - Timedelta(fmt) From 1de47fb6f2757cc792b7f260052bbe1fa4493b40 Mon Sep 17 00:00:00 2001 From: Mike Kutzma Date: Tue, 20 Feb 2018 06:20:46 -0500 Subject: [PATCH 143/214] BUG: GH19458 fixes precision issue in TimeDelta.total_seconds() (#19783) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/_libs/tslibs/timedeltas.pyx | 2 +- pandas/tests/scalar/timedelta/test_timedelta.py | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index f31d0a5a0667c..349d7607559c5 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -733,6 +733,7 @@ Datetimelike - Bug in :func:`to_datetime` where passing an out-of-bounds datetime with ``errors='coerce'`` and ``utc=True`` would raise ``OutOfBoundsDatetime`` instead of parsing to ``NaT`` (:issue:`19612`) - Bug in :func:`Timedelta.__add__`, :func:`Timedelta.__sub__` where adding or subtracting a 
``np.timedelta64`` object would return another ``np.timedelta64`` instead of a ``Timedelta`` (:issue:`19738`) - Bug in :func:`Timedelta.__floordiv__`, :func:`Timedelta.__rfloordiv__` where operating with a ``Tick`` object would raise a ``TypeError`` instead of returning a numeric value (:issue:`19738`) +- Bug in :func:`Timedelta.total_seconds()` causing precision errors i.e. `Timedelta('30S').total_seconds()==30.000000000000004` (:issue:`19458`) Timezones diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 4483225e1801d..78fdeb988e0f2 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -739,7 +739,7 @@ cdef class _Timedelta(timedelta): """ Total duration of timedelta in seconds (to ns precision) """ - return 1e-9 * self.value + return self.value / 1e9 def view(self, dtype): """ array view compat """ diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 0f7fb84c6520b..4257c610fb960 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -512,6 +512,15 @@ def test_implementation_limits(self): with pytest.raises(OverflowError): Timedelta(max_td.value + 1, 'ns') + def test_total_seconds_precision(self): + # GH 19458 + assert Timedelta('30S').total_seconds() == 30.0 + assert Timedelta('0').total_seconds() == 0.0 + assert Timedelta('-2S').total_seconds() == -2.0 + assert Timedelta('5.324S').total_seconds() == 5.324 + assert (Timedelta('30S').total_seconds() - 30.0) < 1e-20 + assert (30.0 - Timedelta('30S').total_seconds()) < 1e-20 + def test_timedelta_arithmetic(self): data = pd.Series(['nat', '32 days'], dtype='timedelta64[ns]') deltas = [timedelta(days=1), Timedelta(1, unit='D')] From 8bfcddc7728deaf8e840416d83c8feda86630d27 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 20 Feb 2018 06:34:52 -0500 Subject: [PATCH 144/214] DOC: whatsnew cleanups --- doc/source/whatsnew/v0.23.0.txt | 155 +++++++++++++++++--------------- 1 file changed, 82 insertions(+), 73 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 349d7607559c5..ed50596843272 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -219,50 +219,6 @@ Please note that the string `index` is not supported with the round trip format, new_df print(new_df.index.name) -.. _whatsnew_0230.enhancements.index_division_by_zero: - -Index Division By Zero Fills Correctly -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Division operations on ``Index`` and subclasses will now fill division of positive numbers by zero with ``np.inf``, division of negative numbers by zero with ``-np.inf`` and `0 / 0` with ``np.nan``. This matches existing ``Series`` behavior. (:issue:`19322`, :issue:`19347`) - -Previous Behavior: - -.. code-block:: ipython - - In [6]: index = pd.Int64Index([-1, 0, 1]) - - In [7]: index / 0 - Out[7]: Int64Index([0, 0, 0], dtype='int64') - - # Previous behavior yielded different results depending on the type of zero in the divisor - In [8]: index / 0.0 - Out[8]: Float64Index([-inf, nan, inf], dtype='float64') - - In [9]: index = pd.UInt64Index([0, 1]) - - In [10]: index / np.array([0, 0], dtype=np.uint64) - Out[10]: UInt64Index([0, 0], dtype='uint64') - - In [11]: pd.RangeIndex(1, 5) / 0 - ZeroDivisionError: integer division or modulo by zero - -Current Behavior: - -.. 
ipython:: python - - index = pd.Int64Index([-1, 0, 1]) - # division by zero gives -infinity where negative, +infinity where positive, and NaN for 0 / 0 - index / 0 - - # The result of division by zero should not depend on whether the zero is int or float - index / 0.0 - - index = pd.UInt64Index([0, 1]) - index / np.array([0, 0], dtype=np.uint64) - - pd.RangeIndex(1, 5) / 0 - .. _whatsnew_0230.enhancements.assign_dependent: ``.assign()`` accepts dependent arguments @@ -467,6 +423,50 @@ Build Changes - Building from source now explicitly requires ``setuptools`` in ``setup.py`` (:issue:`18113`) - Updated conda recipe to be in compliance with conda-build 3.0+ (:issue:`18002`) +.. _whatsnew_0230.api_breaking.index_division_by_zero: + +Index Division By Zero Fills Correctly +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Division operations on ``Index`` and subclasses will now fill division of positive numbers by zero with ``np.inf``, division of negative numbers by zero with ``-np.inf`` and `0 / 0` with ``np.nan``. This matches existing ``Series`` behavior. (:issue:`19322`, :issue:`19347`) + +Previous Behavior: + +.. code-block:: ipython + + In [6]: index = pd.Int64Index([-1, 0, 1]) + + In [7]: index / 0 + Out[7]: Int64Index([0, 0, 0], dtype='int64') + + # Previous behavior yielded different results depending on the type of zero in the divisor + In [8]: index / 0.0 + Out[8]: Float64Index([-inf, nan, inf], dtype='float64') + + In [9]: index = pd.UInt64Index([0, 1]) + + In [10]: index / np.array([0, 0], dtype=np.uint64) + Out[10]: UInt64Index([0, 0], dtype='uint64') + + In [11]: pd.RangeIndex(1, 5) / 0 + ZeroDivisionError: integer division or modulo by zero + +Current Behavior: + +.. ipython:: python + + index = pd.Int64Index([-1, 0, 1]) + # division by zero gives -infinity where negative, +infinity where positive, and NaN for 0 / 0 + index / 0 + + # The result of division by zero should not depend on whether the zero is int or float + index / 0.0 + + index = pd.UInt64Index([0, 1]) + index / np.array([0, 0], dtype=np.uint64) + + pd.RangeIndex(1, 5) / 0 + .. _whatsnew_0230.api_breaking.extract: Extraction of matching patterns from strings @@ -475,10 +475,8 @@ Extraction of matching patterns from strings By default, extracting matching patterns from strings with :func:`str.extract` used to return a ``Series`` if a single group was being extracted (a ``DataFrame`` if more than one group was extracted``). As of Pandas 0.23.0 :func:`str.extract` always returns a ``DataFrame``, unless -``expand`` is set to ``False`` (:issue:`11386`). - -Also, ``None`` was an accepted value for the ``expand`` parameter (which was equivalent to -``False``), but now raises a ``ValueError``. +``expand`` is set to ``False``. Finallay, ``None`` was an accepted value for +the ``expand`` parameter (which was equivalent to ``False``), but now raises a ``ValueError``. (:issue:`11386`) Previous Behavior: @@ -539,7 +537,26 @@ Notice in the example above that the converted ``Categorical`` has retained ``or Note that the unintenional conversion of ``ordered`` discussed above did not arise in previous versions due to separate bugs that prevented ``astype`` from doing any type of category to category conversion (:issue:`10696`, :issue:`18593`). These bugs have been fixed in this release, and motivated changing the default value of ``ordered``. -.. _whatsnew_0230.api: +.. 
_whatsnew_0230.api.datetimelike: + +Datetimelike API Changes +^^^^^^^^^^^^^^^^^^^^^^^^ + +- The default ``Timedelta`` constructor now accepts an ``ISO 8601 Duration`` string as an argument (:issue:`19040`) +- Addition or subtraction of ``NaT`` from :class:`TimedeltaIndex` will return ``TimedeltaIndex`` instead of ``DatetimeIndex`` (:issue:`19124`) +- :func:`DatetimeIndex.shift` and :func:`TimedeltaIndex.shift` will now raise ``NullFrequencyError`` (which subclasses ``ValueError``, which was raised in older versions) when the index- Addition and subtraction of ``NaN`` from a :class:`Series` with ``dtype='timedelta64[ns]'`` will raise a ``TypeError` instead of treating the ``NaN`` as ``NaT`` (:issue:`19274`) +- ``NaT`` division with :class:`datetime.timedelta` will now return ``NaN`` instead of raising (:issue:`17876`) +- :class:`Timestamp` will no longer silently ignore unused or invalid ``tz`` or ``tzinfo`` keyword arguments (:issue:`17690`) +- :class:`Timestamp` will no longer silently ignore invalid ``freq`` arguments (:issue:`5168`) +- :class:`CacheableOffset` and :class:`WeekDay` are no longer available in the ``pandas.tseries.offsets`` module (:issue:`17830`) +- ``pandas.tseries.frequencies.get_freq_group()`` and ``pandas.tseries.frequencies.DAYS`` are removed from the public API (:issue:`18034`) +- :func:`Series.truncate` and :func:`DataFrame.truncate` will raise a ``ValueError`` if the index is not sorted instead of an unhelpful ``KeyError`` (:issue:`17935`) +- Restricted ``DateOffset`` keyword arguments. Previously, ``DateOffset`` subclasses allowed arbitrary keyword arguments which could lead to unexpected behavior. Now, only valid arguments will be accepted. (:issue:`17176`, :issue:`18226`). +- Subtracting ``NaT`` from a :class:`Series` with ``dtype='datetime64[ns]'`` returns a ``Series`` with ``dtype='timedelta64[ns]'`` instead of ``dtype='datetime64[ns]'``(:issue:`18808`) +- Operations between a :class:`Series` with dtype ``dtype='datetime64[ns]'`` and a :class:`PeriodIndex` will correctly raises ``TypeError`` (:issue:`18850`) +- Subtraction of :class:`Series` with timezone-aware ``dtype='datetime64[ns]'`` with mis-matched timezones will raise ``TypeError`` instead of ``ValueError`` (issue:`18817`) + +.. _whatsnew_0230.api.other: Other API Changes ^^^^^^^^^^^^^^^^^ @@ -547,45 +564,32 @@ Other API Changes - :func:`Series.astype` and :func:`Index.astype` with an incompatible dtype will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`18231`) - ``Series`` construction with an ``object`` dtyped tz-aware datetime and ``dtype=object`` specified, will now return an ``object`` dtyped ``Series``, previously this would infer the datetime dtype (:issue:`18231`) - A :class:`Series` of ``dtype=category`` constructed from an empty ``dict`` will now have categories of ``dtype=object`` rather than ``dtype=float64``, consistently with the case in which an empty list is passed (:issue:`18515`) -- ``NaT`` division with :class:`datetime.timedelta` will now return ``NaN`` instead of raising (:issue:`17876`) - All-NaN levels in a ``MultiIndex`` are now assigned ``float`` rather than ``object`` dtype, promoting consistency with ``Index`` (:issue:`17929`). 
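The datetimelike changes collected in these notes can be exercised directly; the following is a minimal sketch (assuming pandas >= 0.23.0) that simply mirrors assertions from the Timedelta test files and the ``total_seconds`` precision fix earlier in this series:

.. code-block:: python

   from datetime import timedelta

   from pandas import Timedelta

   # ISO 8601 duration strings are accepted by the default constructor
   assert Timedelta('P4DT12H30M5S') == timedelta(days=4, hours=12,
                                                 minutes=30, seconds=5)

   # and round-trip through Timedelta.isoformat()
   td = Timedelta(days=6, minutes=50, seconds=3,
                  milliseconds=10, microseconds=10, nanoseconds=12)
   assert Timedelta(td.isoformat()) == td

   # total_seconds() now divides by 1e9 instead of multiplying by 1e-9,
   # avoiding the floating-point error for Timedelta('30S') (GH 19458)
   assert Timedelta('30S').total_seconds() == 30.0
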
- Levels names of a ``MultiIndex`` (when not None) are now required to be unique: trying to create a ``MultiIndex`` with repeated names will raise a ``ValueError`` (:issue:`18872`) -- :class:`Timestamp` will no longer silently ignore unused or invalid ``tz`` or ``tzinfo`` keyword arguments (:issue:`17690`) -- :class:`Timestamp` will no longer silently ignore invalid ``freq`` arguments (:issue:`5168`) -- :class:`CacheableOffset` and :class:`WeekDay` are no longer available in the ``pandas.tseries.offsets`` module (:issue:`17830`) -- ``pandas.tseries.frequencies.get_freq_group()`` and ``pandas.tseries.frequencies.DAYS`` are removed from the public API (:issue:`18034`) -- :func:`Series.truncate` and :func:`DataFrame.truncate` will raise a ``ValueError`` if the index is not sorted instead of an unhelpful ``KeyError`` (:issue:`17935`) + - :func:`Index.map` can now accept ``Series`` and dictionary input objects (:issue:`12756`, :issue:`18482`, :issue:`18509`). - :func:`DataFrame.unstack` will now default to filling with ``np.nan`` for ``object`` columns. (:issue:`12815`) - :class:`IntervalIndex` constructor will raise if the ``closed`` parameter conflicts with how the input data is inferred to be closed (:issue:`18421`) - Inserting missing values into indexes will work for all types of indexes and automatically insert the correct type of missing value (``NaN``, ``NaT``, etc.) regardless of the type passed in (:issue:`18295`) -- Restricted ``DateOffset`` keyword arguments. Previously, ``DateOffset`` subclasses allowed arbitrary keyword arguments which could lead to unexpected behavior. Now, only valid arguments will be accepted. (:issue:`17176`, :issue:`18226`). -- :func:`DataFrame.from_items` provides a more informative error message when passed scalar values (:issue:`17312`) - When created with duplicate labels, ``MultiIndex`` now raises a ``ValueError``. (:issue:`17464`) - :func:`Series.fillna` now raises a ``TypeError`` instead of a ``ValueError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`) - :func:`pandas.DataFrame.merge` no longer casts a ``float`` column to ``object`` when merging on ``int`` and ``float`` columns (:issue:`16572`) +- :func:`pandas.merge` now raises a ``ValueError`` when trying to merge on incompatible data types (:issue:`9780`) +- :func:`pandas.merge` provides a more informative error message when trying to merge on timezone-aware and timezone-naive columns (:issue:`15800`) - The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN``, which impacts methods that mask with NA, such as ``UInt64Index.where()`` (:issue:`18398`) - Refactored ``setup.py`` to use ``find_packages`` instead of explicitly listing out all subpackages (:issue:`18535`) - Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:issue:`16672`) -- :func:`pandas.merge` now raises a ``ValueError`` when trying to merge on incompatible data types (:issue:`9780`) - :func:`wide_to_long` previously kept numeric-like suffixes as ``object`` dtype. 
Now they are cast to numeric if possible (:issue:`17627`) - In :func:`read_excel`, the ``comment`` argument is now exposed as a named parameter (:issue:`18735`) - Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:issue:`16672`) - The options ``html.border`` and ``mode.use_inf_as_null`` were deprecated in prior versions, these will now show ``FutureWarning`` rather than a ``DeprecationWarning`` (:issue:`19003`) -- Subtracting ``NaT`` from a :class:`Series` with ``dtype='datetime64[ns]'`` returns a ``Series`` with ``dtype='timedelta64[ns]'`` instead of ``dtype='datetime64[ns]'``(:issue:`18808`) -- Operations between a :class:`Series` with dtype ``dtype='datetime64[ns]'`` and a :class:`PeriodIndex` will correctly raises ``TypeError`` (:issue:`18850`) -- Subtraction of :class:`Series` with timezone-aware ``dtype='datetime64[ns]'`` with mis-matched timezones will raise ``TypeError`` instead of ``ValueError`` (issue:`18817`) - :class:`IntervalIndex` and ``IntervalDtype`` no longer support categorical, object, and string subtypes (:issue:`19016`) -- The default ``Timedelta`` constructor now accepts an ``ISO 8601 Duration`` string as an argument (:issue:`19040`) - ``IntervalDtype`` now returns ``True`` when compared against ``'interval'`` regardless of subtype, and ``IntervalDtype.name`` now returns ``'interval'`` regardless of subtype (:issue:`18980`) - ``KeyError`` now raises instead of ``ValueError`` in :meth:`~DataFrame.drop`, :meth:`~Panel.drop`, :meth:`~Series.drop`, :meth:`~Index.drop` when dropping a non-existent element in an axis with duplicates (:issue:`19186`) - :func:`Series.to_csv` now accepts a ``compression`` argument that works in the same way as the ``compression`` argument in :func:`DataFrame.to_csv` (:issue:`18958`) -- Addition or subtraction of ``NaT`` from :class:`TimedeltaIndex` will return ``TimedeltaIndex`` instead of ``DatetimeIndex`` (:issue:`19124`) -- :func:`DatetimeIndex.shift` and :func:`TimedeltaIndex.shift` will now raise ``NullFrequencyError`` (which subclasses ``ValueError``, which was raised in older versions) when the index object frequency is ``None`` (:issue:`19147`) -- Addition and subtraction of ``NaN`` from a :class:`Series` with ``dtype='timedelta64[ns]'`` will raise a ``TypeError` instead of treating the ``NaN`` as ``NaT`` (:issue:`19274`) + object frequency is ``None`` (:issue:`19147`) - Set operations (union, difference...) on :class:`IntervalIndex` with incompatible index types will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`19329`) - :class:`DateOffset` objects render more simply, e.g. "" instead of "" (:issue:`19403`) -- :func:`pandas.merge` provides a more informative error message when trying to merge on timezone-aware and timezone-naive columns (:issue:`15800`) .. _whatsnew_0230.deprecations: @@ -603,7 +607,8 @@ Deprecations - :func:`read_excel` has deprecated the ``skip_footer`` parameter. Use ``skipfooter`` instead (:issue:`18836`) - The ``is_copy`` attribute is deprecated and will be removed in a future version (:issue:`18801`). - ``IntervalIndex.from_intervals`` is deprecated in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) -- :func:``DataFrame.from_items`` is deprecated. Use :func:``DataFrame.from_dict()`` instead, or :func:``DataFrame.from_dict(OrderedDict())`` if you wish to preserve the key order (:issue:`17320`) +- ``DataFrame.from_items`` is deprecated. 
Use :func:`DataFrame.from_dict` instead, or ``DataFrame.from_dict(OrderedDict())`` if you wish to preserve the key order (:issue:`17320`, :issue:`17312`) + - The ``broadcast`` parameter of ``.apply()`` is deprecated in favor of ``result_type='broadcast'`` (:issue:`18577`) - The ``reduce`` parameter of ``.apply()`` is deprecated in favor of ``result_type='reduce'`` (:issue:`18577`) @@ -712,17 +717,12 @@ Datetimelike - Bug in :func:`Series.__sub__` subtracting a non-nanosecond ``np.datetime64`` object from a ``Series`` gave incorrect results (:issue:`7996`) - Bug in :class:`DatetimeIndex`, :class:`TimedeltaIndex` addition and subtraction of zero-dimensional integer arrays gave incorrect results (:issue:`19012`) +- Bug in :class:`DatetimeIndex` and :class:`TimedeltaIndex` where adding or subtracting an array-like of ``DateOffset`` objects either raised (``np.array``, ``pd.Index``) or broadcast incorrectly (``pd.Series``) (:issue:`18849`) - Bug in :func:`Series.__add__` adding Series with dtype ``timedelta64[ns]`` to a timezone-aware ``DatetimeIndex`` incorrectly dropped timezone information (:issue:`13905`) -- Bug in :func:`Timedelta.__floordiv__` and :func:`Timedelta.__rfloordiv__` dividing by many incompatible numpy objects was incorrectly allowed (:issue:`18846`) - Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`) - Bug in :class:`Timestamp` where comparison with an array of ``Timestamp`` objects would result in a ``RecursionError`` (:issue:`15183`) -- Bug in :class:`DatetimeIndex` and :class:`TimedeltaIndex` where adding or subtracting an array-like of ``DateOffset`` objects either raised (``np.array``, ``pd.Index``) or broadcast incorrectly (``pd.Series``) (:issue:`18849`) - Bug in :class:`Series` floor-division where operating on a scalar ``timedelta`` raises an exception (:issue:`18846`) -- Bug in :class:`Series`` with ``dtype='timedelta64[ns]`` where addition or subtraction of ``TimedeltaIndex`` had results cast to ``dtype='int64'`` (:issue:`17250`) -- Bug in :class:`TimedeltaIndex` where division by a ``Series`` would return a ``TimedeltaIndex`` instead of a ``Series`` (:issue:`19042`) -- Bug in :class:`Series` with ``dtype='timedelta64[ns]`` where addition or subtraction of ``TimedeltaIndex`` could return a ``Series`` with an incorrect name (:issue:`19043`) - Bug in :class:`DatetimeIndex` where the repr was not showing high-precision time values at the end of a day (e.g., 23:59:59.999999999) (:issue:`19030`) -- Bug where dividing a scalar timedelta-like object with :class:`TimedeltaIndex` performed the reciprocal operation (:issue:`19125`) - Bug in ``.astype()`` to non-ns timedelta units would hold the incorrect dtype (:issue:`19176`, :issue:`19223`, :issue:`12425`) - Bug in subtracting :class:`Series` from ``NaT`` incorrectly returning ``NaT`` (:issue:`19158`) - Bug in :func:`Series.truncate` which raises ``TypeError`` with a monotonic ``PeriodIndex`` (:issue:`17717`) @@ -731,6 +731,15 @@ Datetimelike - Bug in :class:`Timestamp` and :func:`to_datetime` where a string representing a barely out-of-bounds timestamp would be incorrectly rounded down instead of raising ``OutOfBoundsDatetime`` (:issue:`19382`) - Bug in :func:`Timestamp.floor` :func:`DatetimeIndex.floor` where time stamps far in the future and past were not rounded correctly (:issue:`19206`) - Bug in :func:`to_datetime` where passing an out-of-bounds datetime with ``errors='coerce'`` and ``utc=True`` would raise ``OutOfBoundsDatetime`` instead of 
parsing to ``NaT`` (:issue:`19612`) + +Timedelta +^^^^^^^^^ + +- Bug in :class:`Series`` with ``dtype='timedelta64[ns]`` where addition or subtraction of ``TimedeltaIndex`` had results cast to ``dtype='int64'`` (:issue:`17250`) +- Bug in :class:`Series` with ``dtype='timedelta64[ns]`` where addition or subtraction of ``TimedeltaIndex`` could return a ``Series`` with an incorrect name (:issue:`19043`) +- Bug in :func:`Timedelta.__floordiv__` and :func:`Timedelta.__rfloordiv__` dividing by many incompatible numpy objects was incorrectly allowed (:issue:`18846`) +- Bug where dividing a scalar timedelta-like object with :class:`TimedeltaIndex` performed the reciprocal operation (:issue:`19125`) +- Bug in :class:`TimedeltaIndex` where division by a ``Series`` would return a ``TimedeltaIndex`` instead of a ``Series`` (:issue:`19042`) - Bug in :func:`Timedelta.__add__`, :func:`Timedelta.__sub__` where adding or subtracting a ``np.timedelta64`` object would return another ``np.timedelta64`` instead of a ``Timedelta`` (:issue:`19738`) - Bug in :func:`Timedelta.__floordiv__`, :func:`Timedelta.__rfloordiv__` where operating with a ``Tick`` object would raise a ``TypeError`` instead of returning a numeric value (:issue:`19738`) - Bug in :func:`Timedelta.total_seconds()` causing precision errors i.e. `Timedelta('30S').total_seconds()==30.000000000000004` (:issue:`19458`) From 740ad9aabda113e34de0d30c78b8aec40414daf7 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 20 Feb 2018 08:57:40 -0500 Subject: [PATCH 145/214] DOC: typos in whatsnew --- doc/source/api.rst | 10 ++++++++++ doc/source/whatsnew/v0.23.0.txt | 29 ++++++++++++++--------------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 103b0fe9ff019..3b38f0caa1766 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1682,6 +1682,16 @@ MultiIndex Components MultiIndex.reorder_levels MultiIndex.remove_unused_levels +MultiIndex Selecting +~~~~~~~~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: generated/ + + MultiIndex.get_loc + MultiIndex.get_indexer + MultiIndex.get_level_values + .. 
_api.datetimeindex: DatetimeIndex diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index ed50596843272..879b245af49cd 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -544,7 +544,8 @@ Datetimelike API Changes - The default ``Timedelta`` constructor now accepts an ``ISO 8601 Duration`` string as an argument (:issue:`19040`) - Addition or subtraction of ``NaT`` from :class:`TimedeltaIndex` will return ``TimedeltaIndex`` instead of ``DatetimeIndex`` (:issue:`19124`) -- :func:`DatetimeIndex.shift` and :func:`TimedeltaIndex.shift` will now raise ``NullFrequencyError`` (which subclasses ``ValueError``, which was raised in older versions) when the index- Addition and subtraction of ``NaN`` from a :class:`Series` with ``dtype='timedelta64[ns]'`` will raise a ``TypeError` instead of treating the ``NaN`` as ``NaT`` (:issue:`19274`) +- :func:`DatetimeIndex.shift` and :func:`TimedeltaIndex.shift` will now raise ``NullFrequencyError`` (which subclasses ``ValueError``, which was raised in older versions) when the index object frequency is ``None`` (:issue:`19147`) +- Addition and subtraction of ``NaN`` from a :class:`Series` with ``dtype='timedelta64[ns]'`` will raise a ``TypeError` instead of treating the ``NaN`` as ``NaT`` (:issue:`19274`) - ``NaT`` division with :class:`datetime.timedelta` will now return ``NaN`` instead of raising (:issue:`17876`) - :class:`Timestamp` will no longer silently ignore unused or invalid ``tz`` or ``tzinfo`` keyword arguments (:issue:`17690`) - :class:`Timestamp` will no longer silently ignore invalid ``freq`` arguments (:issue:`5168`) @@ -552,9 +553,9 @@ Datetimelike API Changes - ``pandas.tseries.frequencies.get_freq_group()`` and ``pandas.tseries.frequencies.DAYS`` are removed from the public API (:issue:`18034`) - :func:`Series.truncate` and :func:`DataFrame.truncate` will raise a ``ValueError`` if the index is not sorted instead of an unhelpful ``KeyError`` (:issue:`17935`) - Restricted ``DateOffset`` keyword arguments. Previously, ``DateOffset`` subclasses allowed arbitrary keyword arguments which could lead to unexpected behavior. Now, only valid arguments will be accepted. (:issue:`17176`, :issue:`18226`). -- Subtracting ``NaT`` from a :class:`Series` with ``dtype='datetime64[ns]'`` returns a ``Series`` with ``dtype='timedelta64[ns]'`` instead of ``dtype='datetime64[ns]'``(:issue:`18808`) +- Subtracting ``NaT`` from a :class:`Series` with ``dtype='datetime64[ns]'`` returns a ``Series`` with ``dtype='timedelta64[ns]'`` instead of ``dtype='datetime64[ns]'`` (:issue:`18808`) - Operations between a :class:`Series` with dtype ``dtype='datetime64[ns]'`` and a :class:`PeriodIndex` will correctly raises ``TypeError`` (:issue:`18850`) -- Subtraction of :class:`Series` with timezone-aware ``dtype='datetime64[ns]'`` with mis-matched timezones will raise ``TypeError`` instead of ``ValueError`` (issue:`18817`) +- Subtraction of :class:`Series` with timezone-aware ``dtype='datetime64[ns]'`` with mis-matched timezones will raise ``TypeError`` instead of ``ValueError`` (:issue:`18817`) .. 
_whatsnew_0230.api.other: @@ -566,7 +567,6 @@ Other API Changes - A :class:`Series` of ``dtype=category`` constructed from an empty ``dict`` will now have categories of ``dtype=object`` rather than ``dtype=float64``, consistently with the case in which an empty list is passed (:issue:`18515`) - All-NaN levels in a ``MultiIndex`` are now assigned ``float`` rather than ``object`` dtype, promoting consistency with ``Index`` (:issue:`17929`). - Levels names of a ``MultiIndex`` (when not None) are now required to be unique: trying to create a ``MultiIndex`` with repeated names will raise a ``ValueError`` (:issue:`18872`) - - :func:`Index.map` can now accept ``Series`` and dictionary input objects (:issue:`12756`, :issue:`18482`, :issue:`18509`). - :func:`DataFrame.unstack` will now default to filling with ``np.nan`` for ``object`` columns. (:issue:`12815`) - :class:`IntervalIndex` constructor will raise if the ``closed`` parameter conflicts with how the input data is inferred to be closed (:issue:`18421`) @@ -587,9 +587,8 @@ Other API Changes - ``IntervalDtype`` now returns ``True`` when compared against ``'interval'`` regardless of subtype, and ``IntervalDtype.name`` now returns ``'interval'`` regardless of subtype (:issue:`18980`) - ``KeyError`` now raises instead of ``ValueError`` in :meth:`~DataFrame.drop`, :meth:`~Panel.drop`, :meth:`~Series.drop`, :meth:`~Index.drop` when dropping a non-existent element in an axis with duplicates (:issue:`19186`) - :func:`Series.to_csv` now accepts a ``compression`` argument that works in the same way as the ``compression`` argument in :func:`DataFrame.to_csv` (:issue:`18958`) - object frequency is ``None`` (:issue:`19147`) - Set operations (union, difference...) on :class:`IntervalIndex` with incompatible index types will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`19329`) -- :class:`DateOffset` objects render more simply, e.g. "" instead of "" (:issue:`19403`) +- :class:`DateOffset` objects render more simply, e.g. ```` instead of ```` (:issue:`19403`) .. _whatsnew_0230.deprecations: @@ -637,11 +636,11 @@ Removal of prior version deprecations/changes - ``pandas.tseries.frequencies.get_standard_freq`` has been removed in favor of ``pandas.tseries.frequencies.to_offset(freq).rule_code`` (:issue:`13874`) - The ``freqstr`` keyword has been removed from ``pandas.tseries.frequencies.to_offset`` in favor of ``freq`` (:issue:`13874`) - The ``Panel4D`` and ``PanelND`` classes have been removed (:issue:`13776`) -- The ``Panel`` class has dropped the ``to_long``and ``toLong`` methods (:issue:`19077`) +- The ``Panel`` class has dropped the ``to_long`` and ``toLong`` methods (:issue:`19077`) - The options ``display.line_with`` and ``display.height`` are removed in favor of ``display.width`` and ``display.max_rows`` respectively (:issue:`4391`, :issue:`19107`) - The ``labels`` attribute of the ``Categorical`` class has been removed in favor of :attribute:`Categorical.codes` (:issue:`7768`) - The ``flavor`` parameter have been removed from func:`to_sql` method (:issue:`13611`) -- The modules `pandas.tools.hashing` and `pandas.util.hashing` have been removed (:issue:`16223`) +- The modules ``pandas.tools.hashing`` and ``pandas.util.hashing`` have been removed (:issue:`16223`) - The top-level functions ``pd.rolling_*``, ``pd.expanding_*`` and ``pd.ewm*`` have been removed (Deprecated since v0.18). 
Instead, use the DataFrame/Series methods :attr:`~DataFrame.rolling`, :attr:`~DataFrame.expanding` and :attr:`~DataFrame.ewm` (:issue:`18723`) @@ -652,7 +651,7 @@ Performance Improvements - Indexers on ``Series`` or ``DataFrame`` no longer create a reference cycle (:issue:`17956`) - Added a keyword argument, ``cache``, to :func:`to_datetime` that improved the performance of converting duplicate datetime arguments (:issue:`11665`) -- :class`DateOffset` arithmetic performance is improved (:issue:`18218`) +- :class:`DateOffset` arithmetic performance is improved (:issue:`18218`) - Converting a ``Series`` of ``Timedelta`` objects to days, seconds, etc... sped up through vectorization of underlying methods (:issue:`18092`) - Improved performance of ``.map()`` with a ``Series/dict`` input (:issue:`15081`) - The overridden ``Timedelta`` properties of days, seconds and microseconds have been removed, leveraging their built-in Python versions instead (:issue:`18242`) @@ -735,14 +734,14 @@ Datetimelike Timedelta ^^^^^^^^^ -- Bug in :class:`Series`` with ``dtype='timedelta64[ns]`` where addition or subtraction of ``TimedeltaIndex`` had results cast to ``dtype='int64'`` (:issue:`17250`) +- Bug in :class:`Series` with ``dtype='timedelta64[ns]`` where addition or subtraction of ``TimedeltaIndex`` had results cast to ``dtype='int64'`` (:issue:`17250`) - Bug in :class:`Series` with ``dtype='timedelta64[ns]`` where addition or subtraction of ``TimedeltaIndex`` could return a ``Series`` with an incorrect name (:issue:`19043`) - Bug in :func:`Timedelta.__floordiv__` and :func:`Timedelta.__rfloordiv__` dividing by many incompatible numpy objects was incorrectly allowed (:issue:`18846`) - Bug where dividing a scalar timedelta-like object with :class:`TimedeltaIndex` performed the reciprocal operation (:issue:`19125`) - Bug in :class:`TimedeltaIndex` where division by a ``Series`` would return a ``TimedeltaIndex`` instead of a ``Series`` (:issue:`19042`) - Bug in :func:`Timedelta.__add__`, :func:`Timedelta.__sub__` where adding or subtracting a ``np.timedelta64`` object would return another ``np.timedelta64`` instead of a ``Timedelta`` (:issue:`19738`) - Bug in :func:`Timedelta.__floordiv__`, :func:`Timedelta.__rfloordiv__` where operating with a ``Tick`` object would raise a ``TypeError`` instead of returning a numeric value (:issue:`19738`) -- Bug in :func:`Timedelta.total_seconds()` causing precision errors i.e. `Timedelta('30S').total_seconds()==30.000000000000004` (:issue:`19458`) +- Bug in :func:`Timedelta.total_seconds()` causing precision errors i.e. 
``Timedelta('30S').total_seconds()==30.000000000000004`` (:issue:`19458`) Timezones @@ -763,7 +762,7 @@ Timezones Offsets ^^^^^^^ -- Bug in :class:`WeekOfMonth` and class:`Week` where addition and subtraction did not roll correctly (:issue:`18510`,:issue:`18672`,:issue:`18864`) +- Bug in :class:`WeekOfMonth` and :class:`Week` where addition and subtraction did not roll correctly (:issue:`18510`, :issue:`18672`, :issue:`18864`) - Bug in :class:`WeekOfMonth` and :class:`LastWeekOfMonth` where default keyword arguments for constructor raised ``ValueError`` (:issue:`19142`) - Bug in :class:`FY5253Quarter`, :class:`LastWeekOfMonth` where rollback and rollforward behavior was inconsistent with addition and subtraction behavior (:issue:`18854`) - Bug in :class:`FY5253` where ``datetime`` addition and subtraction incremented incorrectly for dates on the year-end but not normalized to midnight (:issue:`18854`) @@ -776,7 +775,7 @@ Numeric - Bug in :class:`Index` multiplication and division methods where operating with a ``Series`` would return an ``Index`` object instead of a ``Series`` object (:issue:`19042`) - Bug in the :class:`DataFrame` constructor in which data containing very large positive or very large negative numbers was causing ``OverflowError`` (:issue:`18584`) - Bug in :class:`Index` constructor with ``dtype='uint64'`` where int-like floats were not coerced to :class:`UInt64Index` (:issue:`18400`) -- Bug in :class:`DataFrame` flex arithmetic (e.g. `df.add(other, fill_value=foo)`) with a `fill_value` other than ``None`` failed to raise ``NotImplementedError`` in corner cases where either the frame or ``other`` has length zero (:issue:`19522`) +- Bug in :class:`DataFrame` flex arithmetic (e.g. ``df.add(other, fill_value=foo)``) with a ``fill_value`` other than ``None`` failed to raise ``NotImplementedError`` in corner cases where either the frame or ``other`` has length zero (:issue:`19522`) Indexing @@ -863,13 +862,13 @@ Reshaping - Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`) - Bug in :func:`Series.rank` where ``Series`` containing ``NaT`` modifies the ``Series`` inplace (:issue:`18521`) - Bug in :func:`cut` which fails when using readonly arrays (:issue:`18773`) -- Bug in :func:`Dataframe.pivot_table` which fails when the ``aggfunc`` arg is of type string. The behavior is now consistent with other methods like ``agg`` and ``apply`` (:issue:`18713`) +- Bug in :func:`DataFrame.pivot_table` which fails when the ``aggfunc`` arg is of type string. The behavior is now consistent with other methods like ``agg`` and ``apply`` (:issue:`18713`) - Bug in :func:`DataFrame.merge` in which merging using ``Index`` objects as vectors raised an Exception (:issue:`19038`) - Bug in :func:`DataFrame.stack`, :func:`DataFrame.unstack`, :func:`Series.unstack` which were not returning subclasses (:issue:`15563`) - Bug in timezone comparisons, manifesting as a conversion of the index to UTC in ``.concat()`` (:issue:`18523`) - Bug in :func:`concat` when concatting sparse and dense series it returns only a ``SparseDataFrame``. Should be a ``DataFrame``. 
(:issue:`18914`, :issue:`18686`, and :issue:`16874`) - Improved error message for :func:`DataFrame.merge` when there is no common merge key (:issue:`19427`) -- Bug in :func:`DataFrame.join` which does an *outer* instead of a *left* join when being called with multiple DataFrames and some have non-unique indices (:issue:`19624`) +- Bug in :func:`DataFrame.join` which does an ``outer`` instead of a ``left`` join when being called with multiple DataFrames and some have non-unique indices (:issue:`19624`) - :func:`Series.rename` now accepts ``axis`` as a kwarg (:issue:`18589`) - Comparisons between :class:`Series` and :class:`Index` would return a ``Series`` with an incorrect name, ignoring the ``Index``'s name attribute (:issue:`19582`) From 3da7b1fe665a34dab6510dad5d68d57a9ca4879a Mon Sep 17 00:00:00 2001 From: Olivier Bilodeau Date: Tue, 20 Feb 2018 18:54:14 -0500 Subject: [PATCH 146/214] DOC: added a reference to DataFrame assign in concatenate section of merging (#18665) --- doc/source/merging.rst | 7 +++++++ doc/source/whatsnew/v0.23.0.txt | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/source/merging.rst b/doc/source/merging.rst index ebade853313ab..4d9746eed0f0b 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -323,6 +323,13 @@ the name of the ``Series``. labels=['df1', 's1'], vertical=False); plt.close('all'); +.. note:: + + Since we're concatenating a ``Series`` to a ``DataFrame``, we could have + achieved the same result with :meth:`DataFrame.assign`. To concatenate an + arbitrary number of pandas objects (``DataFrame`` or ``Series``), use + ``concat``. + If unnamed ``Series`` are passed they will be numbered consecutively. .. ipython:: python diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 879b245af49cd..8d6a3dc72163e 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -677,7 +677,7 @@ Documentation Changes Rewrote some sentences for greater clarity, added more dynamic references to functions, methods and classes. (:issue:`18941`, :issue:`18948`, :issue:`18973`, :issue:`19017`) - +- Added a reference to :func:`DataFrame.assign` in the concatenate section of the merging documentation (:issue:`18665`) .. 
_whatsnew_0230.bug_fixes: From 83820679f16e6803b653ede5a4e468a211fcb443 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 20 Feb 2018 16:15:56 -0800 Subject: [PATCH 147/214] Sparse Ops Cleanup (#19782) --- pandas/core/frame.py | 2 +- pandas/core/indexes/timedeltas.py | 2 +- pandas/core/ops.py | 45 +++++++++++++------------------ pandas/core/sparse/array.py | 7 +++++ pandas/core/sparse/frame.py | 2 +- pandas/core/sparse/series.py | 5 +--- 6 files changed, 30 insertions(+), 33 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0b315a7c6f031..efd6814ba04c5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3995,7 +3995,7 @@ def _combine_const(self, other, func, errors='raise', try_cast=True): try_cast=try_cast) return self._constructor(new_data) - def _compare_frame(self, other, func, str_rep, try_cast=True): + def _compare_frame(self, other, func, str_rep): # compare_frame assumes self._indexed_same(other) import pandas.core.computation.expressions as expressions diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 4b543262fc485..41e499da8e008 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -928,7 +928,7 @@ def insert(self, loc, item): def delete(self, loc): """ - Make a new DatetimeIndex with passed location(s) deleted. + Make a new TimedeltaIndex with passed location(s) deleted. Parameters ---------- diff --git a/pandas/core/ops.py b/pandas/core/ops.py index da65f1f31ed2a..ad6102eb6ad0f 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -721,9 +721,7 @@ def add_flex_arithmetic_methods(cls, flex_arith_method, flex_comp_method=None): subtract=new_methods['sub'], divide=new_methods['div'])) # opt out of bool flex methods for now - for k in ('ror_', 'rxor', 'rand_'): - if k in new_methods: - new_methods.pop(k) + assert not any(kname in new_methods for kname in ('ror_', 'rxor', 'rand_')) add_methods(cls, new_methods=new_methods) @@ -1080,19 +1078,19 @@ def na_op(x, y): try: result = lib.scalar_binop(x, y, op) except: - msg = ("cannot compare a dtyped [{dtype}] array " - "with a scalar of type [{type}]" - ).format(dtype=x.dtype, type=type(y).__name__) - raise TypeError(msg) + raise TypeError("cannot compare a dtyped [{dtype}] array " + "with a scalar of type [{typ}]" + .format(dtype=x.dtype, + typ=type(y).__name__)) return result + fill_int = lambda x: x.fillna(0) + fill_bool = lambda x: x.fillna(False).astype(bool) + def wrapper(self, other): is_self_int_dtype = is_integer_dtype(self.dtype) - fill_int = lambda x: x.fillna(0) - fill_bool = lambda x: x.fillna(False).astype(bool) - self, other = _align_method_SERIES(self, other, align_asobject=True) if isinstance(other, ABCDataFrame): @@ -1232,10 +1230,10 @@ def to_series(right): elif right.ndim == 2: if left.shape != right.shape: - msg = ("Unable to coerce to DataFrame, shape " - "must be {req_shape}: given {given_shape}" - ).format(req_shape=left.shape, given_shape=right.shape) - raise ValueError(msg) + raise ValueError("Unable to coerce to DataFrame, shape " + "must be {req_shape}: given {given_shape}" + .format(req_shape=left.shape, + given_shape=right.shape)) right = left._constructor(right, index=left.index, columns=left.columns) @@ -1293,8 +1291,8 @@ def na_op(x, y): result[mask] = op(xrav, y) else: raise TypeError("cannot perform operation {op} between " - "objects of type {x} and {y}".format( - op=name, x=type(x), y=type(y))) + "objects of type {x} and {y}" + .format(op=name, x=type(x), y=type(y))) result, 
changed = maybe_upcast_putmask(result, ~mask, np.nan) result = result.reshape(x.shape) @@ -1355,7 +1353,7 @@ def f(self, other, axis=default_axis, level=None): if not self._indexed_same(other): self, other = self.align(other, 'outer', level=level, copy=False) - return self._compare_frame(other, na_op, str_rep, try_cast=False) + return self._compare_frame(other, na_op, str_rep) elif isinstance(other, ABCSeries): return _combine_series_frame(self, other, na_op, @@ -1380,7 +1378,7 @@ def f(self, other): if not self._indexed_same(other): raise ValueError('Can only compare identically-labeled ' 'DataFrame objects') - return self._compare_frame(other, func, str_rep, try_cast=True) + return self._compare_frame(other, func, str_rep) elif isinstance(other, ABCSeries): return _combine_series_frame(self, other, func, @@ -1532,10 +1530,6 @@ def wrapper(self, other): .format(other=type(other))) wrapper.__name__ = name - if name.startswith("__"): - # strip special method names, e.g. `__add__` needs to be `add` when - # passed to _sparse_series_op - name = name[2:-2] return wrapper @@ -1568,7 +1562,7 @@ def wrapper(self, other): dtype = getattr(other, 'dtype', None) other = SparseArray(other, fill_value=self.fill_value, dtype=dtype) - return _sparse_array_op(self, other, op, name) + return _sparse_array_op(self, other, op, name, series=False) elif is_scalar(other): with np.errstate(all='ignore'): fill = op(_get_fill(self), np.asarray(other)) @@ -1579,8 +1573,6 @@ def wrapper(self, other): raise TypeError('operation with {other} not supported' .format(other=type(other))) - if name.startswith("__"): - name = name[2:-2] wrapper.__name__ = name return wrapper @@ -1591,4 +1583,5 @@ def wrapper(self, other): sparse_series_special_funcs = dict(arith_method=_arith_method_SPARSE_SERIES, comp_method=_arith_method_SPARSE_SERIES, - bool_method=None) + bool_method=_bool_method_SERIES) +# TODO: I don't think the functions defined by bool_method are tested diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index 4f7152666f7bf..92c4fe932f066 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -54,6 +54,9 @@ def _get_fill(arr): def _sparse_array_op(left, right, op, name, series=False): + if name.startswith('__'): + # For lookups in _libs.sparse we need non-dunder op name + name = name[2:-2] if series and is_integer_dtype(left) and is_integer_dtype(right): # series coerces to float64 if result should have NaN/inf @@ -119,6 +122,10 @@ def _sparse_array_op(left, right, op, name, series=False): def _wrap_result(name, data, sparse_index, fill_value, dtype=None): """ wrap op result to have correct dtype """ + if name.startswith('__'): + # e.g. __eq__ --> eq + name = name[2:-2] + if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'): dtype = np.bool diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 19b126216db81..872a17d8dbabe 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -551,7 +551,6 @@ def _combine_frame(self, other, func, fill_value=None, level=None): return self._constructor(index=new_index).__finalize__(self) new_data = {} - new_fill_value = None if fill_value is not None: # TODO: be a bit more intelligent here for col in new_columns: @@ -568,6 +567,7 @@ def _combine_frame(self, other, func, fill_value=None, level=None): new_data[col] = func(this[col], other[col]) # if the fill values are the same use them? 
or use a valid one + new_fill_value = None other_fill_value = getattr(other, 'default_fill_value', np.nan) if self.default_fill_value == other_fill_value: new_fill_value = self.default_fill_value diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 335a4c80adc63..26cf9dbadbbf2 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -811,10 +811,7 @@ def from_coo(cls, A, dense_index=False): return _coo_to_sparse_series(A, dense_index=dense_index) -# overwrite series methods with unaccelerated versions -ops.add_special_arithmetic_methods(SparseSeries, **ops.series_special_funcs) +# overwrite series methods with unaccelerated Sparse-specific versions ops.add_flex_arithmetic_methods(SparseSeries, **ops.series_flex_funcs) -# overwrite basic arithmetic to use SparseSeries version -# force methods to overwrite previous definitions. ops.add_special_arithmetic_methods(SparseSeries, **ops.sparse_series_special_funcs) From e5be6bde969a75b62fee0d9449cf82c2ce29ca1b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 21 Feb 2018 02:31:55 -0800 Subject: [PATCH 148/214] BUG: fix Period.asfreq conversion near datetime(1, 1, 1) (#19650) --- doc/source/whatsnew/v0.23.0.txt | 3 +- pandas/_libs/src/period_helper.c | 5 +- pandas/_libs/tslibs/period.pyx | 124 +++++++++++++++--- .../tests/scalar/period/test_period_asfreq.py | 22 ++++ pandas/tests/tslibs/test_period_asfreq.py | 1 + 5 files changed, 133 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 8d6a3dc72163e..35856b64c171a 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -741,8 +741,9 @@ Timedelta - Bug in :class:`TimedeltaIndex` where division by a ``Series`` would return a ``TimedeltaIndex`` instead of a ``Series`` (:issue:`19042`) - Bug in :func:`Timedelta.__add__`, :func:`Timedelta.__sub__` where adding or subtracting a ``np.timedelta64`` object would return another ``np.timedelta64`` instead of a ``Timedelta`` (:issue:`19738`) - Bug in :func:`Timedelta.__floordiv__`, :func:`Timedelta.__rfloordiv__` where operating with a ``Tick`` object would raise a ``TypeError`` instead of returning a numeric value (:issue:`19738`) +- Bug in :func:`Period.asfreq` where periods near ``datetime(1, 1, 1)`` could be converted incorrectly (:issue:`19643`) - Bug in :func:`Timedelta.total_seconds()` causing precision errors i.e. 
``Timedelta('30S').total_seconds()==30.000000000000004`` (:issue:`19458`) - +- Timezones ^^^^^^^^^ diff --git a/pandas/_libs/src/period_helper.c b/pandas/_libs/src/period_helper.c index 7c4de8e42e73b..a812ed2e7e2b3 100644 --- a/pandas/_libs/src/period_helper.c +++ b/pandas/_libs/src/period_helper.c @@ -138,7 +138,7 @@ PANDAS_INLINE npy_int64 transform_via_day(npy_int64 ordinal, } static npy_int64 DtoB_weekday(npy_int64 absdate) { - return (((absdate) / 7) * 5) + (absdate) % 7 - BDAY_OFFSET; + return floordiv(absdate, 7) * 5 + mod_compat(absdate, 7) - BDAY_OFFSET; } static npy_int64 DtoB(struct date_info *dinfo, @@ -245,7 +245,8 @@ static npy_int64 asfreq_UpsampleWithinDay(npy_int64 ordinal, static npy_int64 asfreq_BtoDT(npy_int64 ordinal, asfreq_info *af_info) { ordinal += BDAY_OFFSET; ordinal = - (((ordinal - 1) / 5) * 7 + mod_compat(ordinal - 1, 5) + 1 - ORD_OFFSET); + (floordiv(ordinal - 1, 5) * 7 + mod_compat(ordinal - 1, 5) + 1 - + ORD_OFFSET); return upsample_daytime(ordinal, af_info); } diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 32ffe4e6d0453..e1c783ac9fa54 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -154,12 +154,32 @@ cdef inline int get_freq_group(int freq) nogil: return (freq // 1000) * 1000 -@cython.cdivision +# specifically _dont_ use cdvision or else ordinals near -1 are assigned to +# incorrect dates GH#19643 +@cython.cdivision(False) cdef int64_t get_period_ordinal(int year, int month, int day, int hour, int minute, int second, int microseconds, int picoseconds, int freq) nogil: - """generate an ordinal in period space""" + """ + Generate an ordinal in period space + + Parameters + ---------- + year : int + month : int + day : int + hour : int + minute : int + second : int + microseconds : int + picoseconds : int + freq : int + + Returns + ------- + period_ordinal : int64_t + """ cdef: int64_t absdays, unix_date, seconds, delta int64_t weeks @@ -190,7 +210,7 @@ cdef int64_t get_period_ordinal(int year, int month, int day, if month >= fmonth: mdiff += 12 - return (year - 1970) * 4 + (mdiff - 1) / 3 + return (year - 1970) * 4 + (mdiff - 1) // 3 elif freq == FR_MTH: return (year - 1970) * 12 + month - 1 @@ -202,14 +222,14 @@ cdef int64_t get_period_ordinal(int year, int month, int day, seconds = unix_date * 86400 + hour * 3600 + minute * 60 + second if freq == FR_MS: - return seconds * 1000 + microseconds / 1000 + return seconds * 1000 + microseconds // 1000 elif freq == FR_US: return seconds * 1000000 + microseconds elif freq == FR_NS: return (seconds * 1000000000 + - microseconds * 1000 + picoseconds / 1000) + microseconds * 1000 + picoseconds // 1000) else: return seconds @@ -229,7 +249,7 @@ cdef int64_t get_period_ordinal(int year, int month, int day, elif freq == FR_BUS: # calculate the current week assuming sunday as last day of a week # Jan 1 0001 is a Monday, so subtract 1 to get to end-of-week - weeks = (unix_date + ORD_OFFSET - 1) / 7 + weeks = (unix_date + ORD_OFFSET - 1) // 7 # calculate the current weekday (in range 1 .. 
7) delta = (unix_date + ORD_OFFSET - 1) % 7 + 1 # return the number of business days in full weeks plus the business @@ -241,12 +261,12 @@ cdef int64_t get_period_ordinal(int year, int month, int day, elif freq_group == FR_WK: day_adj = freq - FR_WK - return (unix_date + ORD_OFFSET - (1 + day_adj)) / 7 + 1 - WEEK_OFFSET + return (unix_date + ORD_OFFSET - (1 + day_adj)) // 7 + 1 - WEEK_OFFSET # raise ValueError -cdef int get_date_info(int64_t ordinal, int freq, date_info *dinfo) nogil: +cdef void get_date_info(int64_t ordinal, int freq, date_info *dinfo) nogil: cdef: int64_t absdate double abstime @@ -263,7 +283,6 @@ cdef int get_date_info(int64_t ordinal, int freq, date_info *dinfo) nogil: absdate += 1 dInfoCalc_SetFromAbsDateTime(dinfo, absdate, abstime) - return 0 cdef int64_t get_python_ordinal(int64_t period_ordinal, int freq) nogil: @@ -272,6 +291,15 @@ cdef int64_t get_python_ordinal(int64_t period_ordinal, int freq) nogil: This corresponds to the number of days since Jan., 1st, 1AD. When the instance has a frequency less than daily, the proleptic date is calculated for the last day of the period. + + Parameters + ---------- + period_ordinal : int64_t + freq : int + + Returns + ------- + absdate : int64_t number of days since datetime(1, 1, 1) """ cdef: asfreq_info af_info @@ -285,11 +313,23 @@ cdef int64_t get_python_ordinal(int64_t period_ordinal, int freq) nogil: return toDaily(period_ordinal, &af_info) + ORD_OFFSET -cdef int dInfoCalc_SetFromAbsDateTime(date_info *dinfo, - int64_t absdate, double abstime) nogil: +cdef void dInfoCalc_SetFromAbsDateTime(date_info *dinfo, + int64_t absdate, double abstime) nogil: """ Set the instance's value using the given date and time. Assumes GREGORIAN_CALENDAR. + + Parameters + ---------- + dinfo : date_info* + absdate : int64_t + days elapsed since datetime(1, 1, 1) + abstime : double + seconds elapsed since beginning of day described by absdate + + Notes + ----- + Updates dinfo inplace """ # Bounds check # The calling function is responsible for ensuring that @@ -300,13 +340,21 @@ cdef int dInfoCalc_SetFromAbsDateTime(date_info *dinfo, # Calculate the time dInfoCalc_SetFromAbsTime(dinfo, abstime) - return 0 -cdef int dInfoCalc_SetFromAbsDate(date_info *dinfo, int64_t absdate) nogil: +cdef void dInfoCalc_SetFromAbsDate(date_info *dinfo, int64_t absdate) nogil: """ Sets the date part of the date_info struct Assumes GREGORIAN_CALENDAR + + Parameters + ---------- + dinfo : date_info* + unix_date : int64_t + + Notes + ----- + Updates dinfo inplace """ cdef: pandas_datetimestruct dts @@ -315,13 +363,22 @@ cdef int dInfoCalc_SetFromAbsDate(date_info *dinfo, int64_t absdate) nogil: dinfo.year = dts.year dinfo.month = dts.month dinfo.day = dts.day - return 0 @cython.cdivision -cdef int dInfoCalc_SetFromAbsTime(date_info *dinfo, double abstime) nogil: +cdef void dInfoCalc_SetFromAbsTime(date_info *dinfo, double abstime) nogil: """ Sets the time part of the DateTime object. + + Parameters + ---------- + dinfo : date_info* + abstime : double + seconds elapsed since beginning of day described by absdate + + Notes + ----- + Updates dinfo inplace """ cdef: int inttime @@ -336,7 +393,6 @@ cdef int dInfoCalc_SetFromAbsTime(date_info *dinfo, double abstime) nogil: dinfo.hour = hour dinfo.minute = minute dinfo.second = second - return 0 @cython.cdivision @@ -370,7 +426,19 @@ cdef int64_t absdate_from_ymd(int year, int month, int day) nogil: Find the absdate (days elapsed since datetime(1, 1, 1) for the given year/month/day. 
Assumes GREGORIAN_CALENDAR + + Parameters + ---------- + year : int + month : int + day : int + + Returns + ------- + absdate : int + days elapsed since datetime(1, 1, 1) """ + # /* Calculate the absolute date cdef: pandas_datetimestruct dts @@ -385,6 +453,25 @@ cdef int64_t absdate_from_ymd(int year, int month, int day) nogil: cdef int get_yq(int64_t ordinal, int freq, int *quarter, int *year): + """ + Find the year and quarter of a Period with the given ordinal and frequency + + Parameters + ---------- + ordinal : int64_t + freq : int + quarter : *int + year : *int + + Returns + ------- + qtr_freq : int + describes the implied quarterly frequency associated with `freq` + + Notes + ----- + Sets quarter and year inplace + """ cdef: asfreq_info af_info int qtr_freq @@ -403,8 +490,8 @@ cdef int get_yq(int64_t ordinal, int freq, int *quarter, int *year): return qtr_freq -cdef int64_t DtoQ_yq(int64_t ordinal, asfreq_info *af_info, - int *year, int *quarter): +cdef void DtoQ_yq(int64_t ordinal, asfreq_info *af_info, + int *year, int *quarter): cdef: date_info dinfo @@ -419,7 +506,6 @@ cdef int64_t DtoQ_yq(int64_t ordinal, asfreq_info *af_info, year[0] = dinfo.year quarter[0] = monthToQuarter(dinfo.month) - return 0 cdef inline int monthToQuarter(int month): diff --git a/pandas/tests/scalar/period/test_period_asfreq.py b/pandas/tests/scalar/period/test_period_asfreq.py index a2819a3478f79..9f8b2562e9e20 100644 --- a/pandas/tests/scalar/period/test_period_asfreq.py +++ b/pandas/tests/scalar/period/test_period_asfreq.py @@ -1,3 +1,7 @@ +import pytest + +from pandas.errors import OutOfBoundsDatetime + import pandas as pd from pandas import Period, offsets from pandas.util import testing as tm @@ -6,6 +10,24 @@ class TestFreqConversion(object): """Test frequency conversion of date objects""" + @pytest.mark.parametrize('freq', ['A', 'Q', 'M', 'W', 'B', 'D']) + def test_asfreq_near_zero(self, freq): + # GH#19643, GH#19650 + per = Period('0001-01-01', freq=freq) + tup1 = (per.year, per.hour, per.day) + + prev = per - 1 + assert (per - 1).ordinal == per.ordinal - 1 + tup2 = (prev.year, prev.month, prev.day) + assert tup2 < tup1 + + @pytest.mark.xfail(reason='GH#19643 period_helper asfreq functions fail ' + 'to check for overflows') + def test_to_timestamp_out_of_bounds(self): + # GH#19643, currently gives Timestamp('1754-08-30 22:43:41.128654848') + per = Period('0001-01-01', freq='B') + with pytest.raises(OutOfBoundsDatetime): + per.to_timestamp() def test_asfreq_corner(self): val = Period(freq='A', year=2007) diff --git a/pandas/tests/tslibs/test_period_asfreq.py b/pandas/tests/tslibs/test_period_asfreq.py index 98959adf6fda4..61737083e22ea 100644 --- a/pandas/tests/tslibs/test_period_asfreq.py +++ b/pandas/tests/tslibs/test_period_asfreq.py @@ -5,6 +5,7 @@ class TestPeriodFreqConversion(object): + def test_intraday_conversion_factors(self): assert period_asfreq(1, get_freq('D'), get_freq('H'), False) == 24 assert period_asfreq(1, get_freq('D'), get_freq('T'), False) == 1440 From f8dfcfb35ca975575dcfba625eac6c9f231c0e5e Mon Sep 17 00:00:00 2001 From: ZhuBaohe Date: Wed, 21 Feb 2018 18:38:41 +0800 Subject: [PATCH 149/214] DOC: correct Series.searchsorted example (#19784) --- pandas/core/base.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 0ca029ffd4c25..ebd69a5f9aac1 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1165,21 +1165,16 @@ def factorize(self, sort=False, na_sentinel=-1): >>> x.searchsorted([1, 
3], side='right') array([1, 3]) - >>> x = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk' ]) + >>> x = pd.Categorical(['apple', 'bread', 'bread', + 'cheese', 'milk'], ordered=True) [apple, bread, bread, cheese, milk] Categories (4, object): [apple < bread < cheese < milk] >>> x.searchsorted('bread') array([1]) # Note: an array, not a scalar - >>> x.searchsorted(['bread']) - array([1]) - - >>> x.searchsorted(['bread', 'eggs']) - array([1, 4]) - - >>> x.searchsorted(['bread', 'eggs'], side='right') - array([3, 4]) # eggs before milk + >>> x.searchsorted(['bread'], side='right') + array([3]) """) @Substitution(klass='IndexOpsMixin') From feec5c140dc83963c1b00679e1e8b9c19665dd7d Mon Sep 17 00:00:00 2001 From: Antonio Quinonez Date: Wed, 21 Feb 2018 02:56:21 -0800 Subject: [PATCH 150/214] DOC: Edit installation instructions for clarity. (#19798) --- doc/source/install.rst | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/doc/source/install.rst b/doc/source/install.rst index c4e331d64e721..4ff63d59024b2 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -6,7 +6,7 @@ Installation ============ -The easiest way for the majority of users to install pandas is to install it +The easiest way to install pandas is to install it as part of the `Anaconda `__ distribution, a cross platform distribution for data analysis and scientific computing. This is the recommended installation method for most users. @@ -40,7 +40,7 @@ packages that make up the `SciPy `__ stack (Linux, Mac OS X, Windows) Python distribution for data analytics and scientific computing. -After running a simple installer, the user will have access to pandas and the +After running the installer, the user will have access to pandas and the rest of the `SciPy `__ stack without needing to install anything else, and without needing to wait for any software to be compiled. @@ -51,9 +51,9 @@ A full list of the packages available as part of the `Anaconda `__ distribution `can be found here `__. -An additional advantage of installing with Anaconda is that you don't require -admin rights to install it, it will install in the user's home directory, and -this also makes it trivial to delete Anaconda at a later date (just delete +Another advantage to installing Anaconda is that you don't need +admin rights to install it. Anaconda can install in the user's home directory, +which makes it trivial to delete Anaconda if you decide (just delete that folder). .. _install.miniconda: @@ -85,9 +85,9 @@ downloading and running the `Miniconda will do this for you. The installer `can be found here `__ -The next step is to create a new conda environment (these are analogous to a -virtualenv but they also allow you to specify precisely which Python version -to install also). Run the following commands from a terminal window:: +The next step is to create a new conda environment. A conda environment is like a +virtualenv that allows you to specify a specific version of Python and set of libraries. 
+Run the following commands from a terminal window:: conda create -n name_of_my_env python @@ -118,8 +118,8 @@ distribution:: conda install anaconda -If you require any packages that are available to pip but not conda, simply -install pip, and use pip to install these packages:: +If you need packages that are available to pip but not conda, then +install pip, and then use pip to install those packages:: conda install pip pip install django @@ -134,15 +134,12 @@ pandas can be installed via pip from pip install pandas -This will likely require the installation of a number of dependencies, -including NumPy, will require a compiler to compile required bits of code, -and can take a few minutes to complete. Installing using your Linux distribution's package manager. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The commands in this table will install pandas for Python 3 from your distribution. -To install pandas for Python 2 you may need to use the package ``python-pandas``. +To install pandas for Python 2, you may need to use the ``python-pandas`` package. .. csv-table:: :header: "Distribution", "Status", "Download / Repository Link", "Install method" @@ -169,9 +166,9 @@ See the :ref:`contributing documentation ` for complete instructio Running the test suite ~~~~~~~~~~~~~~~~~~~~~~ -pandas is equipped with an exhaustive set of unit tests covering about 97% of +pandas is equipped with an exhaustive set of unit tests, covering about 97% of the codebase as of this writing. To run it on your machine to verify that -everything is working (and you have all of the dependencies, soft and hard, +everything is working (and that you have all of the dependencies, soft and hard, installed), make sure you have `pytest `__ and run: @@ -214,8 +211,8 @@ Recommended Dependencies .. note:: - You are highly encouraged to install these libraries, as they provide large speedups, especially - if working with large data sets. + You are highly encouraged to install these libraries, as they provide speed improvements, especially + when working with large data sets. .. 
_install.optional_dependencies: From 695614df5b1f1b5536529c855054bc4cb21c5984 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Wed, 21 Feb 2018 06:30:58 -0500 Subject: [PATCH 151/214] BF: Skip test_read_excel_parse_dates if no xlwt which is used in to_excel (#19803) --- pandas/tests/io/test_excel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index ebb8424b78ed4..4c790a0f0f64a 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -968,6 +968,7 @@ def test_read_excel_chunksize(self): def test_read_excel_parse_dates(self): # GH 11544, 12051 _skip_if_no_openpyxl() + _skip_if_no_xlwt() # for df2.to_excel df = DataFrame( {'col': [1, 2, 3], From eb149cea6ec976a3409f235444982f1026acbe13 Mon Sep 17 00:00:00 2001 From: Eric Chea <5069128+EricChea@users.noreply.github.com> Date: Wed, 21 Feb 2018 06:37:46 -0500 Subject: [PATCH 152/214] DEPR: Add deprecation warning for factorize() order keyword (#19751) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/algorithms.py | 2 ++ pandas/tests/test_algos.py | 9 ++++++++ pandas/tests/util/test_util.py | 14 ++++++++++++ pandas/util/_decorators.py | 38 +++++++++++++++++++++++++++++++-- 5 files changed, 62 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 35856b64c171a..ed3069943bb6a 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -610,6 +610,7 @@ Deprecations - The ``broadcast`` parameter of ``.apply()`` is deprecated in favor of ``result_type='broadcast'`` (:issue:`18577`) - The ``reduce`` parameter of ``.apply()`` is deprecated in favor of ``result_type='reduce'`` (:issue:`18577`) +- The ``order`` parameter of :func:`factorize` is deprecated and will be removed in a future release (:issue:`19727`) .. _whatsnew_0230.prior_deprecations: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c754c063fce8e..624045a3d64bc 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -32,6 +32,7 @@ from pandas.core import common as com from pandas._libs import algos, lib, hashtable as htable from pandas._libs.tslib import iNaT +from pandas.util._decorators import deprecate_kwarg # --------------- # @@ -436,6 +437,7 @@ def isin(comps, values): return f(comps, values) +@deprecate_kwarg(old_arg_name='order', new_arg_name=None) def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): """ Encode input values as an enumerated type or categorical variable diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index b1e3177547ac6..884b1eb7342c6 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -248,6 +248,15 @@ def test_uint64_factorize(self): tm.assert_numpy_array_equal(labels, exp_labels) tm.assert_numpy_array_equal(uniques, exp_uniques) + def test_deprecate_order(self): + # gh 19727 - check warning is raised for deprecated keyword, order. + # Test not valid once order keyword is removed. 
+ data = np.array([2**63, 1, 2**63], dtype=np.uint64) + with tm.assert_produces_warning(expected_warning=FutureWarning): + algos.factorize(data, order=True) + with tm.assert_produces_warning(False): + algos.factorize(data) + class TestUnique(object): diff --git a/pandas/tests/util/test_util.py b/pandas/tests/util/test_util.py index 3b0a428218771..2bc017ef226ce 100644 --- a/pandas/tests/util/test_util.py +++ b/pandas/tests/util/test_util.py @@ -34,9 +34,14 @@ def _f2(new=False): def _f3(new=0): return new + @deprecate_kwarg('old', None) + def _f4(old=True, unchanged=True): + return old + self.f1 = _f1 self.f2 = _f2 self.f3 = _f3 + self.f4 = _f4 def test_deprecate_kwarg(self): x = 78 @@ -72,6 +77,15 @@ def test_bad_deprecate_kwarg(self): def f4(new=None): pass + def test_deprecate_keyword(self): + x = 9 + with tm.assert_produces_warning(FutureWarning): + result = self.f4(old=x) + assert result is x + with tm.assert_produces_warning(None): + result = self.f4(unchanged=x) + assert result is True + def test_rands(): r = tm.rands(10) diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index eed9cee54efb3..1753bc8b8fc33 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -65,8 +65,9 @@ def deprecate_kwarg(old_arg_name, new_arg_name, mapping=None, stacklevel=2): ---------- old_arg_name : str Name of argument in function to deprecate - new_arg_name : str - Name of preferred argument in function + new_arg_name : str or None + Name of preferred argument in function. Use None to raise warning that + ``old_arg_name`` keyword is deprecated. mapping : dict or callable If mapping is present, use it to translate old arguments to new arguments. A callable must do its own value checking; @@ -82,12 +83,15 @@ def deprecate_kwarg(old_arg_name, new_arg_name, mapping=None, stacklevel=2): ... >>> f(columns='should work ok') should work ok + >>> f(cols='should raise warning') FutureWarning: cols is deprecated, use columns instead warnings.warn(msg, FutureWarning) should raise warning + >>> f(cols='should error', columns="can\'t pass do both") TypeError: Can only specify 'cols' or 'columns', not both + >>> @deprecate_kwarg('old', 'new', {'yes': True, 'no': False}) ... def f(new=False): ... print('yes!' if new else 'no!') @@ -96,6 +100,25 @@ def deprecate_kwarg(old_arg_name, new_arg_name, mapping=None, stacklevel=2): FutureWarning: old='yes' is deprecated, use new=True instead warnings.warn(msg, FutureWarning) yes! + + + To raise a warning that a keyword will be removed entirely in the future + + >>> @deprecate_kwarg(old_arg_name='cols', new_arg_name=None) + ... def f(cols='', another_param=''): + ... print(cols) + ... 
+    >>> f(cols='should raise warning')
+    FutureWarning: the 'cols' keyword is deprecated and will be removed in a
+    future version please take steps to stop use of 'cols'
+    should raise warning
+    >>> f(another_param='should not raise warning')
+    should not raise warning
+
+    >>> f(cols='should raise warning', another_param='')
+    FutureWarning: the 'cols' keyword is deprecated and will be removed in a
+    future version please take steps to stop use of 'cols'
+    should raise warning
     """
 
     if mapping is not None and not hasattr(mapping, 'get') and \
@@ -107,6 +130,17 @@ def _deprecate_kwarg(func):
         @wraps(func)
         def wrapper(*args, **kwargs):
             old_arg_value = kwargs.pop(old_arg_name, None)
+
+            if new_arg_name is None and old_arg_value is not None:
+                msg = (
+                    "the '{old_name}' keyword is deprecated and will be "
+                    "removed in a future version "
+                    "please take steps to stop use of '{old_name}'"
+                ).format(old_name=old_arg_name)
+                warnings.warn(msg, FutureWarning, stacklevel=stacklevel)
+                kwargs[old_arg_name] = old_arg_value
+                return func(*args, **kwargs)
+
             if old_arg_value is not None:
                 if mapping is not None:
                     if hasattr(mapping, 'get'):
From dbc601e3f24966e274ec1e9705e13ea95dcadbad Mon Sep 17 00:00:00 2001
From: Noah
Date: Wed, 21 Feb 2018 06:40:21 -0500
Subject: [PATCH 153/214] BUG: drop_duplicates not raising KeyError on missing key (#19730)

---
 doc/source/whatsnew/v0.23.0.txt      |  2 ++
 pandas/core/frame.py                 |  7 +++++++
 pandas/tests/frame/test_analytics.py | 13 +++++++++++++
 3 files changed, 22 insertions(+)

diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index ed3069943bb6a..a4b943f995a33 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -795,6 +795,8 @@ Indexing
 - Bug in :class:`IntervalIndex` where empty and purely NA data was constructed inconsistently depending on the construction method (:issue:`18421`)
 - Bug in :func:`IntervalIndex.symmetric_difference` where the symmetric difference with a non-``IntervalIndex`` did not raise (:issue:`18475`)
 - Bug in :class:`IntervalIndex` where set operations that returned an empty ``IntervalIndex`` had the wrong dtype (:issue:`19101`)
+- Bug in :meth:`DataFrame.drop_duplicates` where no ``KeyError`` is raised when passing in columns that don't exist on the ``DataFrame`` (:issue:`19726`)
+
 
 MultiIndex
 ^^^^^^^^^^
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index efd6814ba04c5..d81d22173bfbd 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -3655,6 +3655,13 @@ def f(vals):
                 isinstance(subset, tuple) and subset in self.columns):
             subset = subset,
 
+        # Verify all columns in subset exist in the queried dataframe
+        # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
+        # key that doesn't exist.
+ diff = Index(subset).difference(self.columns) + if not diff.empty: + raise KeyError(diff) + vals = (col.values for name, col in self.iteritems() if name in subset) labels, shape = map(list, zip(*map(f, vals))) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index b9275fc69e7ff..f2b8387072c8d 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1492,6 +1492,19 @@ def test_drop_duplicates(self): for keep in ['first', 'last', False]: assert df.duplicated(keep=keep).sum() == 0 + @pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']]) + def test_duplicated_with_misspelled_column_name(self, subset): + # GH 19730 + df = pd.DataFrame({'A': [0, 0, 1], + 'B': [0, 0, 1], + 'C': [0, 0, 1]}) + + with pytest.raises(KeyError): + df.duplicated(subset) + + with pytest.raises(KeyError): + df.drop_duplicates(subset) + def test_drop_duplicates_with_duplicate_column_names(self): # GH17836 df = DataFrame([ From aa59954a217c8f856bb0980265520d37b85a80af Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 21 Feb 2018 06:53:20 -0500 Subject: [PATCH 154/214] ASV: excel asv occasional failure (#19811) closes #19779 --- asv_bench/benchmarks/io/excel.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index a7c6c43d15026..58ab6bb8046c5 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -25,13 +25,12 @@ def setup(self, engine): self.writer_read.save() self.bio_read.seek(0) - self.bio_write = BytesIO() - self.bio_write.seek(0) - self.writer_write = ExcelWriter(self.bio_write, engine=engine) - def time_read_excel(self, engine): read_excel(self.bio_read) def time_write_excel(self, engine): - self.df.to_excel(self.writer_write, sheet_name='Sheet1') - self.writer_write.save() + bio_write = BytesIO() + bio_write.seek(0) + writer_write = ExcelWriter(bio_write, engine=engine) + self.df.to_excel(writer_write, sheet_name='Sheet1') + writer_write.save() From cd1b168cf0fe12d3d850e2f98e3e7164c649f274 Mon Sep 17 00:00:00 2001 From: Eric Chea <5069128+EricChea@users.noreply.github.com> Date: Wed, 21 Feb 2018 14:53:08 -0500 Subject: [PATCH 155/214] DOC: Add example of how to preserve order of columns with usecols. (#19746) * Add example of how to preserve order of columns with usecols. * Encase usecols in double back ticks for consistency. Change column names from numeric to string. * Add line to separate examples. --- doc/source/io.rst | 10 ++++++++-- pandas/io/parsers.py | 7 ++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 6120f7d25a0c3..0b9a610b50d7d 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -135,8 +135,14 @@ usecols : array-like or callable, default ``None`` be positional (i.e. integer indices into the document columns) or strings that correspond to column names provided either by the user in `names` or inferred from the document header row(s). For example, a valid array-like - `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. - Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. + `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. + + Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. 
To + instantiate a DataFrame from ``data`` with element order preserved use + ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns + in ``['foo', 'bar']`` order or + ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` for + ``['bar', 'foo']`` order. If callable, the callable function will be evaluated against the column names, returning names where the callable function evaluates to True: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 7ea6d321e0fdd..4b1385514a0c4 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -102,7 +102,12 @@ that correspond to column names provided either by the user in `names` or inferred from the document header row(s). For example, a valid array-like `usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Element - order is ignored, so usecols=[1,0] is the same as [0,1]. + order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. + To instantiate a DataFrame from ``data`` with element order preserved use + ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns + in ``['foo', 'bar']`` order or + ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]`` + for ``['bar', 'foo']`` order. If callable, the callable function will be evaluated against the column names, returning names where the callable function evaluates to True. An From fb28b6e92f0dbac6b9f0b573abddda668382b5b7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 21 Feb 2018 15:50:04 -0800 Subject: [PATCH 156/214] TST: move more series tests to test_arithmetic (#19794) --- pandas/tests/series/test_arithmetic.py | 474 ++++++++++++++++- pandas/tests/series/test_operators.py | 695 +++++-------------------- pandas/tests/series/test_timezones.py | 9 + 3 files changed, 610 insertions(+), 568 deletions(-) diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index f727edf8fb7d8..5b8d9cfab3e0d 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -1,17 +1,26 @@ # -*- coding: utf-8 -*- from datetime import datetime, timedelta import operator +from decimal import Decimal import numpy as np import pytest -from pandas import Series, Timestamp, Period +from pandas import Series, Timestamp, Timedelta, Period, NaT from pandas._libs.tslibs.period import IncompatibleFrequency import pandas as pd import pandas.util.testing as tm +@pytest.fixture +def tdser(): + """ + Return a Series with dtype='timedelta64[ns]', including a NaT. 
+ """ + return Series(['59 Days', '59 Days', 'NaT'], dtype='timedelta64[ns]') + + # ------------------------------------------------------------------ # Comparisons @@ -262,6 +271,97 @@ def test_cmp_series_period_series_mixed_freq(self): # ------------------------------------------------------------------ # Arithmetic +class TestSeriesDivision(object): + # __div__, __rdiv__, __floordiv__, __rfloordiv__ + # for non-timestamp/timedelta/period dtypes + + def test_divide_decimal(self): + # resolves issue GH#9787 + expected = Series([Decimal(5)]) + + ser = Series([Decimal(10)]) + result = ser / Decimal(2) + + tm.assert_series_equal(result, expected) + + ser = Series([Decimal(10)]) + result = ser // Decimal(2) + + tm.assert_series_equal(result, expected) + + def test_div_equiv_binop(self): + # Test Series.div as well as Series.__div__ + # float/integer issue + # GH#7785 + first = Series([1, 0], name='first') + second = Series([-0.01, -0.02], name='second') + expected = Series([-0.01, -np.inf]) + + result = second.div(first) + tm.assert_series_equal(result, expected, check_names=False) + + result = second / first + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('dtype2', [ + np.int64, np.int32, np.int16, np.int8, + np.float64, np.float32, np.float16, + np.uint64, np.uint32, np.uint16, np.uint8]) + @pytest.mark.parametrize('dtype1', [np.int64, np.float64, np.uint64]) + def test_ser_div_ser(self, dtype1, dtype2): + # no longer do integer div for any ops, but deal with the 0's + first = Series([3, 4, 5, 8], name='first').astype(dtype1) + second = Series([0, 0, 0, 3], name='second').astype(dtype2) + + with np.errstate(all='ignore'): + expected = Series(first.values.astype(np.float64) / second.values, + dtype='float64', name=None) + expected.iloc[0:3] = np.inf + + result = first / second + tm.assert_series_equal(result, expected) + assert not result.equals(second / first) + + def test_rdiv_zero_compat(self): + # GH#8674 + zero_array = np.array([0] * 5) + data = np.random.randn(5) + expected = Series([0.] 
* 5) + + result = zero_array / Series(data) + tm.assert_series_equal(result, expected) + + result = Series(zero_array) / data + tm.assert_series_equal(result, expected) + + result = Series(zero_array) / Series(data) + tm.assert_series_equal(result, expected) + + def test_div_zero_inf_signs(self): + # GH#9144, inf signing + ser = Series([-1, 0, 1], name='first') + expected = Series([-np.inf, np.nan, np.inf], name='first') + + result = ser / 0 + tm.assert_series_equal(result, expected) + + def test_rdiv_zero(self): + # GH#9144 + ser = Series([-1, 0, 1], name='first') + expected = Series([0.0, np.nan, 0.0], name='first') + + result = 0 / ser + tm.assert_series_equal(result, expected) + + def test_floordiv_div(self): + # GH#9144 + ser = Series([-1, 0, 1], name='first') + + result = ser // 0 + expected = Series([-np.inf, np.nan, np.inf], name='first') + tm.assert_series_equal(result, expected) + + class TestSeriesArithmetic(object): # Standard, numeric, or otherwise not-Timestamp/Timedelta/Period dtypes @pytest.mark.parametrize('data', [ @@ -316,6 +416,20 @@ def test_series_radd_str(self): tm.assert_series_equal('a' + ser, pd.Series(['ax', np.nan, 'ax'])) tm.assert_series_equal(ser + 'a', pd.Series(['xa', np.nan, 'xa'])) + @pytest.mark.parametrize('dtype', [None, object]) + def test_series_with_dtype_radd_timedelta(self, dtype): + # note this test is _not_ aimed at timedelta64-dtyped Series + ser = pd.Series([pd.Timedelta('1 days'), pd.Timedelta('2 days'), + pd.Timedelta('3 days')], dtype=dtype) + expected = pd.Series([pd.Timedelta('4 days'), pd.Timedelta('5 days'), + pd.Timedelta('6 days')]) + + result = pd.Timedelta('3 days') + ser + tm.assert_series_equal(result, expected) + + result = ser + pd.Timedelta('3 days') + tm.assert_series_equal(result, expected) + class TestPeriodSeriesArithmetic(object): def test_ops_series_timedelta(self): @@ -377,3 +491,361 @@ def test_dt64ser_sub_datetime_dtype(self): ser = Series([ts]) result = pd.to_timedelta(np.abs(ser - dt)) assert result.dtype == 'timedelta64[ns]' + + +class TestTimedeltaSeriesAdditionSubtraction(object): + # Tests for Series[timedelta64[ns]] __add__, __sub__, __radd__, __rsub__ + + # ------------------------------------------------------------------ + # Operations with int-like others + + def test_td64series_add_int_series_invalid(self, tdser): + with pytest.raises(TypeError): + tdser + Series([2, 3, 4]) + + @pytest.mark.xfail(reason='GH#19123 integer interpreted as nanoseconds') + def test_td64series_radd_int_series_invalid(self, tdser): + with pytest.raises(TypeError): + Series([2, 3, 4]) + tdser + + def test_td64series_sub_int_series_invalid(self, tdser): + with pytest.raises(TypeError): + tdser - Series([2, 3, 4]) + + @pytest.mark.xfail(reason='GH#19123 integer interpreted as nanoseconds') + def test_td64series_rsub_int_series_invalid(self, tdser): + with pytest.raises(TypeError): + Series([2, 3, 4]) - tdser + + def test_td64_series_add_intlike(self): + # GH#19123 + tdi = pd.TimedeltaIndex(['59 days', '59 days', 'NaT']) + ser = Series(tdi) + + other = Series([20, 30, 40], dtype='uint8') + + pytest.raises(TypeError, ser.__add__, 1) + pytest.raises(TypeError, ser.__sub__, 1) + + pytest.raises(TypeError, ser.__add__, other) + pytest.raises(TypeError, ser.__sub__, other) + + pytest.raises(TypeError, ser.__add__, other.values) + pytest.raises(TypeError, ser.__sub__, other.values) + + pytest.raises(TypeError, ser.__add__, pd.Index(other)) + pytest.raises(TypeError, ser.__sub__, pd.Index(other)) + + @pytest.mark.parametrize('scalar', [1, 
1.5, np.array(2)]) + def test_td64series_add_sub_numeric_scalar_invalid(self, scalar, tdser): + with pytest.raises(TypeError): + tdser + scalar + with pytest.raises(TypeError): + scalar + tdser + with pytest.raises(TypeError): + tdser - scalar + with pytest.raises(TypeError): + scalar - tdser + + @pytest.mark.parametrize('dtype', ['int64', 'int32', 'int16', + 'uint64', 'uint32', 'uint16', 'uint8', + 'float64', 'float32', 'float16']) + @pytest.mark.parametrize('vector', [ + np.array([1, 2, 3]), + pd.Index([1, 2, 3]), + pytest.param(Series([1, 2, 3]), + marks=pytest.mark.xfail(reason='GH#19123 integer ' + 'interpreted as nanos')) + ]) + def test_td64series_add_sub_numeric_array_invalid(self, vector, + dtype, tdser): + vector = vector.astype(dtype) + with pytest.raises(TypeError): + tdser + vector + with pytest.raises(TypeError): + vector + tdser + with pytest.raises(TypeError): + tdser - vector + with pytest.raises(TypeError): + vector - tdser + + # ------------------------------------------------------------------ + # Operations with datetime-like others + + def test_td64series_add_sub_timestamp(self): + # GH#11925 + tdser = Series(pd.timedelta_range('1 day', periods=3)) + ts = Timestamp('2012-01-01') + expected = Series(pd.date_range('2012-01-02', periods=3)) + tm.assert_series_equal(ts + tdser, expected) + tm.assert_series_equal(tdser + ts, expected) + + expected2 = Series(pd.date_range('2011-12-31', periods=3, freq='-1D')) + tm.assert_series_equal(ts - tdser, expected2) + tm.assert_series_equal(ts + (-tdser), expected2) + + with pytest.raises(TypeError): + tdser - ts + + # ------------------------------------------------------------------ + # Operations with timedelta-like others (including DateOffsets) + + @pytest.mark.parametrize('names', [(None, None, None), + ('Egon', 'Venkman', None), + ('NCC1701D', 'NCC1701D', 'NCC1701D')]) + def test_td64_series_with_tdi(self, names): + # GH#17250 make sure result dtype is correct + # GH#19043 make sure names are propagated correctly + tdi = pd.TimedeltaIndex(['0 days', '1 day'], name=names[0]) + ser = Series([Timedelta(hours=3), Timedelta(hours=4)], name=names[1]) + expected = Series([Timedelta(hours=3), Timedelta(days=1, hours=4)], + name=names[2]) + + result = tdi + ser + tm.assert_series_equal(result, expected) + assert result.dtype == 'timedelta64[ns]' + + result = ser + tdi + tm.assert_series_equal(result, expected) + assert result.dtype == 'timedelta64[ns]' + + expected = Series([Timedelta(hours=-3), Timedelta(days=1, hours=-4)], + name=names[2]) + + result = tdi - ser + tm.assert_series_equal(result, expected) + assert result.dtype == 'timedelta64[ns]' + + result = ser - tdi + tm.assert_series_equal(result, -expected) + assert result.dtype == 'timedelta64[ns]' + + def test_td64_sub_NaT(self): + # GH#18808 + ser = Series([NaT, Timedelta('1s')]) + res = ser - NaT + expected = Series([NaT, NaT], dtype='timedelta64[ns]') + tm.assert_series_equal(res, expected) + + +class TestTimedeltaSeriesMultiplicationDivision(object): + # Tests for Series[timedelta64[ns]] + # __mul__, __rmul__, __div__, __rdiv__, __floordiv__, __rfloordiv__ + + # ------------------------------------------------------------------ + # __floordiv__, __rfloordiv__ + + @pytest.mark.parametrize('scalar_td', [ + timedelta(minutes=5, seconds=4), + Timedelta('5m4s'), + Timedelta('5m4s').to_timedelta64()]) + def test_timedelta_floordiv(self, scalar_td): + # GH#18831 + td1 = Series([timedelta(minutes=5, seconds=3)] * 3) + td1.iloc[2] = np.nan + + result = td1 // scalar_td + expected = 
Series([0, 0, np.nan]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('scalar_td', [ + timedelta(minutes=5, seconds=4), + Timedelta('5m4s'), + Timedelta('5m4s').to_timedelta64()]) + def test_timedelta_rfloordiv(self, scalar_td): + # GH#18831 + td1 = Series([timedelta(minutes=5, seconds=3)] * 3) + td1.iloc[2] = np.nan + result = scalar_td // td1 + expected = Series([1, 1, np.nan]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('scalar_td', [ + timedelta(minutes=5, seconds=4), + Timedelta('5m4s'), + Timedelta('5m4s').to_timedelta64()]) + def test_timedelta_rfloordiv_explicit(self, scalar_td): + # GH#18831 + td1 = Series([timedelta(minutes=5, seconds=3)] * 3) + td1.iloc[2] = np.nan + + # We can test __rfloordiv__ using this syntax, + # see `test_timedelta_rfloordiv` + result = td1.__rfloordiv__(scalar_td) + expected = Series([1, 1, np.nan]) + tm.assert_series_equal(result, expected) + + # ------------------------------------------------------------------ + # Operations with int-like others + + @pytest.mark.parametrize('dtype', ['int64', 'int32', 'int16', + 'uint64', 'uint32', 'uint16', 'uint8', + 'float64', 'float32', 'float16']) + @pytest.mark.parametrize('vector', [np.array([20, 30, 40]), + pd.Index([20, 30, 40]), + Series([20, 30, 40])]) + def test_td64series_div_numeric_array(self, vector, dtype, tdser): + # GH#4521 + # divide/multiply by integers + vector = vector.astype(dtype) + expected = Series(['2.95D', '1D 23H 12m', 'NaT'], + dtype='timedelta64[ns]') + + result = tdser / vector + tm.assert_series_equal(result, expected) + + with pytest.raises(TypeError): + vector / tdser + + @pytest.mark.parametrize('dtype', ['int64', 'int32', 'int16', + 'uint64', 'uint32', 'uint16', 'uint8', + 'float64', 'float32', 'float16']) + @pytest.mark.parametrize('vector', [np.array([20, 30, 40]), + pd.Index([20, 30, 40]), + Series([20, 30, 40])]) + def test_td64series_mul_numeric_array(self, vector, dtype, tdser): + # GH#4521 + # divide/multiply by integers + vector = vector.astype(dtype) + + expected = Series(['1180 Days', '1770 Days', 'NaT'], + dtype='timedelta64[ns]') + + result = tdser * vector + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('dtype', ['int64', 'int32', 'int16', + 'uint64', 'uint32', 'uint16', 'uint8', + 'float64', 'float32', 'float16']) + @pytest.mark.parametrize('vector', [ + np.array([20, 30, 40]), + pytest.param(pd.Index([20, 30, 40]), + marks=pytest.mark.xfail(reason='__mul__ raises ' + 'instead of returning ' + 'NotImplemented')), + Series([20, 30, 40]) + ]) + def test_td64series_rmul_numeric_array(self, vector, dtype, tdser): + # GH#4521 + # divide/multiply by integers + vector = vector.astype(dtype) + + expected = Series(['1180 Days', '1770 Days', 'NaT'], + dtype='timedelta64[ns]') + + result = vector * tdser + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('one', [1, np.array(1), 1.0, np.array(1.0)]) + def test_td64series_mul_numeric_scalar(self, one, tdser): + # GH#4521 + # divide/multiply by integers + expected = Series(['-59 Days', '-59 Days', 'NaT'], + dtype='timedelta64[ns]') + + result = tdser * (-one) + tm.assert_series_equal(result, expected) + result = (-one) * tdser + tm.assert_series_equal(result, expected) + + expected = Series(['118 Days', '118 Days', 'NaT'], + dtype='timedelta64[ns]') + + result = tdser * (2 * one) + tm.assert_series_equal(result, expected) + result = (2 * one) * tdser + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('two', [ + 2, 
2.0, + pytest.param(np.array(2), + marks=pytest.mark.xfail(reason='GH#19011 is_list_like ' + 'incorrectly True.')), + pytest.param(np.array(2.0), + marks=pytest.mark.xfail(reason='GH#19011 is_list_like ' + 'incorrectly True.')), + ]) + def test_td64series_div_numeric_scalar(self, two, tdser): + # GH#4521 + # divide/multiply by integers + expected = Series(['29.5D', '29.5D', 'NaT'], dtype='timedelta64[ns]') + + result = tdser / two + tm.assert_series_equal(result, expected) + + # ------------------------------------------------------------------ + # Operations with timedelta-like others + + @pytest.mark.parametrize('names', [(None, None, None), + ('Egon', 'Venkman', None), + ('NCC1701D', 'NCC1701D', 'NCC1701D')]) + def test_tdi_mul_int_series(self, names): + # GH#19042 + tdi = pd.TimedeltaIndex(['0days', '1day', '2days', '3days', '4days'], + name=names[0]) + ser = Series([0, 1, 2, 3, 4], dtype=np.int64, name=names[1]) + + expected = Series(['0days', '1day', '4days', '9days', '16days'], + dtype='timedelta64[ns]', + name=names[2]) + + result = ser * tdi + tm.assert_series_equal(result, expected) + + # The direct operation tdi * ser still needs to be fixed. + result = ser.__rmul__(tdi) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('names', [(None, None, None), + ('Egon', 'Venkman', None), + ('NCC1701D', 'NCC1701D', 'NCC1701D')]) + def test_float_series_rdiv_tdi(self, names): + # GH#19042 + # TODO: the direct operation TimedeltaIndex / Series still + # needs to be fixed. + tdi = pd.TimedeltaIndex(['0days', '1day', '2days', '3days', '4days'], + name=names[0]) + ser = Series([1.5, 3, 4.5, 6, 7.5], dtype=np.float64, name=names[1]) + + expected = Series([tdi[n] / ser[n] for n in range(len(ser))], + dtype='timedelta64[ns]', + name=names[2]) + + result = ser.__rdiv__(tdi) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('scalar_td', [ + timedelta(minutes=5, seconds=4), + Timedelta('5m4s'), + Timedelta('5m4s').to_timedelta64()]) + def test_td64series_mul_timedeltalike_invalid(self, scalar_td): + td1 = Series([timedelta(minutes=5, seconds=3)] * 3) + td1.iloc[2] = np.nan + + # check that we are getting a TypeError + # with 'operate' (from core/ops.py) for the ops that are not + # defined + pattern = 'operate|unsupported|cannot|not supported' + with tm.assert_raises_regex(TypeError, pattern): + td1 * scalar_td + with tm.assert_raises_regex(TypeError, pattern): + scalar_td * td1 + + +class TestTimedeltaSeriesInvalidArithmeticOps(object): + @pytest.mark.parametrize('scalar_td', [ + timedelta(minutes=5, seconds=4), + Timedelta('5m4s'), + Timedelta('5m4s').to_timedelta64()]) + def test_td64series_pow_invalid(self, scalar_td): + td1 = Series([timedelta(minutes=5, seconds=3)] * 3) + td1.iloc[2] = np.nan + + # check that we are getting a TypeError + # with 'operate' (from core/ops.py) for the ops that are not + # defined + pattern = 'operate|unsupported|cannot|not supported' + with tm.assert_raises_regex(TypeError, pattern): + scalar_td ** td1 + with tm.assert_raises_regex(TypeError, pattern): + td1 ** scalar_td diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 554b3e15d8f10..f90fcce973f00 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -9,7 +9,7 @@ import operator from itertools import product, starmap -from numpy import nan, inf +from numpy import nan import numpy as np import pandas as pd @@ -29,11 +29,6 @@ from .common import TestData -@pytest.fixture -def 
tdser(): - return Series(['59 Days', '59 Days', 'NaT'], dtype='timedelta64[ns]') - - class TestSeriesComparisons(object): def test_series_comparison_scalars(self): series = Series(date_range('1/1/2000', periods=10)) @@ -579,291 +574,7 @@ def test_comp_ops_df_compat(self): left.to_frame() < right.to_frame() -class TestSeriesArithmetic(object): - def test_divide_decimal(self): - """ resolves issue #9787 """ - from decimal import Decimal - - expected = Series([Decimal(5)]) - - s = Series([Decimal(10)]) - s = s / Decimal(2) - - assert_series_equal(expected, s) - - s = Series([Decimal(10)]) - s = s // Decimal(2) - - assert_series_equal(expected, s) - - @pytest.mark.parametrize( - 'dtype2', - [ - np.int64, np.int32, np.int16, np.int8, - np.float64, np.float32, np.float16, - np.uint64, np.uint32, - np.uint16, np.uint8 - ]) - @pytest.mark.parametrize('dtype1', [np.int64, np.float64, np.uint64]) - def test_ser_div_ser(self, dtype1, dtype2): - # no longer do integer div for any ops, but deal with the 0's - first = Series([3, 4, 5, 8], name='first').astype(dtype1) - second = Series([0, 0, 0, 3], name='second').astype(dtype2) - - with np.errstate(all='ignore'): - expected = Series(first.values.astype(np.float64) / second.values, - dtype='float64', name=None) - expected.iloc[0:3] = np.inf - - result = first / second - assert_series_equal(result, expected) - assert not result.equals(second / first) - - def test_div_equiv_binop(self): - # Test Series.div as well as Series.__div__ - # float/integer issue - # GH#7785 - first = pd.Series([1, 0], name='first') - second = pd.Series([-0.01, -0.02], name='second') - expected = Series([-0.01, -np.inf]) - - result = second.div(first) - assert_series_equal(result, expected, check_names=False) - - result = second / first - assert_series_equal(result, expected) - - def test_rdiv_zero_compat(self): - # GH#8674 - zero_array = np.array([0] * 5) - data = np.random.randn(5) - expected = pd.Series([0.] 
* 5) - - result = zero_array / pd.Series(data) - assert_series_equal(result, expected) - - result = pd.Series(zero_array) / data - assert_series_equal(result, expected) - - result = pd.Series(zero_array) / pd.Series(data) - assert_series_equal(result, expected) - - def test_div_zero_inf_signs(self): - # GH#9144, inf signing - ser = Series([-1, 0, 1], name='first') - expected = Series([-np.inf, np.nan, np.inf], name='first') - - result = ser / 0 - assert_series_equal(result, expected) - - def test_rdiv_zero(self): - # GH#9144 - ser = Series([-1, 0, 1], name='first') - expected = Series([0.0, np.nan, 0.0], name='first') - - result = 0 / ser - assert_series_equal(result, expected) - - def test_floordiv_div(self): - # GH#9144 - ser = Series([-1, 0, 1], name='first') - - result = ser // 0 - expected = Series([-inf, nan, inf], name='first') - assert_series_equal(result, expected) - - -class TestTimedeltaSeriesArithmeticWithIntegers(object): - # Tests for Series with dtype 'timedelta64[ns]' arithmetic operations - # with integer and int-like others - - # ------------------------------------------------------------------ - # Addition and Subtraction - - def test_td64series_add_int_series_invalid(self, tdser): - with pytest.raises(TypeError): - tdser + Series([2, 3, 4]) - - @pytest.mark.xfail(reason='GH#19123 integer interpreted as nanoseconds') - def test_td64series_radd_int_series_invalid(self, tdser): - with pytest.raises(TypeError): - Series([2, 3, 4]) + tdser - - def test_td64series_sub_int_series_invalid(self, tdser): - with pytest.raises(TypeError): - tdser - Series([2, 3, 4]) - - @pytest.mark.xfail(reason='GH#19123 integer interpreted as nanoseconds') - def test_td64series_rsub_int_series_invalid(self, tdser): - with pytest.raises(TypeError): - Series([2, 3, 4]) - tdser - - def test_td64_series_add_intlike(self): - # GH#19123 - tdi = pd.TimedeltaIndex(['59 days', '59 days', 'NaT']) - ser = Series(tdi) - - other = Series([20, 30, 40], dtype='uint8') - - pytest.raises(TypeError, ser.__add__, 1) - pytest.raises(TypeError, ser.__sub__, 1) - - pytest.raises(TypeError, ser.__add__, other) - pytest.raises(TypeError, ser.__sub__, other) - - pytest.raises(TypeError, ser.__add__, other.values) - pytest.raises(TypeError, ser.__sub__, other.values) - - pytest.raises(TypeError, ser.__add__, pd.Index(other)) - pytest.raises(TypeError, ser.__sub__, pd.Index(other)) - - @pytest.mark.parametrize('scalar', [1, 1.5, np.array(2)]) - def test_td64series_add_sub_numeric_scalar_invalid(self, scalar, tdser): - with pytest.raises(TypeError): - tdser + scalar - with pytest.raises(TypeError): - scalar + tdser - with pytest.raises(TypeError): - tdser - scalar - with pytest.raises(TypeError): - scalar - tdser - - @pytest.mark.parametrize('dtype', ['int64', 'int32', 'int16', - 'uint64', 'uint32', 'uint16', 'uint8', - 'float64', 'float32', 'float16']) - @pytest.mark.parametrize('vector', [ - np.array([1, 2, 3]), - pd.Index([1, 2, 3]), - pytest.param(Series([1, 2, 3]), - marks=pytest.mark.xfail(reason='GH#19123 integer ' - 'interpreted as nanos')) - ]) - def test_td64series_add_sub_numeric_array_invalid(self, vector, - dtype, tdser): - vector = vector.astype(dtype) - with pytest.raises(TypeError): - tdser + vector - with pytest.raises(TypeError): - vector + tdser - with pytest.raises(TypeError): - tdser - vector - with pytest.raises(TypeError): - vector - tdser - - # ------------------------------------------------------------------ - # Multiplicaton and Division - - @pytest.mark.parametrize('dtype', ['int64', 'int32', 
'int16', - 'uint64', 'uint32', 'uint16', 'uint8', - 'float64', 'float32', 'float16']) - @pytest.mark.parametrize('vector', [np.array([20, 30, 40]), - pd.Index([20, 30, 40]), - Series([20, 30, 40])]) - def test_td64series_div_numeric_array(self, vector, dtype, tdser): - # GH 4521 - # divide/multiply by integers - vector = vector.astype(dtype) - expected = Series(['2.95D', '1D 23H 12m', 'NaT'], - dtype='timedelta64[ns]') - - result = tdser / vector - assert_series_equal(result, expected) - - with pytest.raises(TypeError): - vector / tdser - - @pytest.mark.parametrize('dtype', ['int64', 'int32', 'int16', - 'uint64', 'uint32', 'uint16', 'uint8', - 'float64', 'float32', 'float16']) - @pytest.mark.parametrize('vector', [np.array([20, 30, 40]), - pd.Index([20, 30, 40]), - Series([20, 30, 40])]) - def test_td64series_mul_numeric_array(self, vector, dtype, tdser): - # GH 4521 - # divide/multiply by integers - vector = vector.astype(dtype) - - expected = Series(['1180 Days', '1770 Days', 'NaT'], - dtype='timedelta64[ns]') - - result = tdser * vector - assert_series_equal(result, expected) - - @pytest.mark.parametrize('dtype', ['int64', 'int32', 'int16', - 'uint64', 'uint32', 'uint16', 'uint8', - 'float64', 'float32', 'float16']) - @pytest.mark.parametrize('vector', [ - np.array([20, 30, 40]), - pytest.param(pd.Index([20, 30, 40]), - marks=pytest.mark.xfail(reason='__mul__ raises ' - 'instead of returning ' - 'NotImplemented')), - Series([20, 30, 40]) - ]) - def test_td64series_rmul_numeric_array(self, vector, dtype, tdser): - # GH 4521 - # divide/multiply by integers - vector = vector.astype(dtype) - - expected = Series(['1180 Days', '1770 Days', 'NaT'], - dtype='timedelta64[ns]') - - result = vector * tdser - assert_series_equal(result, expected) - - @pytest.mark.parametrize('one', [1, np.array(1), 1.0, np.array(1.0)]) - def test_td64series_mul_numeric_scalar(self, one, tdser): - # GH 4521 - # divide/multiply by integers - expected = Series(['-59 Days', '-59 Days', 'NaT'], - dtype='timedelta64[ns]') - - result = tdser * (-one) - assert_series_equal(result, expected) - result = (-one) * tdser - assert_series_equal(result, expected) - - expected = Series(['118 Days', '118 Days', 'NaT'], - dtype='timedelta64[ns]') - - result = tdser * (2 * one) - assert_series_equal(result, expected) - result = (2 * one) * tdser - assert_series_equal(result, expected) - - @pytest.mark.parametrize('two', [ - 2, 2.0, - pytest.param(np.array(2), - marks=pytest.mark.xfail(reason='GH#19011 is_list_like ' - 'incorrectly True.')), - pytest.param(np.array(2.0), - marks=pytest.mark.xfail(reason='GH#19011 is_list_like ' - 'incorrectly True.')), - ]) - def test_td64series_div_numeric_scalar(self, two, tdser): - # GH 4521 - # divide/multiply by integers - expected = Series(['29.5D', '29.5D', 'NaT'], dtype='timedelta64[ns]') - - result = tdser / two - assert_series_equal(result, expected) - - class TestTimedeltaSeriesArithmetic(object): - def test_td64series_add_sub_timestamp(self): - # GH11925 - tdser = Series(timedelta_range('1 day', periods=3)) - ts = Timestamp('2012-01-01') - expected = Series(date_range('2012-01-02', periods=3)) - assert_series_equal(ts + tdser, expected) - assert_series_equal(tdser + ts, expected) - - expected2 = Series(date_range('2011-12-31', periods=3, freq='-1D')) - assert_series_equal(ts - tdser, expected2) - assert_series_equal(ts + (-tdser), expected2) - - with pytest.raises(TypeError): - tdser - ts def test_timedelta64_operations_with_DateOffset(self): # GH 10699 @@ -1081,13 +792,6 @@ def 
test_timedelta64_ops_nat(self): assert_series_equal(timedelta_series / nan, nat_series_dtype_timedelta) - def test_td64_sub_NaT(self): - # GH#18808 - ser = Series([NaT, Timedelta('1s')]) - res = ser - NaT - expected = Series([NaT, NaT], dtype='timedelta64[ns]') - tm.assert_series_equal(res, expected) - @pytest.mark.parametrize('scalar_td', [timedelta(minutes=5, seconds=4), Timedelta(minutes=5, seconds=4), Timedelta('5m4s').to_timedelta64()]) @@ -1103,135 +807,6 @@ def test_operators_timedelta64_with_timedelta(self, scalar_td): td1 / scalar_td scalar_td / td1 - @pytest.mark.parametrize('scalar_td', [ - timedelta(minutes=5, seconds=4), - Timedelta('5m4s'), - Timedelta('5m4s').to_timedelta64()]) - def test_operators_timedelta64_with_timedelta_invalid(self, scalar_td): - td1 = Series([timedelta(minutes=5, seconds=3)] * 3) - td1.iloc[2] = np.nan - - # check that we are getting a TypeError - # with 'operate' (from core/ops.py) for the ops that are not - # defined - pattern = 'operate|unsupported|cannot|not supported' - with tm.assert_raises_regex(TypeError, pattern): - td1 * scalar_td - with tm.assert_raises_regex(TypeError, pattern): - scalar_td * td1 - with tm.assert_raises_regex(TypeError, pattern): - scalar_td ** td1 - with tm.assert_raises_regex(TypeError, pattern): - td1 ** scalar_td - - @pytest.mark.parametrize('scalar_td', [ - timedelta(minutes=5, seconds=4), - Timedelta('5m4s'), - Timedelta('5m4s').to_timedelta64()]) - def test_timedelta_rfloordiv(self, scalar_td): - # GH#18831 - td1 = Series([timedelta(minutes=5, seconds=3)] * 3) - td1.iloc[2] = np.nan - result = scalar_td // td1 - expected = Series([1, 1, np.nan]) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('scalar_td', [ - timedelta(minutes=5, seconds=4), - Timedelta('5m4s'), - Timedelta('5m4s').to_timedelta64()]) - def test_timedelta_rfloordiv_explicit(self, scalar_td): - # GH#18831 - td1 = Series([timedelta(minutes=5, seconds=3)] * 3) - td1.iloc[2] = np.nan - - # We can test __rfloordiv__ using this syntax, - # see `test_timedelta_rfloordiv` - result = td1.__rfloordiv__(scalar_td) - expected = Series([1, 1, np.nan]) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('scalar_td', [ - timedelta(minutes=5, seconds=4), - Timedelta('5m4s'), - Timedelta('5m4s').to_timedelta64()]) - def test_timedelta_floordiv(self, scalar_td): - # GH#18831 - td1 = Series([timedelta(minutes=5, seconds=3)] * 3) - td1.iloc[2] = np.nan - - result = td1 // scalar_td - expected = Series([0, 0, np.nan]) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('names', [(None, None, None), - ('Egon', 'Venkman', None), - ('NCC1701D', 'NCC1701D', 'NCC1701D')]) - def test_td64_series_with_tdi(self, names): - # GH#17250 make sure result dtype is correct - # GH#19043 make sure names are propagated correctly - tdi = pd.TimedeltaIndex(['0 days', '1 day'], name=names[0]) - ser = Series([Timedelta(hours=3), Timedelta(hours=4)], name=names[1]) - expected = Series([Timedelta(hours=3), Timedelta(days=1, hours=4)], - name=names[2]) - - result = tdi + ser - tm.assert_series_equal(result, expected) - assert result.dtype == 'timedelta64[ns]' - - result = ser + tdi - tm.assert_series_equal(result, expected) - assert result.dtype == 'timedelta64[ns]' - - expected = Series([Timedelta(hours=-3), Timedelta(days=1, hours=-4)], - name=names[2]) - - result = tdi - ser - tm.assert_series_equal(result, expected) - assert result.dtype == 'timedelta64[ns]' - - result = ser - tdi - tm.assert_series_equal(result, -expected) - 
assert result.dtype == 'timedelta64[ns]' - - @pytest.mark.parametrize('names', [(None, None, None), - ('Egon', 'Venkman', None), - ('NCC1701D', 'NCC1701D', 'NCC1701D')]) - def test_tdi_mul_int_series(self, names): - # GH#19042 - tdi = pd.TimedeltaIndex(['0days', '1day', '2days', '3days', '4days'], - name=names[0]) - ser = Series([0, 1, 2, 3, 4], dtype=np.int64, name=names[1]) - - expected = Series(['0days', '1day', '4days', '9days', '16days'], - dtype='timedelta64[ns]', - name=names[2]) - - result = ser * tdi - tm.assert_series_equal(result, expected) - - # The direct operation tdi * ser still needs to be fixed. - result = ser.__rmul__(tdi) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('names', [(None, None, None), - ('Egon', 'Venkman', None), - ('NCC1701D', 'NCC1701D', 'NCC1701D')]) - def test_float_series_rdiv_tdi(self, names): - # GH#19042 - # TODO: the direct operation TimedeltaIndex / Series still - # needs to be fixed. - tdi = pd.TimedeltaIndex(['0days', '1day', '2days', '3days', '4days'], - name=names[0]) - ser = Series([1.5, 3, 4.5, 6, 7.5], dtype=np.float64, name=names[1]) - - expected = Series([tdi[n] / ser[n] for n in range(len(ser))], - dtype='timedelta64[ns]', - name=names[2]) - - result = ser.__rdiv__(tdi) - tm.assert_series_equal(result, expected) - class TestDatetimeSeriesArithmetic(object): @pytest.mark.parametrize( @@ -1994,138 +1569,6 @@ def test_operators_reverse_object(self, op): expected = op(1., arr.astype(float)) assert_series_equal(result.astype(float), expected) - def test_arith_ops_df_compat(self): - # GH 1134 - s1 = pd.Series([1, 2, 3], index=list('ABC'), name='x') - s2 = pd.Series([2, 2, 2], index=list('ABD'), name='x') - - exp = pd.Series([3.0, 4.0, np.nan, np.nan], - index=list('ABCD'), name='x') - assert_series_equal(s1 + s2, exp) - assert_series_equal(s2 + s1, exp) - - exp = pd.DataFrame({'x': [3.0, 4.0, np.nan, np.nan]}, - index=list('ABCD')) - assert_frame_equal(s1.to_frame() + s2.to_frame(), exp) - assert_frame_equal(s2.to_frame() + s1.to_frame(), exp) - - # different length - s3 = pd.Series([1, 2, 3], index=list('ABC'), name='x') - s4 = pd.Series([2, 2, 2, 2], index=list('ABCD'), name='x') - - exp = pd.Series([3, 4, 5, np.nan], - index=list('ABCD'), name='x') - assert_series_equal(s3 + s4, exp) - assert_series_equal(s4 + s3, exp) - - exp = pd.DataFrame({'x': [3, 4, 5, np.nan]}, - index=list('ABCD')) - assert_frame_equal(s3.to_frame() + s4.to_frame(), exp) - assert_frame_equal(s4.to_frame() + s3.to_frame(), exp) - - def test_bool_ops_df_compat(self): - # GH 1134 - s1 = pd.Series([True, False, True], index=list('ABC'), name='x') - s2 = pd.Series([True, True, False], index=list('ABD'), name='x') - - exp = pd.Series([True, False, False, False], - index=list('ABCD'), name='x') - assert_series_equal(s1 & s2, exp) - assert_series_equal(s2 & s1, exp) - - # True | np.nan => True - exp = pd.Series([True, True, True, False], - index=list('ABCD'), name='x') - assert_series_equal(s1 | s2, exp) - # np.nan | True => np.nan, filled with False - exp = pd.Series([True, True, False, False], - index=list('ABCD'), name='x') - assert_series_equal(s2 | s1, exp) - - # DataFrame doesn't fill nan with False - exp = pd.DataFrame({'x': [True, False, np.nan, np.nan]}, - index=list('ABCD')) - assert_frame_equal(s1.to_frame() & s2.to_frame(), exp) - assert_frame_equal(s2.to_frame() & s1.to_frame(), exp) - - exp = pd.DataFrame({'x': [True, True, np.nan, np.nan]}, - index=list('ABCD')) - assert_frame_equal(s1.to_frame() | s2.to_frame(), exp) - 
assert_frame_equal(s2.to_frame() | s1.to_frame(), exp) - - # different length - s3 = pd.Series([True, False, True], index=list('ABC'), name='x') - s4 = pd.Series([True, True, True, True], index=list('ABCD'), name='x') - - exp = pd.Series([True, False, True, False], - index=list('ABCD'), name='x') - assert_series_equal(s3 & s4, exp) - assert_series_equal(s4 & s3, exp) - - # np.nan | True => np.nan, filled with False - exp = pd.Series([True, True, True, False], - index=list('ABCD'), name='x') - assert_series_equal(s3 | s4, exp) - # True | np.nan => True - exp = pd.Series([True, True, True, True], - index=list('ABCD'), name='x') - assert_series_equal(s4 | s3, exp) - - exp = pd.DataFrame({'x': [True, False, True, np.nan]}, - index=list('ABCD')) - assert_frame_equal(s3.to_frame() & s4.to_frame(), exp) - assert_frame_equal(s4.to_frame() & s3.to_frame(), exp) - - exp = pd.DataFrame({'x': [True, True, True, np.nan]}, - index=list('ABCD')) - assert_frame_equal(s3.to_frame() | s4.to_frame(), exp) - assert_frame_equal(s4.to_frame() | s3.to_frame(), exp) - - def test_series_frame_radd_bug(self): - # GH 353 - vals = Series(tm.rands_array(5, 10)) - result = 'foo_' + vals - expected = vals.map(lambda x: 'foo_' + x) - assert_series_equal(result, expected) - - frame = DataFrame({'vals': vals}) - result = 'foo_' + frame - expected = DataFrame({'vals': vals.map(lambda x: 'foo_' + x)}) - assert_frame_equal(result, expected) - - # really raise this time - with pytest.raises(TypeError): - datetime.now() + self.ts - - with pytest.raises(TypeError): - self.ts + datetime.now() - - @pytest.mark.parametrize('dtype', [None, object]) - def test_series_with_dtype_radd_timedelta(self, dtype): - ser = pd.Series([pd.Timedelta('1 days'), pd.Timedelta('2 days'), - pd.Timedelta('3 days')], dtype=dtype) - expected = pd.Series([pd.Timedelta('4 days'), pd.Timedelta('5 days'), - pd.Timedelta('6 days')]) - - result = pd.Timedelta('3 days') + ser - assert_series_equal(result, expected) - - result = ser + pd.Timedelta('3 days') - assert_series_equal(result, expected) - - def test_operators_frame(self): - # rpow does not work with DataFrame - df = DataFrame({'A': self.ts}) - - assert_series_equal(self.ts + self.ts, self.ts + df['A'], - check_names=False) - assert_series_equal(self.ts ** self.ts, self.ts ** df['A'], - check_names=False) - assert_series_equal(self.ts < self.ts, self.ts < df['A'], - check_names=False) - assert_series_equal(self.ts / self.ts, self.ts / df['A'], - check_names=False) - def test_operators_combine(self): def _check_fill(meth, op, a, b, fill_value=0): exp_index = a.index.union(b.index) @@ -2231,15 +1674,6 @@ def test_datetime64_with_index(self): df['result'] = df['date'] - df.index assert_series_equal(df['result'], df['expected'], check_names=False) - def test_dti_tz_convert_to_utc(self): - base = pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], - tz='UTC') - idx1 = base.tz_convert('Asia/Tokyo')[:2] - idx2 = base.tz_convert('US/Eastern')[1:] - - res = Series([1, 2], index=idx1) + Series([1, 1], index=idx2) - assert_series_equal(res, Series([np.nan, 3, np.nan], index=base)) - def test_op_duplicate_index(self): # GH14227 s1 = Series([1, 2], index=[1, 1]) @@ -2294,3 +1728,130 @@ def test_idxminmax_with_inf(self): assert np.isnan(s.idxmin(skipna=False)) assert s.idxmax() == 0 np.isnan(s.idxmax(skipna=False)) + + +class TestSeriesOperationsDataFrameCompat(object): + def test_operators_frame(self): + # rpow does not work with DataFrame + ts = tm.makeTimeSeries() + ts.name = 'ts' + + df = 
DataFrame({'A': ts}) + + assert_series_equal(ts + ts, ts + df['A'], + check_names=False) + assert_series_equal(ts ** ts, ts ** df['A'], + check_names=False) + assert_series_equal(ts < ts, ts < df['A'], + check_names=False) + assert_series_equal(ts / ts, ts / df['A'], + check_names=False) + + def test_series_frame_radd_bug(self): + # GH#353 + vals = Series(tm.rands_array(5, 10)) + result = 'foo_' + vals + expected = vals.map(lambda x: 'foo_' + x) + assert_series_equal(result, expected) + + frame = DataFrame({'vals': vals}) + result = 'foo_' + frame + expected = DataFrame({'vals': vals.map(lambda x: 'foo_' + x)}) + assert_frame_equal(result, expected) + + ts = tm.makeTimeSeries() + ts.name = 'ts' + + # really raise this time + with pytest.raises(TypeError): + datetime.now() + ts + + with pytest.raises(TypeError): + ts + datetime.now() + + def test_bool_ops_df_compat(self): + # GH 1134 + s1 = pd.Series([True, False, True], index=list('ABC'), name='x') + s2 = pd.Series([True, True, False], index=list('ABD'), name='x') + + exp = pd.Series([True, False, False, False], + index=list('ABCD'), name='x') + assert_series_equal(s1 & s2, exp) + assert_series_equal(s2 & s1, exp) + + # True | np.nan => True + exp = pd.Series([True, True, True, False], + index=list('ABCD'), name='x') + assert_series_equal(s1 | s2, exp) + # np.nan | True => np.nan, filled with False + exp = pd.Series([True, True, False, False], + index=list('ABCD'), name='x') + assert_series_equal(s2 | s1, exp) + + # DataFrame doesn't fill nan with False + exp = pd.DataFrame({'x': [True, False, np.nan, np.nan]}, + index=list('ABCD')) + assert_frame_equal(s1.to_frame() & s2.to_frame(), exp) + assert_frame_equal(s2.to_frame() & s1.to_frame(), exp) + + exp = pd.DataFrame({'x': [True, True, np.nan, np.nan]}, + index=list('ABCD')) + assert_frame_equal(s1.to_frame() | s2.to_frame(), exp) + assert_frame_equal(s2.to_frame() | s1.to_frame(), exp) + + # different length + s3 = pd.Series([True, False, True], index=list('ABC'), name='x') + s4 = pd.Series([True, True, True, True], index=list('ABCD'), name='x') + + exp = pd.Series([True, False, True, False], + index=list('ABCD'), name='x') + assert_series_equal(s3 & s4, exp) + assert_series_equal(s4 & s3, exp) + + # np.nan | True => np.nan, filled with False + exp = pd.Series([True, True, True, False], + index=list('ABCD'), name='x') + assert_series_equal(s3 | s4, exp) + # True | np.nan => True + exp = pd.Series([True, True, True, True], + index=list('ABCD'), name='x') + assert_series_equal(s4 | s3, exp) + + exp = pd.DataFrame({'x': [True, False, True, np.nan]}, + index=list('ABCD')) + assert_frame_equal(s3.to_frame() & s4.to_frame(), exp) + assert_frame_equal(s4.to_frame() & s3.to_frame(), exp) + + exp = pd.DataFrame({'x': [True, True, True, np.nan]}, + index=list('ABCD')) + assert_frame_equal(s3.to_frame() | s4.to_frame(), exp) + assert_frame_equal(s4.to_frame() | s3.to_frame(), exp) + + def test_arith_ops_df_compat(self): + # GH#1134 + s1 = pd.Series([1, 2, 3], index=list('ABC'), name='x') + s2 = pd.Series([2, 2, 2], index=list('ABD'), name='x') + + exp = pd.Series([3.0, 4.0, np.nan, np.nan], + index=list('ABCD'), name='x') + assert_series_equal(s1 + s2, exp) + assert_series_equal(s2 + s1, exp) + + exp = pd.DataFrame({'x': [3.0, 4.0, np.nan, np.nan]}, + index=list('ABCD')) + assert_frame_equal(s1.to_frame() + s2.to_frame(), exp) + assert_frame_equal(s2.to_frame() + s1.to_frame(), exp) + + # different length + s3 = pd.Series([1, 2, 3], index=list('ABC'), name='x') + s4 = pd.Series([2, 2, 2, 2], 
index=list('ABCD'), name='x') + + exp = pd.Series([3, 4, 5, np.nan], + index=list('ABCD'), name='x') + assert_series_equal(s3 + s4, exp) + assert_series_equal(s4 + s3, exp) + + exp = pd.DataFrame({'x': [3, 4, 5, np.nan]}, + index=list('ABCD')) + assert_frame_equal(s3.to_frame() + s4.to_frame(), exp) + assert_frame_equal(s4.to_frame() + s3.to_frame(), exp) diff --git a/pandas/tests/series/test_timezones.py b/pandas/tests/series/test_timezones.py index 2e15c964e4e93..b54645d04bd1a 100644 --- a/pandas/tests/series/test_timezones.py +++ b/pandas/tests/series/test_timezones.py @@ -88,6 +88,15 @@ def test_series_tz_convert(self): tm.assert_raises_regex(TypeError, "Cannot convert tz-naive", ts.tz_convert, 'US/Eastern') + def test_series_tz_convert_to_utc(self): + base = DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], + tz='UTC') + idx1 = base.tz_convert('Asia/Tokyo')[:2] + idx2 = base.tz_convert('US/Eastern')[1:] + + res = Series([1, 2], index=idx1) + Series([1, 1], index=idx2) + tm.assert_series_equal(res, Series([np.nan, 3, np.nan], index=base)) + # ----------------------------------------------------------------- # Series.append From 80241e6d4b469aa55f6105d15d0a4176718bcbaa Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 21 Feb 2018 15:55:21 -0800 Subject: [PATCH 157/214] Fix name setting in DTI/TDI __add__ and __sub__ (#19744) --- doc/source/whatsnew/v0.23.0.txt | 2 + pandas/core/common.py | 15 ---- pandas/core/indexes/datetimelike.py | 51 +++++++----- pandas/core/indexes/datetimes.py | 36 +++++---- pandas/core/indexes/period.py | 4 +- pandas/core/indexes/timedeltas.py | 34 +++++--- pandas/core/ops.py | 78 +++++++++++++++---- pandas/core/series.py | 6 +- .../indexes/datetimes/test_arithmetic.py | 44 +++++++++-- .../indexes/timedeltas/test_arithmetic.py | 75 +++++++++++------- pandas/tests/test_common.py | 13 ++-- 11 files changed, 240 insertions(+), 118 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index a4b943f995a33..c9951e0ec4378 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -731,6 +731,8 @@ Datetimelike - Bug in :class:`Timestamp` and :func:`to_datetime` where a string representing a barely out-of-bounds timestamp would be incorrectly rounded down instead of raising ``OutOfBoundsDatetime`` (:issue:`19382`) - Bug in :func:`Timestamp.floor` :func:`DatetimeIndex.floor` where time stamps far in the future and past were not rounded correctly (:issue:`19206`) - Bug in :func:`to_datetime` where passing an out-of-bounds datetime with ``errors='coerce'`` and ``utc=True`` would raise ``OutOfBoundsDatetime`` instead of parsing to ``NaT`` (:issue:`19612`) +- Bug in :class:`DatetimeIndex` and :class:`TimedeltaIndex` addition and subtraction where name of the returned object was not always set consistently. 
(:issue:`19744`) +- Timedelta ^^^^^^^^^ diff --git a/pandas/core/common.py b/pandas/core/common.py index 6748db825acf0..77dc1522052d4 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -121,21 +121,6 @@ def _consensus_name_attr(objs): return name -def _maybe_match_name(a, b): - a_has = hasattr(a, 'name') - b_has = hasattr(b, 'name') - if a_has and b_has: - if a.name == b.name: - return a.name - else: - return None - elif a_has: - return a.name - elif b_has: - return b.name - return None - - def _get_info_slice(obj, indexer): """Slice the info axis of `obj` with `indexer`.""" if not hasattr(obj, '_info_axis_number'): diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index c98f8ceea0ffa..187f9fcf52dd4 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -29,7 +29,7 @@ from pandas.core.dtypes.generic import ( ABCIndex, ABCSeries, ABCPeriodIndex, ABCIndexClass) from pandas.core.dtypes.missing import isna -from pandas.core import common as com, algorithms +from pandas.core import common as com, algorithms, ops from pandas.core.algorithms import checked_add_with_arr from pandas.errors import NullFrequencyError import pandas.io.formats.printing as printing @@ -661,29 +661,37 @@ def __add__(self, other): if isinstance(other, ABCSeries): return NotImplemented elif is_timedelta64_dtype(other): - return self._add_delta(other) + result = self._add_delta(other) elif isinstance(other, (DateOffset, timedelta)): - return self._add_delta(other) + result = self._add_delta(other) elif is_offsetlike(other): # Array/Index of DateOffset objects - return self._add_offset_array(other) + result = self._add_offset_array(other) elif isinstance(self, TimedeltaIndex) and isinstance(other, Index): if hasattr(other, '_add_delta'): - return other._add_delta(self) - raise TypeError("cannot add TimedeltaIndex and {typ}" - .format(typ=type(other))) + result = other._add_delta(self) + else: + raise TypeError("cannot add TimedeltaIndex and {typ}" + .format(typ=type(other))) elif is_integer(other): - return self.shift(other) + # This check must come after the check for timedelta64_dtype + # or else it will incorrectly catch np.timedelta64 objects + result = self.shift(other) elif isinstance(other, (datetime, np.datetime64)): - return self._add_datelike(other) + result = self._add_datelike(other) elif isinstance(other, Index): - return self._add_datelike(other) + result = self._add_datelike(other) elif is_integer_dtype(other) and self.freq is None: # GH#19123 raise NullFrequencyError("Cannot shift with no freq") else: # pragma: no cover return NotImplemented + if result is not NotImplemented: + res_name = ops.get_op_result_name(self, other) + result.name = res_name + return result + cls.__add__ = __add__ cls.__radd__ = __add__ @@ -697,25 +705,27 @@ def __sub__(self, other): if isinstance(other, ABCSeries): return NotImplemented elif is_timedelta64_dtype(other): - return self._add_delta(-other) + result = self._add_delta(-other) elif isinstance(other, (DateOffset, timedelta)): - return self._add_delta(-other) + result = self._add_delta(-other) elif is_offsetlike(other): # Array/Index of DateOffset objects - return self._sub_offset_array(other) + result = self._sub_offset_array(other) elif isinstance(self, TimedeltaIndex) and isinstance(other, Index): if not isinstance(other, TimedeltaIndex): raise TypeError("cannot subtract TimedeltaIndex and {typ}" .format(typ=type(other).__name__)) - return self._add_delta(-other) + result = 
self._add_delta(-other) elif isinstance(other, DatetimeIndex): - return self._sub_datelike(other) + result = self._sub_datelike(other) elif is_integer(other): - return self.shift(-other) + # This check must come after the check for timedelta64_dtype + # or else it will incorrectly catch np.timedelta64 objects + result = self.shift(-other) elif isinstance(other, (datetime, np.datetime64)): - return self._sub_datelike(other) + result = self._sub_datelike(other) elif isinstance(other, Period): - return self._sub_period(other) + result = self._sub_period(other) elif isinstance(other, Index): raise TypeError("cannot subtract {typ1} and {typ2}" .format(typ1=type(self).__name__, @@ -726,6 +736,11 @@ def __sub__(self, other): else: # pragma: no cover return NotImplemented + if result is not NotImplemented: + res_name = ops.get_op_result_name(self, other) + result.name = res_name + return result + cls.__sub__ = __sub__ def __rsub__(self, other): diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index cc9ce1f3fd5eb..debeabf9bae23 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -886,7 +886,7 @@ def _sub_datelike(self, other): else: raise TypeError("cannot subtract DatetimeIndex and {typ}" .format(typ=type(other).__name__)) - return TimedeltaIndex(result, name=self.name, copy=False) + return TimedeltaIndex(result) def _sub_datelike_dti(self, other): """subtraction of two DatetimeIndexes""" @@ -910,20 +910,31 @@ def _maybe_update_attributes(self, attrs): return attrs def _add_delta(self, delta): - if isinstance(delta, ABCSeries): - return NotImplemented + """ + Add a timedelta-like, DateOffset, or TimedeltaIndex-like object + to self. + + Parameters + ---------- + delta : {timedelta, np.timedelta64, DateOffset, + TimedelaIndex, ndarray[timedelta64]} + Returns + ------- + result : DatetimeIndex + + Notes + ----- + The result's name is set outside of _add_delta by the calling + method (__add__ or __sub__) + """ from pandas import TimedeltaIndex - name = self.name if isinstance(delta, (Tick, timedelta, np.timedelta64)): new_values = self._add_delta_td(delta) elif is_timedelta64_dtype(delta): if not isinstance(delta, TimedeltaIndex): delta = TimedeltaIndex(delta) - else: - # update name when delta is Index - name = com._maybe_match_name(self, delta) new_values = self._add_delta_tdi(delta) elif isinstance(delta, DateOffset): new_values = self._add_offset(delta).asi8 @@ -931,7 +942,7 @@ def _add_delta(self, delta): new_values = self.astype('O') + delta tz = 'UTC' if self.tz is not None else None - result = DatetimeIndex(new_values, tz=tz, name=name, freq='infer') + result = DatetimeIndex(new_values, tz=tz, freq='infer') if self.tz is not None and self.tz is not utc: result = result.tz_convert(self.tz) return result @@ -954,22 +965,19 @@ def _add_offset(self, offset): def _add_offset_array(self, other): # Array/Index of DateOffset objects - if isinstance(other, ABCSeries): - return NotImplemented - elif len(other) == 1: + if len(other) == 1: return self + other[0] else: warnings.warn("Adding/subtracting array of DateOffsets to " "{} not vectorized".format(type(self)), PerformanceWarning) return self.astype('O') + np.array(other) + # TODO: pass freq='infer' like we do in _sub_offset_array? 
# TODO: This works for __add__ but loses dtype in __sub__ def _sub_offset_array(self, other): # Array/Index of DateOffset objects - if isinstance(other, ABCSeries): - return NotImplemented - elif len(other) == 1: + if len(other) == 1: return self - other[0] else: warnings.warn("Adding/subtracting array of DateOffsets to " diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 8f2d7d382a16e..60798e6d77e37 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -729,7 +729,7 @@ def _sub_datelike(self, other): if other is tslib.NaT: new_data = np.empty(len(self), dtype=np.int64) new_data.fill(tslib.iNaT) - return TimedeltaIndex(new_data, name=self.name) + return TimedeltaIndex(new_data) return NotImplemented def _sub_period(self, other): @@ -744,7 +744,7 @@ def _sub_period(self, other): new_data = new_data.astype(np.float64) new_data[self._isnan] = np.nan # result must be Int64Index or Float64Index - return Index(new_data, name=self.name) + return Index(new_data) def shift(self, n): """ diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 41e499da8e008..6b61db53d9a11 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -356,19 +356,32 @@ def _maybe_update_attributes(self, attrs): return attrs def _add_delta(self, delta): + """ + Add a timedelta-like, Tick, or TimedeltaIndex-like object + to self. + + Parameters + ---------- + delta : {timedelta, np.timedelta64, Tick, TimedeltaIndex} + + Returns + ------- + result : TimedeltaIndex + + Notes + ----- + The result's name is set outside of _add_delta by the calling + method (__add__ or __sub__) + """ if isinstance(delta, (Tick, timedelta, np.timedelta64)): new_values = self._add_delta_td(delta) - name = self.name elif isinstance(delta, TimedeltaIndex): new_values = self._add_delta_tdi(delta) - # update name when delta is index - name = com._maybe_match_name(self, delta) else: raise TypeError("cannot add the type {0} to a TimedeltaIndex" .format(type(delta))) - result = TimedeltaIndex(new_values, freq='infer', name=name) - return result + return TimedeltaIndex(new_values, freq='infer') def _evaluate_with_timedelta_like(self, other, op, opstr, reversed=False): if isinstance(other, ABCSeries): @@ -409,7 +422,7 @@ def _add_datelike(self, other): result = checked_add_with_arr(i8, other.value, arr_mask=self._isnan) result = self._maybe_mask_results(result, fill_value=iNaT) - return DatetimeIndex(result, name=self.name, copy=False) + return DatetimeIndex(result) def _sub_datelike(self, other): # GH#19124 Timedelta - datetime is not in general well-defined. @@ -426,9 +439,7 @@ def _add_offset_array(self, other): # TimedeltaIndex can only operate with a subset of DateOffset # subclasses. Incompatible classes will raise AttributeError, # which we re-raise as TypeError - if isinstance(other, ABCSeries): - return NotImplemented - elif len(other) == 1: + if len(other) == 1: return self + other[0] else: from pandas.errors import PerformanceWarning @@ -436,6 +447,7 @@ def _add_offset_array(self, other): "{} not vectorized".format(type(self)), PerformanceWarning) return self.astype('O') + np.array(other) + # TODO: pass freq='infer' like we do in _sub_offset_array? # TODO: This works for __add__ but loses dtype in __sub__ except AttributeError: raise TypeError("Cannot add non-tick DateOffset to TimedeltaIndex") @@ -446,9 +458,7 @@ def _sub_offset_array(self, other): # TimedeltaIndex can only operate with a subset of DateOffset # subclasses. 
Incompatible classes will raise AttributeError, # which we re-raise as TypeError - if isinstance(other, ABCSeries): - return NotImplemented - elif len(other) == 1: + if len(other) == 1: return self - other[0] else: from pandas.errors import PerformanceWarning diff --git a/pandas/core/ops.py b/pandas/core/ops.py index ad6102eb6ad0f..9e80ab3b3da4c 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -42,6 +42,67 @@ ABCSparseSeries, ABCSparseArray) +# ----------------------------------------------------------------------------- +# Ops Wrapping Utilities + +def get_op_result_name(left, right): + """ + Find the appropriate name to pin to an operation result. This result + should always be either an Index or a Series. + + Parameters + ---------- + left : {Series, Index} + right : object + + Returns + ------- + name : object + Usually a string + """ + # `left` is always a pd.Series when called from within ops + if isinstance(right, (ABCSeries, pd.Index)): + name = _maybe_match_name(left, right) + else: + name = left.name + return name + + +def _maybe_match_name(a, b): + """ + Try to find a name to attach to the result of an operation between + a and b. If only one of these has a `name` attribute, return that + name. Otherwise return a consensus name if they match of None if + they have different names. + + Parameters + ---------- + a : object + b : object + + Returns + ------- + name : str or None + + See also + -------- + pandas.core.common._consensus_name_attr + """ + a_has = hasattr(a, 'name') + b_has = hasattr(b, 'name') + if a_has and b_has: + if a.name == b.name: + return a.name + else: + # TODO: what if they both have np.nan for their names? + return None + elif a_has: + return a.name + elif b_has: + return b.name + return None + + # ----------------------------------------------------------------------------- # Reversed Operations not available in the stdlib operator module. # Defining these instead of using lambdas allows us to reference them by name. 
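A minimal sketch of the name-resolution rule that the new get_op_result_name
and _maybe_match_name helpers implement (illustrative only; the variable names
and values below are not part of the patch, and this assumes the Series
behavior described in this commit):

    import pandas as pd

    a = pd.Series([1, 2], name='x')
    b = pd.Series([3, 4], name='x')
    c = pd.Series([3, 4], name='y')

    (a + b).name  # 'x'  -- both operands carry the same name, so it is kept
    (a + c).name  # None -- the names differ, so the result's name is dropped
    (a + 1).name  # 'x'  -- the other operand is not a Series/Index, so the
                  #         left operand's name is used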
@@ -822,7 +883,7 @@ def wrapper(left, right, name=name, na_op=na_op): return NotImplemented left, right = _align_method_SERIES(left, right) - res_name = _get_series_op_result_name(left, right) + res_name = get_op_result_name(left, right) if is_datetime64_dtype(left) or is_datetime64tz_dtype(left): result = dispatch_to_index_op(op, left, right, pd.DatetimeIndex) @@ -886,15 +947,6 @@ def dispatch_to_index_op(op, left, right, index_class): return result -def _get_series_op_result_name(left, right): - # `left` is always a pd.Series - if isinstance(right, (ABCSeries, pd.Index)): - name = com._maybe_match_name(left, right) - else: - name = left.name - return name - - def _comp_method_OBJECT_ARRAY(op, x, y): if isinstance(y, list): y = construct_1d_object_array_from_listlike(y) @@ -972,7 +1024,7 @@ def wrapper(self, other, axis=None): if axis is not None: self._get_axis_number(axis) - res_name = _get_series_op_result_name(self, other) + res_name = get_op_result_name(self, other) if isinstance(other, ABCDataFrame): # pragma: no cover # Defer to DataFrame implementation; fail early @@ -1098,7 +1150,7 @@ def wrapper(self, other): return NotImplemented elif isinstance(other, ABCSeries): - name = com._maybe_match_name(self, other) + name = get_op_result_name(self, other) is_other_int_dtype = is_integer_dtype(other.dtype) other = fill_int(other) if is_other_int_dtype else fill_bool(other) @@ -1536,7 +1588,7 @@ def wrapper(self, other): def _sparse_series_op(left, right, op, name): left, right = left.align(right, join='outer', copy=False) new_index = left.index - new_name = com._maybe_match_name(left, right) + new_name = get_op_result_name(left, right) from pandas.core.sparse.array import _sparse_array_op result = _sparse_array_op(left.values, right.values, op, name, diff --git a/pandas/core/series.py b/pandas/core/series.py index 90dc14836ab55..79ffb8be65838 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1728,7 +1728,7 @@ def _binop(self, other, func, level=None, fill_value=None): with np.errstate(all='ignore'): result = func(this_vals, other_vals) - name = com._maybe_match_name(self, other) + name = ops.get_op_result_name(self, other) result = self._constructor(result, index=new_index, name=name) result = result.__finalize__(self) if name is None: @@ -1769,7 +1769,7 @@ def combine(self, other, func, fill_value=np.nan): """ if isinstance(other, Series): new_index = self.index.union(other.index) - new_name = com._maybe_match_name(self, other) + new_name = ops.get_op_result_name(self, other) new_values = np.empty(len(new_index), dtype=self.dtype) for i, idx in enumerate(new_index): lv = self.get(idx, fill_value) @@ -1814,7 +1814,7 @@ def combine_first(self, other): this = self.reindex(new_index, copy=False) other = other.reindex(new_index, copy=False) # TODO: do we need name? 
- name = com._maybe_match_name(self, other) # noqa + name = ops.get_op_result_name(self, other) # noqa rs_vals = com._where_compat(isna(this), other._values, this._values) return self._constructor(rs_vals, index=new_index).__finalize__(self) diff --git a/pandas/tests/indexes/datetimes/test_arithmetic.py b/pandas/tests/indexes/datetimes/test_arithmetic.py index ddc97636ae0a8..f252d6ec31f89 100644 --- a/pandas/tests/indexes/datetimes/test_arithmetic.py +++ b/pandas/tests/indexes/datetimes/test_arithmetic.py @@ -721,11 +721,10 @@ def test_dti_add_series(self, tz, names): result4 = index + ser.values tm.assert_index_equal(result4, expected) - @pytest.mark.parametrize('box', [np.array, pd.Index]) - def test_dti_add_offset_array(self, tz, box): + def test_dti_add_offset_array(self, tz): # GH#18849 dti = pd.date_range('2017-01-01', periods=2, tz=tz) - other = box([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) + other = np.array([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) with tm.assert_produces_warning(PerformanceWarning): res = dti + other @@ -737,11 +736,29 @@ def test_dti_add_offset_array(self, tz, box): res2 = other + dti tm.assert_index_equal(res2, expected) - @pytest.mark.parametrize('box', [np.array, pd.Index]) - def test_dti_sub_offset_array(self, tz, box): + @pytest.mark.parametrize('names', [(None, None, None), + ('foo', 'bar', None), + ('foo', 'foo', 'foo')]) + def test_dti_add_offset_index(self, tz, names): + # GH#18849, GH#19744 + dti = pd.date_range('2017-01-01', periods=2, tz=tz, name=names[0]) + other = pd.Index([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], + name=names[1]) + + with tm.assert_produces_warning(PerformanceWarning): + res = dti + other + expected = DatetimeIndex([dti[n] + other[n] for n in range(len(dti))], + name=names[2], freq='infer') + tm.assert_index_equal(res, expected) + + with tm.assert_produces_warning(PerformanceWarning): + res2 = other + dti + tm.assert_index_equal(res2, expected) + + def test_dti_sub_offset_array(self, tz): # GH#18824 dti = pd.date_range('2017-01-01', periods=2, tz=tz) - other = box([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) + other = np.array([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) with tm.assert_produces_warning(PerformanceWarning): res = dti - other @@ -749,6 +766,21 @@ def test_dti_sub_offset_array(self, tz, box): name=dti.name, freq='infer') tm.assert_index_equal(res, expected) + @pytest.mark.parametrize('names', [(None, None, None), + ('foo', 'bar', None), + ('foo', 'foo', 'foo')]) + def test_dti_sub_offset_index(self, tz, names): + # GH#18824, GH#19744 + dti = pd.date_range('2017-01-01', periods=2, tz=tz, name=names[0]) + other = pd.Index([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], + name=names[1]) + + with tm.assert_produces_warning(PerformanceWarning): + res = dti - other + expected = DatetimeIndex([dti[n] - other[n] for n in range(len(dti))], + name=names[2], freq='infer') + tm.assert_index_equal(res, expected) + @pytest.mark.parametrize('names', [(None, None, None), ('foo', 'bar', None), ('foo', 'foo', 'foo')]) diff --git a/pandas/tests/indexes/timedeltas/test_arithmetic.py b/pandas/tests/indexes/timedeltas/test_arithmetic.py index 3dc60ed33b958..029fdfcefc299 100644 --- a/pandas/tests/indexes/timedeltas/test_arithmetic.py +++ b/pandas/tests/indexes/timedeltas/test_arithmetic.py @@ -194,11 +194,31 @@ def test_shift_no_freq(self): # ------------------------------------------------------------- - @pytest.mark.parametrize('box', [np.array, pd.Index]) - def test_tdi_add_offset_array(self, box): + 
@pytest.mark.parametrize('names', [(None, None, None), + ('foo', 'bar', None), + ('foo', 'foo', 'foo')]) + def test_tdi_add_offset_index(self, names): + # GH#18849, GH#19744 + tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00'], + name=names[0]) + other = pd.Index([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], + name=names[1]) + + expected = TimedeltaIndex([tdi[n] + other[n] for n in range(len(tdi))], + freq='infer', name=names[2]) + + with tm.assert_produces_warning(PerformanceWarning): + res = tdi + other + tm.assert_index_equal(res, expected) + + with tm.assert_produces_warning(PerformanceWarning): + res2 = other + tdi + tm.assert_index_equal(res2, expected) + + def test_tdi_add_offset_array(self): # GH#18849 tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00']) - other = box([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)]) + other = np.array([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)]) expected = TimedeltaIndex([tdi[n] + other[n] for n in range(len(tdi))], freq='infer') @@ -211,23 +231,27 @@ def test_tdi_add_offset_array(self, box): res2 = other + tdi tm.assert_index_equal(res2, expected) - anchored = box([pd.offsets.QuarterEnd(), - pd.offsets.Week(weekday=2)]) + @pytest.mark.parametrize('names', [(None, None, None), + ('foo', 'bar', None), + ('foo', 'foo', 'foo')]) + def test_tdi_sub_offset_index(self, names): + # GH#18824, GH#19744 + tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00'], + name=names[0]) + other = pd.Index([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], + name=names[1]) - # addition/subtraction ops with anchored offsets should issue - # a PerformanceWarning and _then_ raise a TypeError. - with pytest.raises(TypeError): - with tm.assert_produces_warning(PerformanceWarning): - tdi + anchored - with pytest.raises(TypeError): - with tm.assert_produces_warning(PerformanceWarning): - anchored + tdi + expected = TimedeltaIndex([tdi[n] - other[n] for n in range(len(tdi))], + freq='infer', name=names[2]) + + with tm.assert_produces_warning(PerformanceWarning): + res = tdi - other + tm.assert_index_equal(res, expected) - @pytest.mark.parametrize('box', [np.array, pd.Index]) - def test_tdi_sub_offset_array(self, box): + def test_tdi_sub_offset_array(self): # GH#18824 tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00']) - other = box([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)]) + other = np.array([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)]) expected = TimedeltaIndex([tdi[n] - other[n] for n in range(len(tdi))], freq='infer') @@ -236,17 +260,6 @@ def test_tdi_sub_offset_array(self, box): res = tdi - other tm.assert_index_equal(res, expected) - anchored = box([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) - - # addition/subtraction ops with anchored offsets should issue - # a PerformanceWarning and _then_ raise a TypeError. 
- with pytest.raises(TypeError): - with tm.assert_produces_warning(PerformanceWarning): - tdi - anchored - with pytest.raises(TypeError): - with tm.assert_produces_warning(PerformanceWarning): - anchored - tdi - @pytest.mark.parametrize('names', [(None, None, None), ('foo', 'bar', None), ('foo', 'foo', 'foo')]) @@ -275,8 +288,12 @@ def test_tdi_with_offset_series(self, names): res3 = tdi - other tm.assert_series_equal(res3, expected_sub) - anchored = Series([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], - name=names[1]) + @pytest.mark.parametrize('box', [np.array, pd.Index, pd.Series]) + def test_tdi_add_sub_anchored_offset_arraylike(self, box): + # GH#18824 + tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00']) + + anchored = box([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) # addition/subtraction ops with anchored offsets should issue # a PerformanceWarning and _then_ raise a TypeError. diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 57479be4d989f..0b329f64dafa3 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -9,6 +9,7 @@ from pandas import Series, Timestamp from pandas.compat import range, lmap import pandas.core.common as com +from pandas.core import ops import pandas.util.testing as tm @@ -167,26 +168,26 @@ def test_random_state(): def test_maybe_match_name(): - matched = com._maybe_match_name( + matched = ops._maybe_match_name( Series([1], name='x'), Series( [2], name='x')) assert (matched == 'x') - matched = com._maybe_match_name( + matched = ops._maybe_match_name( Series([1], name='x'), Series( [2], name='y')) assert (matched is None) - matched = com._maybe_match_name(Series([1]), Series([2], name='x')) + matched = ops._maybe_match_name(Series([1]), Series([2], name='x')) assert (matched is None) - matched = com._maybe_match_name(Series([1], name='x'), Series([2])) + matched = ops._maybe_match_name(Series([1], name='x'), Series([2])) assert (matched is None) - matched = com._maybe_match_name(Series([1], name='x'), [2]) + matched = ops._maybe_match_name(Series([1], name='x'), [2]) assert (matched == 'x') - matched = com._maybe_match_name([1], Series([2], name='y')) + matched = ops._maybe_match_name([1], Series([2], name='y')) assert (matched == 'y') From b94524775432cc309c57bb82d52bd41e1beb693f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 21 Feb 2018 16:11:32 -0800 Subject: [PATCH 158/214] parametrize a whole mess of tests (#19785) --- .../indexes/datetimes/test_arithmetic.py | 16 +- .../indexes/datetimes/test_construction.py | 150 ++++---- .../indexes/datetimes/test_date_range.py | 83 +++-- .../tests/indexes/datetimes/test_datetime.py | 42 ++- .../tests/indexes/datetimes/test_indexing.py | 28 +- .../tests/indexes/datetimes/test_missing.py | 88 ++--- pandas/tests/indexes/datetimes/test_ops.py | 259 +++++++------- .../indexes/datetimes/test_scalar_compat.py | 21 +- .../tests/indexes/datetimes/test_timezones.py | 32 +- pandas/tests/indexes/datetimes/test_tools.py | 14 +- .../tests/indexes/period/test_arithmetic.py | 44 +-- pandas/tests/indexes/period/test_asfreq.py | 29 +- .../tests/indexes/period/test_construction.py | 43 ++- pandas/tests/indexes/period/test_ops.py | 63 ++-- pandas/tests/indexes/period/test_period.py | 10 +- pandas/tests/indexes/period/test_setops.py | 21 +- pandas/tests/indexes/period/test_tools.py | 323 +++++++++--------- .../indexes/timedeltas/test_arithmetic.py | 14 +- pandas/tests/indexes/timedeltas/test_ops.py | 17 +- .../indexes/timedeltas/test_timedelta.py | 10 +- 
pandas/tests/scalar/period/test_period.py | 203 +++++------ 21 files changed, 742 insertions(+), 768 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_arithmetic.py b/pandas/tests/indexes/datetimes/test_arithmetic.py index f252d6ec31f89..7900c983b6c77 100644 --- a/pandas/tests/indexes/datetimes/test_arithmetic.py +++ b/pandas/tests/indexes/datetimes/test_arithmetic.py @@ -614,19 +614,19 @@ def test_sub_dti_dti(self): result = dti2 - dti1 tm.assert_index_equal(result, expected) - def test_sub_period(self): - # GH 13078 + @pytest.mark.parametrize('freq', [None, 'D']) + def test_sub_period(self, freq): + # GH#13078 # not supported, check TypeError p = pd.Period('2011-01-01', freq='D') - for freq in [None, 'D']: - idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], freq=freq) + idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], freq=freq) - with pytest.raises(TypeError): - idx - p + with pytest.raises(TypeError): + idx - p - with pytest.raises(TypeError): - p - idx + with pytest.raises(TypeError): + p - idx def test_ufunc_coercions(self): idx = date_range('2011-01-01', periods=3, freq='2D', name='x') diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py index 197a42bdaacbb..176f5bd0c1a2a 100644 --- a/pandas/tests/indexes/datetimes/test_construction.py +++ b/pandas/tests/indexes/datetimes/test_construction.py @@ -351,52 +351,51 @@ def test_constructor_coverage(self): freq='B') pytest.raises(ValueError, DatetimeIndex, periods=10, freq='D') - def test_constructor_datetime64_tzformat(self): - # see gh-6572: ISO 8601 format results in pytz.FixedOffset - for freq in ['AS', 'W-SUN']: - idx = date_range('2013-01-01T00:00:00-05:00', - '2016-01-01T23:59:59-05:00', freq=freq) - expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', - freq=freq, tz=pytz.FixedOffset(-300)) - tm.assert_index_equal(idx, expected) - # Unable to use `US/Eastern` because of DST - expected_i8 = date_range('2013-01-01T00:00:00', - '2016-01-01T23:59:59', freq=freq, - tz='America/Lima') - tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) - - idx = date_range('2013-01-01T00:00:00+09:00', - '2016-01-01T23:59:59+09:00', freq=freq) - expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', - freq=freq, tz=pytz.FixedOffset(540)) - tm.assert_index_equal(idx, expected) - expected_i8 = date_range('2013-01-01T00:00:00', - '2016-01-01T23:59:59', freq=freq, - tz='Asia/Tokyo') - tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) + @pytest.mark.parametrize('freq', ['AS', 'W-SUN']) + def test_constructor_datetime64_tzformat(self, freq): + # see GH#6572: ISO 8601 format results in pytz.FixedOffset + idx = date_range('2013-01-01T00:00:00-05:00', + '2016-01-01T23:59:59-05:00', freq=freq) + expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', + freq=freq, tz=pytz.FixedOffset(-300)) + tm.assert_index_equal(idx, expected) + # Unable to use `US/Eastern` because of DST + expected_i8 = date_range('2013-01-01T00:00:00', + '2016-01-01T23:59:59', freq=freq, + tz='America/Lima') + tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) + + idx = date_range('2013-01-01T00:00:00+09:00', + '2016-01-01T23:59:59+09:00', freq=freq) + expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', + freq=freq, tz=pytz.FixedOffset(540)) + tm.assert_index_equal(idx, expected) + expected_i8 = date_range('2013-01-01T00:00:00', + '2016-01-01T23:59:59', freq=freq, + tz='Asia/Tokyo') + tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) 
# Non ISO 8601 format results in dateutil.tz.tzoffset - for freq in ['AS', 'W-SUN']: - idx = date_range('2013/1/1 0:00:00-5:00', '2016/1/1 23:59:59-5:00', - freq=freq) - expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', - freq=freq, tz=pytz.FixedOffset(-300)) - tm.assert_index_equal(idx, expected) - # Unable to use `US/Eastern` because of DST - expected_i8 = date_range('2013-01-01T00:00:00', - '2016-01-01T23:59:59', freq=freq, - tz='America/Lima') - tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) - - idx = date_range('2013/1/1 0:00:00+9:00', - '2016/1/1 23:59:59+09:00', freq=freq) - expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', - freq=freq, tz=pytz.FixedOffset(540)) - tm.assert_index_equal(idx, expected) - expected_i8 = date_range('2013-01-01T00:00:00', - '2016-01-01T23:59:59', freq=freq, - tz='Asia/Tokyo') - tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) + idx = date_range('2013/1/1 0:00:00-5:00', '2016/1/1 23:59:59-5:00', + freq=freq) + expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', + freq=freq, tz=pytz.FixedOffset(-300)) + tm.assert_index_equal(idx, expected) + # Unable to use `US/Eastern` because of DST + expected_i8 = date_range('2013-01-01T00:00:00', + '2016-01-01T23:59:59', freq=freq, + tz='America/Lima') + tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) + + idx = date_range('2013/1/1 0:00:00+9:00', + '2016/1/1 23:59:59+09:00', freq=freq) + expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', + freq=freq, tz=pytz.FixedOffset(540)) + tm.assert_index_equal(idx, expected) + expected_i8 = date_range('2013-01-01T00:00:00', + '2016-01-01T23:59:59', freq=freq, + tz='Asia/Tokyo') + tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) def test_constructor_dtype(self): @@ -451,36 +450,35 @@ def test_dti_constructor_preserve_dti_freq(self): rng2 = DatetimeIndex(rng) assert rng.freq == rng2.freq - def test_dti_constructor_years_only(self): + @pytest.mark.parametrize('tz', [None, 'UTC', 'Asia/Tokyo', + 'dateutil/US/Pacific']) + def test_dti_constructor_years_only(self, tz): # GH 6961 - for tz in [None, 'UTC', 'Asia/Tokyo', 'dateutil/US/Pacific']: - rng1 = date_range('2014', '2015', freq='M', tz=tz) - expected1 = date_range('2014-01-31', '2014-12-31', freq='M', tz=tz) + rng1 = date_range('2014', '2015', freq='M', tz=tz) + expected1 = date_range('2014-01-31', '2014-12-31', freq='M', tz=tz) - rng2 = date_range('2014', '2015', freq='MS', tz=tz) - expected2 = date_range('2014-01-01', '2015-01-01', freq='MS', - tz=tz) + rng2 = date_range('2014', '2015', freq='MS', tz=tz) + expected2 = date_range('2014-01-01', '2015-01-01', freq='MS', tz=tz) - rng3 = date_range('2014', '2020', freq='A', tz=tz) - expected3 = date_range('2014-12-31', '2019-12-31', freq='A', tz=tz) + rng3 = date_range('2014', '2020', freq='A', tz=tz) + expected3 = date_range('2014-12-31', '2019-12-31', freq='A', tz=tz) - rng4 = date_range('2014', '2020', freq='AS', tz=tz) - expected4 = date_range('2014-01-01', '2020-01-01', freq='AS', - tz=tz) + rng4 = date_range('2014', '2020', freq='AS', tz=tz) + expected4 = date_range('2014-01-01', '2020-01-01', freq='AS', tz=tz) - for rng, expected in [(rng1, expected1), (rng2, expected2), - (rng3, expected3), (rng4, expected4)]: - tm.assert_index_equal(rng, expected) + for rng, expected in [(rng1, expected1), (rng2, expected2), + (rng3, expected3), (rng4, expected4)]: + tm.assert_index_equal(rng, expected) - def test_dti_constructor_small_int(self): + @pytest.mark.parametrize('dtype', [np.int64, 
np.int32, np.int16, np.int8]) + def test_dti_constructor_small_int(self, dtype): # GH 13721 exp = DatetimeIndex(['1970-01-01 00:00:00.00000000', '1970-01-01 00:00:00.00000001', '1970-01-01 00:00:00.00000002']) - for dtype in [np.int64, np.int32, np.int16, np.int8]: - arr = np.array([0, 10, 20], dtype=dtype) - tm.assert_index_equal(DatetimeIndex(arr), exp) + arr = np.array([0, 10, 20], dtype=dtype) + tm.assert_index_equal(DatetimeIndex(arr), exp) def test_ctor_str_intraday(self): rng = DatetimeIndex(['1-1-2000 00:00:01']) @@ -499,7 +497,7 @@ def test_index_cast_datetime64_other_units(self): assert (idx.values == conversion.ensure_datetime64ns(arr)).all() def test_constructor_int64_nocopy(self): - # #1624 + # GH#1624 arr = np.arange(1000, dtype=np.int64) index = DatetimeIndex(arr) @@ -512,19 +510,17 @@ def test_constructor_int64_nocopy(self): arr[50:100] = -1 assert (index.asi8[50:100] != -1).all() - def test_from_freq_recreate_from_data(self): - freqs = ['M', 'Q', 'A', 'D', 'B', 'BH', 'T', 'S', 'L', 'U', 'H', 'N', - 'C'] - - for f in freqs: - org = DatetimeIndex(start='2001/02/01 09:00', freq=f, periods=1) - idx = DatetimeIndex(org, freq=f) - tm.assert_index_equal(idx, org) - - org = DatetimeIndex(start='2001/02/01 09:00', freq=f, - tz='US/Pacific', periods=1) - idx = DatetimeIndex(org, freq=f, tz='US/Pacific') - tm.assert_index_equal(idx, org) + @pytest.mark.parametrize('freq', ['M', 'Q', 'A', 'D', 'B', 'BH', + 'T', 'S', 'L', 'U', 'H', 'N', 'C']) + def test_from_freq_recreate_from_data(self, freq): + org = DatetimeIndex(start='2001/02/01 09:00', freq=freq, periods=1) + idx = DatetimeIndex(org, freq=freq) + tm.assert_index_equal(idx, org) + + org = DatetimeIndex(start='2001/02/01 09:00', freq=freq, + tz='US/Pacific', periods=1) + idx = DatetimeIndex(org, freq=freq, tz='US/Pacific') + tm.assert_index_equal(idx, org) def test_datetimeindex_constructor_misc(self): arr = ['1/1/2005', '1/2/2005', 'Jn 3, 2005', '2005-01-04'] diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 3738398d017f8..d2ec465468dfb 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -222,16 +222,13 @@ def test_range_misspecified(self): with tm.assert_raises_regex(ValueError, msg): date_range() - def test_compat_replace(self): + @pytest.mark.parametrize('f', [compat.long, int]) + def test_compat_replace(self, f): # https://github.com/statsmodels/statsmodels/issues/3349 # replace should take ints/longs for compat - - for f in [compat.long, int]: - result = date_range(Timestamp('1960-04-01 00:00:00', - freq='QS-JAN'), - periods=f(76), - freq='QS-JAN') - assert len(result) == 76 + result = date_range(Timestamp('1960-04-01 00:00:00', freq='QS-JAN'), + periods=f(76), freq='QS-JAN') + assert len(result) == 76 def test_catch_infinite_loop(self): offset = offsets.DateOffset(minute=5) @@ -484,24 +481,24 @@ def test_range_tz_dateutil(self): assert dr[0] == start assert dr[2] == end - def test_range_closed(self): + @pytest.mark.parametrize('freq', ["1D", "3D", "2M", "7W", "3H", "A"]) + def test_range_closed(self, freq): begin = datetime(2011, 1, 1) end = datetime(2014, 1, 1) - for freq in ["1D", "3D", "2M", "7W", "3H", "A"]: - closed = date_range(begin, end, closed=None, freq=freq) - left = date_range(begin, end, closed="left", freq=freq) - right = date_range(begin, end, closed="right", freq=freq) - expected_left = left - expected_right = right + closed = date_range(begin, end, closed=None, 
freq=freq) + left = date_range(begin, end, closed="left", freq=freq) + right = date_range(begin, end, closed="right", freq=freq) + expected_left = left + expected_right = right - if end == closed[-1]: - expected_left = closed[:-1] - if begin == closed[0]: - expected_right = closed[1:] + if end == closed[-1]: + expected_left = closed[:-1] + if begin == closed[0]: + expected_right = closed[1:] - tm.assert_index_equal(expected_left, left) - tm.assert_index_equal(expected_right, right) + tm.assert_index_equal(expected_left, left) + tm.assert_index_equal(expected_right, right) def test_range_closed_with_tz_aware_start_end(self): # GH12409, GH12684 @@ -546,28 +543,28 @@ def test_range_closed_with_tz_aware_start_end(self): tm.assert_index_equal(expected_left, left) tm.assert_index_equal(expected_right, right) - def test_range_closed_boundary(self): - # GH 11804 - for closed in ['right', 'left', None]: - right_boundary = date_range('2015-09-12', '2015-12-01', - freq='QS-MAR', closed=closed) - left_boundary = date_range('2015-09-01', '2015-09-12', - freq='QS-MAR', closed=closed) - both_boundary = date_range('2015-09-01', '2015-12-01', - freq='QS-MAR', closed=closed) - expected_right = expected_left = expected_both = both_boundary - - if closed == 'right': - expected_left = both_boundary[1:] - if closed == 'left': - expected_right = both_boundary[:-1] - if closed is None: - expected_right = both_boundary[1:] - expected_left = both_boundary[:-1] - - tm.assert_index_equal(right_boundary, expected_right) - tm.assert_index_equal(left_boundary, expected_left) - tm.assert_index_equal(both_boundary, expected_both) + @pytest.mark.parametrize('closed', ['right', 'left', None]) + def test_range_closed_boundary(self, closed): + # GH#11804 + right_boundary = date_range('2015-09-12', '2015-12-01', + freq='QS-MAR', closed=closed) + left_boundary = date_range('2015-09-01', '2015-09-12', + freq='QS-MAR', closed=closed) + both_boundary = date_range('2015-09-01', '2015-12-01', + freq='QS-MAR', closed=closed) + expected_right = expected_left = expected_both = both_boundary + + if closed == 'right': + expected_left = both_boundary[1:] + if closed == 'left': + expected_right = both_boundary[:-1] + if closed is None: + expected_right = both_boundary[1:] + expected_left = both_boundary[:-1] + + tm.assert_index_equal(right_boundary, expected_right) + tm.assert_index_equal(left_boundary, expected_left) + tm.assert_index_equal(both_boundary, expected_both) def test_years_only(self): # GH 6961 diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 05678b0c8dd45..2cf33644377ab 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -356,12 +356,11 @@ def test_does_not_convert_mixed_integer(self): assert cols.dtype == joined.dtype tm.assert_numpy_array_equal(cols.values, joined.values) - def test_join_self(self): + @pytest.mark.parametrize('how', ['outer', 'inner', 'left', 'right']) + def test_join_self(self, how): index = date_range('1/1/2000', periods=10) - kinds = 'outer', 'inner', 'left', 'right' - for kind in kinds: - joined = index.join(index, how=kind) - assert index is joined + joined = index.join(index, how=how) + assert index is joined def assert_index_parameters(self, index): assert index.freq == '40960N' @@ -381,18 +380,17 @@ def test_ns_index(self): freq=index.freq) self.assert_index_parameters(new_index) - def test_join_with_period_index(self): + @pytest.mark.parametrize('how', ['left', 
'right', 'inner', 'outer']) + def test_join_with_period_index(self, how): df = tm.makeCustomDataframe( 10, 10, data_gen_f=lambda *args: np.random.randint(2), c_idx_type='p', r_idx_type='dt') s = df.iloc[:5, 0] - joins = 'left', 'right', 'inner', 'outer' - for join in joins: - with tm.assert_raises_regex(ValueError, - 'can only call with other ' - 'PeriodIndex-ed objects'): - df.columns.join(s.index, how=join) + with tm.assert_raises_regex(ValueError, + 'can only call with other ' + 'PeriodIndex-ed objects'): + df.columns.join(s.index, how=how) def test_factorize(self): idx1 = DatetimeIndex(['2014-01', '2014-01', '2014-02', '2014-02', @@ -439,18 +437,18 @@ def test_factorize(self): tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, idx3) - def test_factorize_tz(self): - # GH 13750 - for tz in [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']: - base = pd.date_range('2016-11-05', freq='H', periods=100, tz=tz) - idx = base.repeat(5) + @pytest.mark.parametrize('tz', [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']) + def test_factorize_tz(self, tz): + # GH#13750 + base = pd.date_range('2016-11-05', freq='H', periods=100, tz=tz) + idx = base.repeat(5) - exp_arr = np.arange(100, dtype=np.intp).repeat(5) + exp_arr = np.arange(100, dtype=np.intp).repeat(5) - for obj in [idx, pd.Series(idx)]: - arr, res = obj.factorize() - tm.assert_numpy_array_equal(arr, exp_arr) - tm.assert_index_equal(res, base) + for obj in [idx, pd.Series(idx)]: + arr, res = obj.factorize() + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(res, base) def test_factorize_dst(self): # GH 13750 diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index 48ceefd6368c0..a9f1a5e608ac7 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -346,25 +346,25 @@ def test_take_invalid_kwargs(self): indices, mode='clip') # TODO: This method came from test_datetime; de-dup with version above - def test_take2(self): + @pytest.mark.parametrize('tz', [None, 'US/Eastern', 'Asia/Tokyo']) + def test_take2(self, tz): dates = [datetime(2010, 1, 1, 14), datetime(2010, 1, 1, 15), datetime(2010, 1, 1, 17), datetime(2010, 1, 1, 21)] - for tz in [None, 'US/Eastern', 'Asia/Tokyo']: - idx = DatetimeIndex(start='2010-01-01 09:00', - end='2010-02-01 09:00', freq='H', tz=tz, - name='idx') - expected = DatetimeIndex(dates, freq=None, name='idx', tz=tz) + idx = DatetimeIndex(start='2010-01-01 09:00', + end='2010-02-01 09:00', freq='H', tz=tz, + name='idx') + expected = DatetimeIndex(dates, freq=None, name='idx', tz=tz) - taken1 = idx.take([5, 6, 8, 12]) - taken2 = idx[[5, 6, 8, 12]] + taken1 = idx.take([5, 6, 8, 12]) + taken2 = idx[[5, 6, 8, 12]] - for taken in [taken1, taken2]: - tm.assert_index_equal(taken, expected) - assert isinstance(taken, DatetimeIndex) - assert taken.freq is None - assert taken.tz == expected.tz - assert taken.name == expected.name + for taken in [taken1, taken2]: + tm.assert_index_equal(taken, expected) + assert isinstance(taken, DatetimeIndex) + assert taken.freq is None + assert taken.tz == expected.tz + assert taken.name == expected.name def test_take_fill_value(self): # GH 12631 diff --git a/pandas/tests/indexes/datetimes/test_missing.py b/pandas/tests/indexes/datetimes/test_missing.py index adc0b7b3d81e8..c8d47caa7e947 100644 --- a/pandas/tests/indexes/datetimes/test_missing.py +++ b/pandas/tests/indexes/datetimes/test_missing.py @@ -1,50 +1,52 @@ +import pytest + import pandas as pd import 
pandas.util.testing as tm class TestDatetimeIndex(object): - def test_fillna_datetime64(self): + @pytest.mark.parametrize('tz', ['US/Eastern', 'Asia/Tokyo']) + def test_fillna_datetime64(self, tz): # GH 11343 - for tz in ['US/Eastern', 'Asia/Tokyo']: - idx = pd.DatetimeIndex(['2011-01-01 09:00', pd.NaT, - '2011-01-01 11:00']) - - exp = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', - '2011-01-01 11:00']) - tm.assert_index_equal( - idx.fillna(pd.Timestamp('2011-01-01 10:00')), exp) - - # tz mismatch - exp = pd.Index([pd.Timestamp('2011-01-01 09:00'), - pd.Timestamp('2011-01-01 10:00', tz=tz), - pd.Timestamp('2011-01-01 11:00')], dtype=object) - tm.assert_index_equal( - idx.fillna(pd.Timestamp('2011-01-01 10:00', tz=tz)), exp) - - # object - exp = pd.Index([pd.Timestamp('2011-01-01 09:00'), 'x', - pd.Timestamp('2011-01-01 11:00')], dtype=object) - tm.assert_index_equal(idx.fillna('x'), exp) - - idx = pd.DatetimeIndex(['2011-01-01 09:00', pd.NaT, - '2011-01-01 11:00'], tz=tz) - - exp = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', - '2011-01-01 11:00'], tz=tz) - tm.assert_index_equal( - idx.fillna(pd.Timestamp('2011-01-01 10:00', tz=tz)), exp) - - exp = pd.Index([pd.Timestamp('2011-01-01 09:00', tz=tz), - pd.Timestamp('2011-01-01 10:00'), - pd.Timestamp('2011-01-01 11:00', tz=tz)], - dtype=object) - tm.assert_index_equal( - idx.fillna(pd.Timestamp('2011-01-01 10:00')), exp) - - # object - exp = pd.Index([pd.Timestamp('2011-01-01 09:00', tz=tz), - 'x', - pd.Timestamp('2011-01-01 11:00', tz=tz)], - dtype=object) - tm.assert_index_equal(idx.fillna('x'), exp) + idx = pd.DatetimeIndex(['2011-01-01 09:00', pd.NaT, + '2011-01-01 11:00']) + + exp = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', + '2011-01-01 11:00']) + tm.assert_index_equal( + idx.fillna(pd.Timestamp('2011-01-01 10:00')), exp) + + # tz mismatch + exp = pd.Index([pd.Timestamp('2011-01-01 09:00'), + pd.Timestamp('2011-01-01 10:00', tz=tz), + pd.Timestamp('2011-01-01 11:00')], dtype=object) + tm.assert_index_equal( + idx.fillna(pd.Timestamp('2011-01-01 10:00', tz=tz)), exp) + + # object + exp = pd.Index([pd.Timestamp('2011-01-01 09:00'), 'x', + pd.Timestamp('2011-01-01 11:00')], dtype=object) + tm.assert_index_equal(idx.fillna('x'), exp) + + idx = pd.DatetimeIndex(['2011-01-01 09:00', pd.NaT, + '2011-01-01 11:00'], tz=tz) + + exp = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', + '2011-01-01 11:00'], tz=tz) + tm.assert_index_equal( + idx.fillna(pd.Timestamp('2011-01-01 10:00', tz=tz)), exp) + + exp = pd.Index([pd.Timestamp('2011-01-01 09:00', tz=tz), + pd.Timestamp('2011-01-01 10:00'), + pd.Timestamp('2011-01-01 11:00', tz=tz)], + dtype=object) + tm.assert_index_equal( + idx.fillna(pd.Timestamp('2011-01-01 10:00')), exp) + + # object + exp = pd.Index([pd.Timestamp('2011-01-01 09:00', tz=tz), + 'x', + pd.Timestamp('2011-01-01 11:00', tz=tz)], + dtype=object) + tm.assert_index_equal(idx.fillna('x'), exp) diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index b42cd454803b8..ed7e425924097 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -96,111 +96,111 @@ def test_numpy_minmax(self): tm.assert_raises_regex( ValueError, errmsg, np.argmax, dr, out=0) - def test_repeat_range(self): + @pytest.mark.parametrize('tz', tz) + def test_repeat_range(self, tz): rng = date_range('1/1/2000', '1/1/2001') result = rng.repeat(5) assert result.freq is None assert len(result) == 5 * len(rng) - for tz in 
self.tz: - index = pd.date_range('2001-01-01', periods=2, freq='D', tz=tz) - exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', - '2001-01-02', '2001-01-02'], tz=tz) - for res in [index.repeat(2), np.repeat(index, 2)]: - tm.assert_index_equal(res, exp) - assert res.freq is None - - index = pd.date_range('2001-01-01', periods=2, freq='2D', tz=tz) - exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', - '2001-01-03', '2001-01-03'], tz=tz) - for res in [index.repeat(2), np.repeat(index, 2)]: - tm.assert_index_equal(res, exp) - assert res.freq is None - - index = pd.DatetimeIndex(['2001-01-01', 'NaT', '2003-01-01'], - tz=tz) - exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', '2001-01-01', - 'NaT', 'NaT', 'NaT', - '2003-01-01', '2003-01-01', '2003-01-01'], - tz=tz) - for res in [index.repeat(3), np.repeat(index, 3)]: - tm.assert_index_equal(res, exp) - assert res.freq is None - - def test_repeat(self): + index = pd.date_range('2001-01-01', periods=2, freq='D', tz=tz) + exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', + '2001-01-02', '2001-01-02'], tz=tz) + for res in [index.repeat(2), np.repeat(index, 2)]: + tm.assert_index_equal(res, exp) + assert res.freq is None + + index = pd.date_range('2001-01-01', periods=2, freq='2D', tz=tz) + exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', + '2001-01-03', '2001-01-03'], tz=tz) + for res in [index.repeat(2), np.repeat(index, 2)]: + tm.assert_index_equal(res, exp) + assert res.freq is None + + index = pd.DatetimeIndex(['2001-01-01', 'NaT', '2003-01-01'], + tz=tz) + exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', '2001-01-01', + 'NaT', 'NaT', 'NaT', + '2003-01-01', '2003-01-01', '2003-01-01'], + tz=tz) + for res in [index.repeat(3), np.repeat(index, 3)]: + tm.assert_index_equal(res, exp) + assert res.freq is None + + @pytest.mark.parametrize('tz', tz) + def test_repeat(self, tz): reps = 2 msg = "the 'axis' parameter is not supported" - for tz in self.tz: - rng = pd.date_range(start='2016-01-01', periods=2, - freq='30Min', tz=tz) - - expected_rng = DatetimeIndex([ - Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 00:30:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 00:30:00', tz=tz, freq='30T'), - ]) - - res = rng.repeat(reps) - tm.assert_index_equal(res, expected_rng) - assert res.freq is None + rng = pd.date_range(start='2016-01-01', periods=2, + freq='30Min', tz=tz) - tm.assert_index_equal(np.repeat(rng, reps), expected_rng) - tm.assert_raises_regex(ValueError, msg, np.repeat, - rng, reps, axis=1) + expected_rng = DatetimeIndex([ + Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 00:30:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 00:30:00', tz=tz, freq='30T'), + ]) - def test_resolution(self): + res = rng.repeat(reps) + tm.assert_index_equal(res, expected_rng) + assert res.freq is None + + tm.assert_index_equal(np.repeat(rng, reps), expected_rng) + tm.assert_raises_regex(ValueError, msg, np.repeat, + rng, reps, axis=1) + + @pytest.mark.parametrize('tz', tz) + def test_resolution(self, tz): for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', 'T', 'S', 'L', 'U'], ['day', 'day', 'day', 'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond']): - for tz in self.tz: - idx = pd.date_range(start='2013-04-01', periods=30, freq=freq, - tz=tz) - assert idx.resolution == expected + idx = pd.date_range(start='2013-04-01', periods=30, freq=freq, + tz=tz) + assert 
idx.resolution == expected - def test_value_counts_unique(self): + @pytest.mark.parametrize('tz', tz) + def test_value_counts_unique(self, tz): # GH 7735 - for tz in self.tz: - idx = pd.date_range('2011-01-01 09:00', freq='H', periods=10) - # create repeated values, 'n'th element is repeated by n+1 times - idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)), - tz=tz) + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=10) + # create repeated values, 'n'th element is repeated by n+1 times + idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)), + tz=tz) - exp_idx = pd.date_range('2011-01-01 18:00', freq='-1H', periods=10, - tz=tz) - expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') + exp_idx = pd.date_range('2011-01-01 18:00', freq='-1H', periods=10, + tz=tz) + expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') - for obj in [idx, Series(idx)]: - tm.assert_series_equal(obj.value_counts(), expected) + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) - expected = pd.date_range('2011-01-01 09:00', freq='H', periods=10, - tz=tz) - tm.assert_index_equal(idx.unique(), expected) + expected = pd.date_range('2011-01-01 09:00', freq='H', periods=10, + tz=tz) + tm.assert_index_equal(idx.unique(), expected) - idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 09:00', - '2013-01-01 09:00', '2013-01-01 08:00', - '2013-01-01 08:00', pd.NaT], tz=tz) + idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 09:00', + '2013-01-01 09:00', '2013-01-01 08:00', + '2013-01-01 08:00', pd.NaT], tz=tz) - exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00'], - tz=tz) - expected = Series([3, 2], index=exp_idx) + exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00'], + tz=tz) + expected = Series([3, 2], index=exp_idx) - for obj in [idx, Series(idx)]: - tm.assert_series_equal(obj.value_counts(), expected) + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) - exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00', - pd.NaT], tz=tz) - expected = Series([3, 2, 1], index=exp_idx) + exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00', + pd.NaT], tz=tz) + expected = Series([3, 2, 1], index=exp_idx) - for obj in [idx, Series(idx)]: - tm.assert_series_equal(obj.value_counts(dropna=False), - expected) + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(dropna=False), + expected) - tm.assert_index_equal(idx.unique(), exp_idx) + tm.assert_index_equal(idx.unique(), exp_idx) def test_nonunique_contains(self): # GH 9512 @@ -324,15 +324,16 @@ def test_drop_duplicates(self): res = Series(idx).drop_duplicates(keep=False) tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) - def test_infer_freq(self): + @pytest.mark.parametrize('freq', [ + 'A', '2A', '-2A', 'Q', '-1Q', 'M', '-1M', 'D', '3D', + '-3D', 'W', '-1W', 'H', '2H', '-2H', 'T', '2T', 'S', + '-3S']) + def test_infer_freq(self, freq): # GH 11018 - for freq in ['A', '2A', '-2A', 'Q', '-1Q', 'M', '-1M', 'D', '3D', - '-3D', 'W', '-1W', 'H', '2H', '-2H', 'T', '2T', 'S', - '-3S']: - idx = pd.date_range('2011-01-01 09:00:00', freq=freq, periods=10) - result = pd.DatetimeIndex(idx.asi8, freq='infer') - tm.assert_index_equal(idx, result) - assert result.freq == freq + idx = pd.date_range('2011-01-01 09:00:00', freq=freq, periods=10) + result = pd.DatetimeIndex(idx.asi8, freq='infer') + tm.assert_index_equal(idx, result) + assert result.freq == freq def test_nat_new(self): 
idx = pd.date_range('2011-01-01', freq='D', periods=5, name='x') @@ -344,57 +345,57 @@ def test_nat_new(self): exp = np.array([tslib.iNaT] * 5, dtype=np.int64) tm.assert_numpy_array_equal(result, exp) - def test_nat(self): + @pytest.mark.parametrize('tz', [None, 'US/Eastern', 'UTC']) + def test_nat(self, tz): assert pd.DatetimeIndex._na_value is pd.NaT assert pd.DatetimeIndex([])._na_value is pd.NaT - for tz in [None, 'US/Eastern', 'UTC']: - idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz) - assert idx._can_hold_na + idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz) + assert idx._can_hold_na - tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) - assert not idx.hasnans - tm.assert_numpy_array_equal(idx._nan_idxs, - np.array([], dtype=np.intp)) + tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) + assert not idx.hasnans + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([], dtype=np.intp)) - idx = pd.DatetimeIndex(['2011-01-01', 'NaT'], tz=tz) - assert idx._can_hold_na + idx = pd.DatetimeIndex(['2011-01-01', 'NaT'], tz=tz) + assert idx._can_hold_na - tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) - assert idx.hasnans - tm.assert_numpy_array_equal(idx._nan_idxs, - np.array([1], dtype=np.intp)) + tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) + assert idx.hasnans + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([1], dtype=np.intp)) - def test_equals(self): + @pytest.mark.parametrize('tz', [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']) + def test_equals(self, tz): # GH 13107 - for tz in [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']: - idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02', 'NaT']) - assert idx.equals(idx) - assert idx.equals(idx.copy()) - assert idx.equals(idx.astype(object)) - assert idx.astype(object).equals(idx) - assert idx.astype(object).equals(idx.astype(object)) - assert not idx.equals(list(idx)) - assert not idx.equals(pd.Series(idx)) - - idx2 = pd.DatetimeIndex(['2011-01-01', '2011-01-02', 'NaT'], - tz='US/Pacific') - assert not idx.equals(idx2) - assert not idx.equals(idx2.copy()) - assert not idx.equals(idx2.astype(object)) - assert not idx.astype(object).equals(idx2) - assert not idx.equals(list(idx2)) - assert not idx.equals(pd.Series(idx2)) - - # same internal, different tz - idx3 = pd.DatetimeIndex._simple_new(idx.asi8, tz='US/Pacific') - tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) - assert not idx.equals(idx3) - assert not idx.equals(idx3.copy()) - assert not idx.equals(idx3.astype(object)) - assert not idx.astype(object).equals(idx3) - assert not idx.equals(list(idx3)) - assert not idx.equals(pd.Series(idx3)) + idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02', 'NaT']) + assert idx.equals(idx) + assert idx.equals(idx.copy()) + assert idx.equals(idx.astype(object)) + assert idx.astype(object).equals(idx) + assert idx.astype(object).equals(idx.astype(object)) + assert not idx.equals(list(idx)) + assert not idx.equals(pd.Series(idx)) + + idx2 = pd.DatetimeIndex(['2011-01-01', '2011-01-02', 'NaT'], + tz='US/Pacific') + assert not idx.equals(idx2) + assert not idx.equals(idx2.copy()) + assert not idx.equals(idx2.astype(object)) + assert not idx.astype(object).equals(idx2) + assert not idx.equals(list(idx2)) + assert not idx.equals(pd.Series(idx2)) + + # same internal, different tz + idx3 = pd.DatetimeIndex._simple_new(idx.asi8, tz='US/Pacific') + tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) + assert not idx.equals(idx3) + assert not idx.equals(idx3.copy()) + assert not 
idx.equals(idx3.astype(object)) + assert not idx.astype(object).equals(idx3) + assert not idx.equals(list(idx3)) + assert not idx.equals(pd.Series(idx3)) class TestBusinessDatetimeIndex(object): diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index 83e7a0cd68d63..6f0756949edc6 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -38,18 +38,21 @@ def test_dti_date_out_of_range(self): pytest.raises(ValueError, DatetimeIndex, ['1400-01-01']) pytest.raises(ValueError, DatetimeIndex, [datetime(1400, 1, 1)]) - def test_dti_timestamp_fields(self): + @pytest.mark.parametrize('field', [ + 'dayofweek', 'dayofyear', 'week', 'weekofyear', 'quarter', + 'days_in_month', 'is_month_start', 'is_month_end', + 'is_quarter_start', 'is_quarter_end', 'is_year_start', + 'is_year_end', 'weekday_name']) + def test_dti_timestamp_fields(self, field): # extra fields from DatetimeIndex like quarter and week idx = tm.makeDateIndex(100) + expected = getattr(idx, field)[-1] + result = getattr(Timestamp(idx[-1]), field) + assert result == expected - fields = ['dayofweek', 'dayofyear', 'week', 'weekofyear', 'quarter', - 'days_in_month', 'is_month_start', 'is_month_end', - 'is_quarter_start', 'is_quarter_end', 'is_year_start', - 'is_year_end', 'weekday_name'] - for f in fields: - expected = getattr(idx, f)[-1] - result = getattr(Timestamp(idx[-1]), f) - assert result == expected + def test_dti_timestamp_freq_fields(self): + # extra fields from DatetimeIndex like quarter and week + idx = tm.makeDateIndex(100) assert idx.freq == Timestamp(idx[-1], idx.freq).freq assert idx.freqstr == Timestamp(idx[-1], idx.freq).freqstr diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 62854676d43be..217610b76cf0f 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -170,17 +170,17 @@ def test_dti_tz_convert_hour_overflow_dst_timestamps(self, tz): expected = Index([9, 9, 9]) tm.assert_index_equal(ut.hour, expected) - def test_dti_tz_convert_trans_pos_plus_1__bug(self): + @pytest.mark.parametrize('freq, n', [('H', 1), ('T', 60), ('S', 3600)]) + def test_dti_tz_convert_trans_pos_plus_1__bug(self, freq, n): # Regression test for tslib.tz_convert(vals, tz1, tz2). # See https://github.com/pandas-dev/pandas/issues/4496 for details. 
- for freq, n in [('H', 1), ('T', 60), ('S', 3600)]: - idx = date_range(datetime(2011, 3, 26, 23), - datetime(2011, 3, 27, 1), freq=freq) - idx = idx.tz_localize('UTC') - idx = idx.tz_convert('Europe/Moscow') + idx = date_range(datetime(2011, 3, 26, 23), + datetime(2011, 3, 27, 1), freq=freq) + idx = idx.tz_localize('UTC') + idx = idx.tz_convert('Europe/Moscow') - expected = np.repeat(np.array([3, 4, 5]), np.array([n, n, 1])) - tm.assert_index_equal(idx.hour, Index(expected)) + expected = np.repeat(np.array([3, 4, 5]), np.array([n, n, 1])) + tm.assert_index_equal(idx.hour, Index(expected)) def test_dti_tz_convert_dst(self): for freq, n in [('H', 1), ('T', 60), ('S', 3600)]: @@ -700,20 +700,20 @@ def test_dti_tz_constructors(self, tzstr): # ------------------------------------------------------------- # Unsorted - def test_join_utc_convert(self): + @pytest.mark.parametrize('how', ['inner', 'outer', 'left', 'right']) + def test_join_utc_convert(self, how): rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') left = rng.tz_convert('US/Eastern') right = rng.tz_convert('Europe/Berlin') - for how in ['inner', 'outer', 'left', 'right']: - result = left.join(left[:-5], how=how) - assert isinstance(result, DatetimeIndex) - assert result.tz == left.tz + result = left.join(left[:-5], how=how) + assert isinstance(result, DatetimeIndex) + assert result.tz == left.tz - result = left.join(right[:-5], how=how) - assert isinstance(result, DatetimeIndex) - assert result.tz.zone == 'UTC' + result = left.join(right[:-5], how=how) + assert isinstance(result, DatetimeIndex) + assert result.tz.zone == 'UTC' def test_dti_drop_dont_lose_tz(self): # GH#2621 diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index b5926933544e8..fbf0977a04d82 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -1013,18 +1013,20 @@ def test_string_na_nat_conversion(self, cache): assert_series_equal(dresult, expected, check_names=False) assert dresult.name == 'foo' + @pytest.mark.parametrize('dtype', [ + 'datetime64[h]', 'datetime64[m]', + 'datetime64[s]', 'datetime64[ms]', + 'datetime64[us]', 'datetime64[ns]']) @pytest.mark.parametrize('cache', [True, False]) - def test_dti_constructor_numpy_timeunits(self, cache): + def test_dti_constructor_numpy_timeunits(self, cache, dtype): # GH 9114 base = pd.to_datetime(['2000-01-01T00:00', '2000-01-02T00:00', 'NaT'], cache=cache) - for dtype in ['datetime64[h]', 'datetime64[m]', 'datetime64[s]', - 'datetime64[ms]', 'datetime64[us]', 'datetime64[ns]']: - values = base.values.astype(dtype) + values = base.values.astype(dtype) - tm.assert_index_equal(DatetimeIndex(values), base) - tm.assert_index_equal(to_datetime(values, cache=cache), base) + tm.assert_index_equal(DatetimeIndex(values), base) + tm.assert_index_equal(to_datetime(values, cache=cache), base) @pytest.mark.parametrize('cache', [True, False]) def test_dayfirst(self, cache): diff --git a/pandas/tests/indexes/period/test_arithmetic.py b/pandas/tests/indexes/period/test_arithmetic.py index 5f8f9533e9c44..e16d346542b9e 100644 --- a/pandas/tests/indexes/period/test_arithmetic.py +++ b/pandas/tests/indexes/period/test_arithmetic.py @@ -613,7 +613,8 @@ def test_pi_ops(self): exp = pd.Index([0, -1, -2, -3], name='idx') tm.assert_index_equal(result, exp) - def test_pi_ops_errors(self): + @pytest.mark.parametrize('ng', ["str", 1.5]) + def test_pi_ops_errors(self, ng): idx = PeriodIndex(['2011-01', '2011-02', '2011-03', 
'2011-04'], freq='M', name='idx') ser = pd.Series(idx) @@ -621,34 +622,33 @@ def test_pi_ops_errors(self): msg = r"unsupported operand type\(s\)" for obj in [idx, ser]: - for ng in ["str", 1.5]: - with tm.assert_raises_regex(TypeError, msg): - obj + ng + with tm.assert_raises_regex(TypeError, msg): + obj + ng - with pytest.raises(TypeError): - # error message differs between PY2 and 3 - ng + obj + with pytest.raises(TypeError): + # error message differs between PY2 and 3 + ng + obj + + with tm.assert_raises_regex(TypeError, msg): + obj - ng - with tm.assert_raises_regex(TypeError, msg): - obj - ng + with pytest.raises(TypeError): + np.add(obj, ng) + if _np_version_under1p10: + assert np.add(ng, obj) is NotImplemented + else: with pytest.raises(TypeError): - np.add(obj, ng) + np.add(ng, obj) - if _np_version_under1p10: - assert np.add(ng, obj) is NotImplemented - else: - with pytest.raises(TypeError): - np.add(ng, obj) + with pytest.raises(TypeError): + np.subtract(obj, ng) + if _np_version_under1p10: + assert np.subtract(ng, obj) is NotImplemented + else: with pytest.raises(TypeError): - np.subtract(obj, ng) - - if _np_version_under1p10: - assert np.subtract(ng, obj) is NotImplemented - else: - with pytest.raises(TypeError): - np.subtract(ng, obj) + np.subtract(ng, obj) def test_pi_ops_nat(self): idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], diff --git a/pandas/tests/indexes/period/test_asfreq.py b/pandas/tests/indexes/period/test_asfreq.py index c8724b2a3bc91..ea59a57069faa 100644 --- a/pandas/tests/indexes/period/test_asfreq.py +++ b/pandas/tests/indexes/period/test_asfreq.py @@ -8,9 +8,6 @@ class TestPeriodIndex(object): - def setup_method(self, method): - pass - def test_asfreq(self): pi1 = PeriodIndex(freq='A', start='1/1/2001', end='1/1/2001') pi2 = PeriodIndex(freq='Q', start='1/1/2001', end='1/1/2001') @@ -85,21 +82,21 @@ def test_asfreq_nat(self): expected = PeriodIndex(['2011Q1', '2011Q1', 'NaT', '2011Q2'], freq='Q') tm.assert_index_equal(result, expected) - def test_asfreq_mult_pi(self): + @pytest.mark.parametrize('freq', ['D', '3D']) + def test_asfreq_mult_pi(self, freq): pi = PeriodIndex(['2001-01', '2001-02', 'NaT', '2001-03'], freq='2M') - for freq in ['D', '3D']: - result = pi.asfreq(freq) - exp = PeriodIndex(['2001-02-28', '2001-03-31', 'NaT', - '2001-04-30'], freq=freq) - tm.assert_index_equal(result, exp) - assert result.freq == exp.freq - - result = pi.asfreq(freq, how='S') - exp = PeriodIndex(['2001-01-01', '2001-02-01', 'NaT', - '2001-03-01'], freq=freq) - tm.assert_index_equal(result, exp) - assert result.freq == exp.freq + result = pi.asfreq(freq) + exp = PeriodIndex(['2001-02-28', '2001-03-31', 'NaT', + '2001-04-30'], freq=freq) + tm.assert_index_equal(result, exp) + assert result.freq == exp.freq + + result = pi.asfreq(freq, how='S') + exp = PeriodIndex(['2001-01-01', '2001-02-01', 'NaT', + '2001-03-01'], freq=freq) + tm.assert_index_equal(result, exp) + assert result.freq == exp.freq def test_asfreq_combined_pi(self): pi = pd.PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', 'NaT'], diff --git a/pandas/tests/indexes/period/test_construction.py b/pandas/tests/indexes/period/test_construction.py index eca80d17b1dc3..be741592ec7a2 100644 --- a/pandas/tests/indexes/period/test_construction.py +++ b/pandas/tests/indexes/period/test_construction.py @@ -286,14 +286,14 @@ def test_constructor_simple_new_empty(self): result = idx._simple_new(idx, name='p', freq='M') tm.assert_index_equal(result, idx) - def test_constructor_floats(self): - # GH13079 - for 
floats in [[1.1, 2.1], np.array([1.1, 2.1])]: - with pytest.raises(TypeError): - pd.PeriodIndex._simple_new(floats, freq='M') + @pytest.mark.parametrize('floats', [[1.1, 2.1], np.array([1.1, 2.1])]) + def test_constructor_floats(self, floats): + # GH#13079 + with pytest.raises(TypeError): + pd.PeriodIndex._simple_new(floats, freq='M') - with pytest.raises(TypeError): - pd.PeriodIndex(floats, freq='M') + with pytest.raises(TypeError): + pd.PeriodIndex(floats, freq='M') def test_constructor_nat(self): pytest.raises(ValueError, period_range, start='NaT', @@ -343,16 +343,14 @@ def test_constructor_freq_mult(self): with tm.assert_raises_regex(ValueError, msg): period_range('2011-01', periods=3, freq='0M') - def test_constructor_freq_mult_dti_compat(self): - import itertools - mults = [1, 2, 3, 4, 5] - freqs = ['A', 'M', 'D', 'T', 'S'] - for mult, freq in itertools.product(mults, freqs): - freqstr = str(mult) + freq - pidx = PeriodIndex(start='2014-04-01', freq=freqstr, periods=10) - expected = date_range(start='2014-04-01', freq=freqstr, - periods=10).to_period(freqstr) - tm.assert_index_equal(pidx, expected) + @pytest.mark.parametrize('freq', ['A', 'M', 'D', 'T', 'S']) + @pytest.mark.parametrize('mult', [1, 2, 3, 4, 5]) + def test_constructor_freq_mult_dti_compat(self, mult, freq): + freqstr = str(mult) + freq + pidx = PeriodIndex(start='2014-04-01', freq=freqstr, periods=10) + expected = date_range(start='2014-04-01', freq=freqstr, + periods=10).to_period(freqstr) + tm.assert_index_equal(pidx, expected) def test_constructor_freq_combined(self): for freq in ['1D1H', '1H1D']: @@ -445,11 +443,12 @@ def test_constructor_error(self): with tm.assert_raises_regex(ValueError, msg): PeriodIndex(start=start) - def test_recreate_from_data(self): - for o in ['M', 'Q', 'A', 'D', 'B', 'T', 'S', 'L', 'U', 'N', 'H']: - org = PeriodIndex(start='2001/04/01', freq=o, periods=1) - idx = PeriodIndex(org.values, freq=o) - tm.assert_index_equal(idx, org) + @pytest.mark.parametrize('freq', ['M', 'Q', 'A', 'D', 'B', + 'T', 'S', 'L', 'U', 'N', 'H']) + def test_recreate_from_data(self, freq): + org = PeriodIndex(start='2001/04/01', freq=freq, periods=1) + idx = PeriodIndex(org.values, freq=freq) + tm.assert_index_equal(idx, org) def test_map_with_string_constructor(self): raw = [2005, 2007, 2009] diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 3b6641bc7ad5c..7d117b0b626cf 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -1,5 +1,6 @@ import numpy as np +import pytest import pandas as pd import pandas._libs.tslib as tslib @@ -368,37 +369,37 @@ def test_nat(self): tm.assert_numpy_array_equal(idx._nan_idxs, np.array([1], dtype=np.intp)) - def test_equals(self): - # GH 13107 - for freq in ['D', 'M']: - idx = pd.PeriodIndex(['2011-01-01', '2011-01-02', 'NaT'], - freq=freq) - assert idx.equals(idx) - assert idx.equals(idx.copy()) - assert idx.equals(idx.astype(object)) - assert idx.astype(object).equals(idx) - assert idx.astype(object).equals(idx.astype(object)) - assert not idx.equals(list(idx)) - assert not idx.equals(pd.Series(idx)) - - idx2 = pd.PeriodIndex(['2011-01-01', '2011-01-02', 'NaT'], - freq='H') - assert not idx.equals(idx2) - assert not idx.equals(idx2.copy()) - assert not idx.equals(idx2.astype(object)) - assert not idx.astype(object).equals(idx2) - assert not idx.equals(list(idx2)) - assert not idx.equals(pd.Series(idx2)) - - # same internal, different tz - idx3 = pd.PeriodIndex._simple_new(idx.asi8, 
freq='H') - tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) - assert not idx.equals(idx3) - assert not idx.equals(idx3.copy()) - assert not idx.equals(idx3.astype(object)) - assert not idx.astype(object).equals(idx3) - assert not idx.equals(list(idx3)) - assert not idx.equals(pd.Series(idx3)) + @pytest.mark.parametrize('freq', ['D', 'M']) + def test_equals(self, freq): + # GH#13107 + idx = pd.PeriodIndex(['2011-01-01', '2011-01-02', 'NaT'], + freq=freq) + assert idx.equals(idx) + assert idx.equals(idx.copy()) + assert idx.equals(idx.astype(object)) + assert idx.astype(object).equals(idx) + assert idx.astype(object).equals(idx.astype(object)) + assert not idx.equals(list(idx)) + assert not idx.equals(pd.Series(idx)) + + idx2 = pd.PeriodIndex(['2011-01-01', '2011-01-02', 'NaT'], + freq='H') + assert not idx.equals(idx2) + assert not idx.equals(idx2.copy()) + assert not idx.equals(idx2.astype(object)) + assert not idx.astype(object).equals(idx2) + assert not idx.equals(list(idx2)) + assert not idx.equals(pd.Series(idx2)) + + # same internal, different tz + idx3 = pd.PeriodIndex._simple_new(idx.asi8, freq='H') + tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) + assert not idx.equals(idx3) + assert not idx.equals(idx3.copy()) + assert not idx.equals(idx3.astype(object)) + assert not idx.astype(object).equals(idx3) + assert not idx.equals(list(idx3)) + assert not idx.equals(pd.Series(idx3)) class TestPeriodIndexSeriesMethods(object): diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 4c0c865928031..dd437363cfc1d 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -27,11 +27,11 @@ def create_index(self): def test_pickle_compat_construction(self): pass - def test_pickle_round_trip(self): - for freq in ['D', 'M', 'A']: - idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq=freq) - result = tm.round_trip_pickle(idx) - tm.assert_index_equal(result, idx) + @pytest.mark.parametrize('freq', ['D', 'M', 'A']) + def test_pickle_round_trip(self, freq): + idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq=freq) + result = tm.round_trip_pickle(idx) + tm.assert_index_equal(result, idx) @pytest.mark.parametrize('klass', [list, tuple, np.array, Series]) def test_where(self, klass): diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py index 1ac05f9fa94b7..ec0836dfa174b 100644 --- a/pandas/tests/indexes/period/test_setops.py +++ b/pandas/tests/indexes/period/test_setops.py @@ -14,24 +14,21 @@ def _permute(obj): class TestPeriodIndex(object): - def setup_method(self, method): - pass - - def test_joins(self): + @pytest.mark.parametrize('kind', ['inner', 'outer', 'left', 'right']) + def test_joins(self, kind): index = period_range('1/1/2000', '1/20/2000', freq='D') - for kind in ['inner', 'outer', 'left', 'right']: - joined = index.join(index[:-5], how=kind) + joined = index.join(index[:-5], how=kind) - assert isinstance(joined, PeriodIndex) - assert joined.freq == index.freq + assert isinstance(joined, PeriodIndex) + assert joined.freq == index.freq - def test_join_self(self): + @pytest.mark.parametrize('kind', ['inner', 'outer', 'left', 'right']) + def test_join_self(self, kind): index = period_range('1/1/2000', '1/20/2000', freq='D') - for kind in ['inner', 'outer', 'left', 'right']: - res = index.join(index, how=kind) - assert index is res + res = index.join(index, how=kind) + assert index is res def test_join_does_not_recur(self): df = 
tm.makeCustomDataframe( diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py index 97500f2f5ed95..38c6f257b2206 100644 --- a/pandas/tests/indexes/period/test_tools.py +++ b/pandas/tests/indexes/period/test_tools.py @@ -1,5 +1,6 @@ import numpy as np from datetime import datetime, timedelta +import pytest import pandas as pd import pandas.util.testing as tm @@ -29,32 +30,10 @@ def test_annual(self): def test_monthly(self): self._check_freq('M', '1970-01') - def test_weekly(self): - self._check_freq('W-THU', '1970-01-01') - - def test_daily(self): - self._check_freq('D', '1970-01-01') - - def test_business_daily(self): - self._check_freq('B', '1970-01-01') - - def test_hourly(self): - self._check_freq('H', '1970-01-01') - - def test_minutely(self): - self._check_freq('T', '1970-01-01') - - def test_secondly(self): - self._check_freq('S', '1970-01-01') - - def test_millisecondly(self): - self._check_freq('L', '1970-01-01') - - def test_microsecondly(self): - self._check_freq('U', '1970-01-01') - - def test_nanosecondly(self): - self._check_freq('N', '1970-01-01') + @pytest.mark.parametrize('freq', ['W-THU', 'D', 'B', 'H', 'T', + 'S', 'L', 'U', 'N']) + def test_freq(self, freq): + self._check_freq(freq, '1970-01-01') def test_negone_ordinals(self): freqs = ['A', 'M', 'Q', 'D', 'H', 'T', 'S'] @@ -75,19 +54,6 @@ def test_negone_ordinals(self): class TestPeriodIndex(object): - - def setup_method(self, method): - pass - - def test_tolist(self): - index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') - rs = index.tolist() - for x in rs: - assert isinstance(x, Period) - - recon = PeriodIndex(rs) - tm.assert_index_equal(index, recon) - def test_to_timestamp(self): index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') series = Series(1, index=index, name='foo') @@ -129,24 +95,6 @@ def _get_with_delta(delta, freq='A-DEC'): tm.assert_index_equal(result.index, exp_index) assert result.name == 'foo' - def test_to_timestamp_quarterly_bug(self): - years = np.arange(1960, 2000).repeat(4) - quarters = np.tile(lrange(1, 5), 40) - - pindex = PeriodIndex(year=years, quarter=quarters) - - stamps = pindex.to_timestamp('D', 'end') - expected = DatetimeIndex([x.to_timestamp('D', 'end') for x in pindex]) - tm.assert_index_equal(stamps, expected) - - def test_to_timestamp_preserve_name(self): - index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009', - name='foo') - assert index.name == 'foo' - - conv = index.to_timestamp('D') - assert conv.name == 'foo' - def test_to_timestamp_repr_is_code(self): zs = [Timestamp('99-04-17 00:00:00', tz='UTC'), Timestamp('2001-04-17 00:00:00', tz='UTC'), @@ -155,57 +103,6 @@ def test_to_timestamp_repr_is_code(self): for z in zs: assert eval(repr(z)) == z - def test_to_timestamp_pi_nat(self): - # GH 7228 - index = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='M', - name='idx') - - result = index.to_timestamp('D') - expected = DatetimeIndex([pd.NaT, datetime(2011, 1, 1), - datetime(2011, 2, 1)], name='idx') - tm.assert_index_equal(result, expected) - assert result.name == 'idx' - - result2 = result.to_period(freq='M') - tm.assert_index_equal(result2, index) - assert result2.name == 'idx' - - result3 = result.to_period(freq='3M') - exp = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='3M', name='idx') - tm.assert_index_equal(result3, exp) - assert result3.freqstr == '3M' - - msg = ('Frequency must be positive, because it' - ' represents span: -2A') - with tm.assert_raises_regex(ValueError, msg): - 
result.to_period(freq='-2A') - - def test_to_timestamp_pi_mult(self): - idx = PeriodIndex(['2011-01', 'NaT', '2011-02'], freq='2M', name='idx') - result = idx.to_timestamp() - expected = DatetimeIndex( - ['2011-01-01', 'NaT', '2011-02-01'], name='idx') - tm.assert_index_equal(result, expected) - result = idx.to_timestamp(how='E') - expected = DatetimeIndex( - ['2011-02-28', 'NaT', '2011-03-31'], name='idx') - tm.assert_index_equal(result, expected) - - def test_to_timestamp_pi_combined(self): - idx = PeriodIndex(start='2011', periods=2, freq='1D1H', name='idx') - result = idx.to_timestamp() - expected = DatetimeIndex( - ['2011-01-01 00:00', '2011-01-02 01:00'], name='idx') - tm.assert_index_equal(result, expected) - result = idx.to_timestamp(how='E') - expected = DatetimeIndex( - ['2011-01-02 00:59:59', '2011-01-03 01:59:59'], name='idx') - tm.assert_index_equal(result, expected) - result = idx.to_timestamp(how='E', freq='H') - expected = DatetimeIndex( - ['2011-01-02 00:00', '2011-01-03 01:00'], name='idx') - tm.assert_index_equal(result, expected) - def test_to_timestamp_to_period_astype(self): idx = DatetimeIndex([pd.NaT, '2011-01-01', '2011-02-01'], name='idx') @@ -238,47 +135,26 @@ def test_dti_to_period(self): tm.assert_index_equal(pi3, period_range('1/1/2005', '11/1/2005', freq='M').asfreq('3D')) - def test_period_astype_to_timestamp(self): - pi = pd.PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='M') - - exp = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01']) - tm.assert_index_equal(pi.astype('datetime64[ns]'), exp) - - exp = pd.DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31']) - tm.assert_index_equal(pi.astype('datetime64[ns]', how='end'), exp) - - exp = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'], - tz='US/Eastern') - res = pi.astype('datetime64[ns, US/Eastern]') - tm.assert_index_equal(pi.astype('datetime64[ns, US/Eastern]'), exp) - - exp = pd.DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31'], - tz='US/Eastern') - res = pi.astype('datetime64[ns, US/Eastern]', how='end') - tm.assert_index_equal(res, exp) - - def test_to_period_quarterly(self): + @pytest.mark.parametrize('month', MONTHS) + def test_to_period_quarterly(self, month): # make sure we can make the round trip - for month in MONTHS: - freq = 'Q-%s' % month - rng = period_range('1989Q3', '1991Q3', freq=freq) - stamps = rng.to_timestamp() - result = stamps.to_period(freq) - tm.assert_index_equal(rng, result) - - def test_to_period_quarterlyish(self): - offsets = ['BQ', 'QS', 'BQS'] - for off in offsets: - rng = date_range('01-Jan-2012', periods=8, freq=off) - prng = rng.to_period() - assert prng.freq == 'Q-DEC' + freq = 'Q-%s' % month + rng = period_range('1989Q3', '1991Q3', freq=freq) + stamps = rng.to_timestamp() + result = stamps.to_period(freq) + tm.assert_index_equal(rng, result) + + @pytest.mark.parametrize('off', ['BQ', 'QS', 'BQS']) + def test_to_period_quarterlyish(self, off): + rng = date_range('01-Jan-2012', periods=8, freq=off) + prng = rng.to_period() + assert prng.freq == 'Q-DEC' - def test_to_period_annualish(self): - offsets = ['BA', 'AS', 'BAS'] - for off in offsets: - rng = date_range('01-Jan-2012', periods=8, freq=off) - prng = rng.to_period() - assert prng.freq == 'A-DEC' + @pytest.mark.parametrize('off', ['BA', 'AS', 'BAS']) + def test_to_period_annualish(self, off): + rng = date_range('01-Jan-2012', periods=8, freq=off) + prng = rng.to_period() + assert prng.freq == 'A-DEC' def test_to_period_monthish(self): offsets = ['MS', 'BM'] @@ -304,12 +180,6 @@ def 
test_period_dt64_round_trip(self): pi = dti.to_period(freq='H') tm.assert_index_equal(pi.to_timestamp(), dti) - def test_to_timestamp_1703(self): - index = period_range('1/1/2012', periods=4, freq='D') - - result = index.to_timestamp() - assert result[0] == Timestamp('1/1/2012') - def test_combine_first(self): # GH 3367 didx = pd.DatetimeIndex(start='1950-01-31', end='1950-07-31', freq='M') @@ -325,26 +195,137 @@ def test_combine_first(self): dtype=np.float64) tm.assert_series_equal(result, expected) - def test_searchsorted(self): - for freq in ['D', '2D']: - pidx = pd.PeriodIndex(['2014-01-01', '2014-01-02', '2014-01-03', - '2014-01-04', '2014-01-05'], freq=freq) + @pytest.mark.parametrize('freq', ['D', '2D']) + def test_searchsorted(self, freq): + pidx = pd.PeriodIndex(['2014-01-01', '2014-01-02', '2014-01-03', + '2014-01-04', '2014-01-05'], freq=freq) + + p1 = pd.Period('2014-01-01', freq=freq) + assert pidx.searchsorted(p1) == 0 + + p2 = pd.Period('2014-01-04', freq=freq) + assert pidx.searchsorted(p2) == 3 + + msg = "Input has different freq=H from PeriodIndex" + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + pidx.searchsorted(pd.Period('2014-01-01', freq='H')) + + msg = "Input has different freq=5D from PeriodIndex" + with tm.assert_raises_regex(period.IncompatibleFrequency, msg): + pidx.searchsorted(pd.Period('2014-01-01', freq='5D')) + + with tm.assert_produces_warning(FutureWarning): + pidx.searchsorted(key=p2) + + +class TestPeriodIndexConversion(object): + def test_tolist(self): + index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + rs = index.tolist() + for x in rs: + assert isinstance(x, Period) + + recon = PeriodIndex(rs) + tm.assert_index_equal(index, recon) + + def test_to_timestamp_pi_nat(self): + # GH#7228 + index = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='M', + name='idx') + + result = index.to_timestamp('D') + expected = DatetimeIndex([pd.NaT, datetime(2011, 1, 1), + datetime(2011, 2, 1)], name='idx') + tm.assert_index_equal(result, expected) + assert result.name == 'idx' + + result2 = result.to_period(freq='M') + tm.assert_index_equal(result2, index) + assert result2.name == 'idx' + + result3 = result.to_period(freq='3M') + exp = PeriodIndex(['NaT', '2011-01', '2011-02'], + freq='3M', name='idx') + tm.assert_index_equal(result3, exp) + assert result3.freqstr == '3M' + + msg = ('Frequency must be positive, because it' + ' represents span: -2A') + with tm.assert_raises_regex(ValueError, msg): + result.to_period(freq='-2A') + + def test_to_timestamp_preserve_name(self): + index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009', + name='foo') + assert index.name == 'foo' + + conv = index.to_timestamp('D') + assert conv.name == 'foo' + + def test_to_timestamp_quarterly_bug(self): + years = np.arange(1960, 2000).repeat(4) + quarters = np.tile(lrange(1, 5), 40) + + pindex = PeriodIndex(year=years, quarter=quarters) + + stamps = pindex.to_timestamp('D', 'end') + expected = DatetimeIndex([x.to_timestamp('D', 'end') for x in pindex]) + tm.assert_index_equal(stamps, expected) + + def test_to_timestamp_pi_mult(self): + idx = PeriodIndex(['2011-01', 'NaT', '2011-02'], + freq='2M', name='idx') + + result = idx.to_timestamp() + expected = DatetimeIndex(['2011-01-01', 'NaT', '2011-02-01'], + name='idx') + tm.assert_index_equal(result, expected) - p1 = pd.Period('2014-01-01', freq=freq) - assert pidx.searchsorted(p1) == 0 + result = idx.to_timestamp(how='E') + expected = DatetimeIndex(['2011-02-28', 'NaT', '2011-03-31'], + name='idx') + 
tm.assert_index_equal(result, expected) - p2 = pd.Period('2014-01-04', freq=freq) - assert pidx.searchsorted(p2) == 3 + def test_to_timestamp_pi_combined(self): + idx = PeriodIndex(start='2011', periods=2, freq='1D1H', name='idx') - msg = "Input has different freq=H from PeriodIndex" - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - pidx.searchsorted(pd.Period('2014-01-01', freq='H')) + result = idx.to_timestamp() + expected = DatetimeIndex(['2011-01-01 00:00', '2011-01-02 01:00'], + name='idx') + tm.assert_index_equal(result, expected) - msg = "Input has different freq=5D from PeriodIndex" - with tm.assert_raises_regex( - period.IncompatibleFrequency, msg): - pidx.searchsorted(pd.Period('2014-01-01', freq='5D')) + result = idx.to_timestamp(how='E') + expected = DatetimeIndex(['2011-01-02 00:59:59', + '2011-01-03 01:59:59'], + name='idx') + tm.assert_index_equal(result, expected) - with tm.assert_produces_warning(FutureWarning): - pidx.searchsorted(key=p2) + result = idx.to_timestamp(how='E', freq='H') + expected = DatetimeIndex(['2011-01-02 00:00', '2011-01-03 01:00'], + name='idx') + tm.assert_index_equal(result, expected) + + def test_period_astype_to_timestamp(self): + pi = pd.PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='M') + + exp = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01']) + tm.assert_index_equal(pi.astype('datetime64[ns]'), exp) + + exp = pd.DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31']) + tm.assert_index_equal(pi.astype('datetime64[ns]', how='end'), exp) + + exp = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'], + tz='US/Eastern') + res = pi.astype('datetime64[ns, US/Eastern]') + tm.assert_index_equal(pi.astype('datetime64[ns, US/Eastern]'), exp) + + exp = pd.DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31'], + tz='US/Eastern') + res = pi.astype('datetime64[ns, US/Eastern]', how='end') + tm.assert_index_equal(res, exp) + + def test_to_timestamp_1703(self): + index = period_range('1/1/2012', periods=4, freq='D') + + result = index.to_timestamp() + assert result[0] == Timestamp('1/1/2012') diff --git a/pandas/tests/indexes/timedeltas/test_arithmetic.py b/pandas/tests/indexes/timedeltas/test_arithmetic.py index 029fdfcefc299..4141d66cb519b 100644 --- a/pandas/tests/indexes/timedeltas/test_arithmetic.py +++ b/pandas/tests/indexes/timedeltas/test_arithmetic.py @@ -671,19 +671,19 @@ def test_dti_tdi_numeric_ops(self): expected = DatetimeIndex(['20121231', pd.NaT, '20130101']) tm.assert_index_equal(result, expected) - def test_sub_period(self): + @pytest.mark.parametrize('freq', [None, 'H']) + def test_sub_period(self, freq): # GH 13078 # not supported, check TypeError p = pd.Period('2011-01-01', freq='D') - for freq in [None, 'H']: - idx = pd.TimedeltaIndex(['1 hours', '2 hours'], freq=freq) + idx = pd.TimedeltaIndex(['1 hours', '2 hours'], freq=freq) - with pytest.raises(TypeError): - idx - p + with pytest.raises(TypeError): + idx - p - with pytest.raises(TypeError): - p - idx + with pytest.raises(TypeError): + p - idx def test_addition_ops(self): # with datetimes/timedelta and tdi/dti diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index 690ba66b6f5ef..49737e5359c2f 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -227,14 +227,15 @@ def test_drop_duplicates(self): res = Series(idx).drop_duplicates(keep=False) tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) - def 
test_infer_freq(self): - # GH 11018 - for freq in ['D', '3D', '-3D', 'H', '2H', '-2H', 'T', '2T', 'S', '-3S' - ]: - idx = pd.timedelta_range('1', freq=freq, periods=10) - result = pd.TimedeltaIndex(idx.asi8, freq='infer') - tm.assert_index_equal(idx, result) - assert result.freq == freq + @pytest.mark.parametrize('freq', ['D', '3D', '-3D', + 'H', '2H', '-2H', + 'T', '2T', 'S', '-3S']) + def test_infer_freq(self, freq): + # GH#11018 + idx = pd.timedelta_range('1', freq=freq, periods=10) + result = pd.TimedeltaIndex(idx.asi8, freq='infer') + tm.assert_index_equal(idx, result) + assert result.freq == freq def test_nat_new(self): diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index ce0f3b89b753e..37db9d704aa1f 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -102,13 +102,11 @@ def test_factorize(self): tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, idx3) - def test_join_self(self): - + @pytest.mark.parametrize('kind', ['outer', 'inner', 'left', 'right']) + def test_join_self(self, kind): index = timedelta_range('1 day', periods=10) - kinds = 'outer', 'inner', 'left', 'right' - for kind in kinds: - joined = index.join(index, how=kind) - tm.assert_index_equal(index, joined) + joined = index.join(index, how=kind) + tm.assert_index_equal(index, joined) def test_does_not_convert_mixed_integer(self): df = tm.makeCustomDataframe(10, 10, diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 41b3bb55bfff1..dff5433adcf79 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -20,21 +20,21 @@ class TestPeriodProperties(object): "Test properties such as year, month, weekday, etc...." 
- def test_is_leap_year(self): + @pytest.mark.parametrize('freq', ['A', 'M', 'D', 'H']) + def test_is_leap_year(self, freq): # GH 13727 - for freq in ['A', 'M', 'D', 'H']: - p = Period('2000-01-01 00:00:00', freq=freq) - assert p.is_leap_year - assert isinstance(p.is_leap_year, bool) + p = Period('2000-01-01 00:00:00', freq=freq) + assert p.is_leap_year + assert isinstance(p.is_leap_year, bool) - p = Period('1999-01-01 00:00:00', freq=freq) - assert not p.is_leap_year + p = Period('1999-01-01 00:00:00', freq=freq) + assert not p.is_leap_year - p = Period('2004-01-01 00:00:00', freq=freq) - assert p.is_leap_year + p = Period('2004-01-01 00:00:00', freq=freq) + assert p.is_leap_year - p = Period('2100-01-01 00:00:00', freq=freq) - assert not p.is_leap_year + p = Period('2100-01-01 00:00:00', freq=freq) + assert not p.is_leap_year def test_quarterly_negative_ordinals(self): p = Period(ordinal=-1, freq='Q-DEC') @@ -52,40 +52,40 @@ def test_quarterly_negative_ordinals(self): assert p.month == 11 assert isinstance(p, Period) - def test_period_cons_quarterly(self): + @pytest.mark.parametrize('month', MONTHS) + def test_period_cons_quarterly(self, month): # bugs in scikits.timeseries - for month in MONTHS: - freq = 'Q-%s' % month - exp = Period('1989Q3', freq=freq) - assert '1989Q3' in str(exp) - stamp = exp.to_timestamp('D', how='end') - p = Period(stamp, freq=freq) - assert p == exp - - stamp = exp.to_timestamp('3D', how='end') - p = Period(stamp, freq=freq) - assert p == exp - - def test_period_cons_annual(self): + freq = 'Q-%s' % month + exp = Period('1989Q3', freq=freq) + assert '1989Q3' in str(exp) + stamp = exp.to_timestamp('D', how='end') + p = Period(stamp, freq=freq) + assert p == exp + + stamp = exp.to_timestamp('3D', how='end') + p = Period(stamp, freq=freq) + assert p == exp + + @pytest.mark.parametrize('month', MONTHS) + def test_period_cons_annual(self, month): # bugs in scikits.timeseries - for month in MONTHS: - freq = 'A-%s' % month - exp = Period('1989', freq=freq) - stamp = exp.to_timestamp('D', how='end') + timedelta(days=30) - p = Period(stamp, freq=freq) - assert p == exp + 1 - assert isinstance(p, Period) - - def test_period_cons_weekly(self): - for num in range(10, 17): - daystr = '2011-02-%d' % num - for day in DAYS: - freq = 'W-%s' % day - - result = Period(daystr, freq=freq) - expected = Period(daystr, freq='D').asfreq(freq) - assert result == expected - assert isinstance(result, Period) + freq = 'A-%s' % month + exp = Period('1989', freq=freq) + stamp = exp.to_timestamp('D', how='end') + timedelta(days=30) + p = Period(stamp, freq=freq) + assert p == exp + 1 + assert isinstance(p, Period) + + @pytest.mark.parametrize('day', DAYS) + @pytest.mark.parametrize('num', range(10, 17)) + def test_period_cons_weekly(self, num, day): + daystr = '2011-02-%d' % num + freq = 'W-%s' % day + + result = Period(daystr, freq=freq) + expected = Period(daystr, freq='D').asfreq(freq) + assert result == expected + assert isinstance(result, Period) def test_period_from_ordinal(self): p = pd.Period('2011-01', freq='M') @@ -212,58 +212,59 @@ def test_period_cons_combined(self): with tm.assert_raises_regex(ValueError, msg): Period('2011-01', freq='1D1W') - def test_timestamp_tz_arg(self): - for case in ['Europe/Brussels', 'Asia/Tokyo', 'US/Pacific']: - p = Period('1/1/2005', freq='M').to_timestamp(tz=case) - exp = Timestamp('1/1/2005', tz='UTC').tz_convert(case) - exp_zone = pytz.timezone(case).normalize(p) - - assert p == exp - assert p.tz == exp_zone.tzinfo - assert p.tz == exp.tz - - p = 
Period('1/1/2005', freq='3H').to_timestamp(tz=case) - exp = Timestamp('1/1/2005', tz='UTC').tz_convert(case) - exp_zone = pytz.timezone(case).normalize(p) - - assert p == exp - assert p.tz == exp_zone.tzinfo - assert p.tz == exp.tz - - p = Period('1/1/2005', freq='A').to_timestamp(freq='A', tz=case) - exp = Timestamp('31/12/2005', tz='UTC').tz_convert(case) - exp_zone = pytz.timezone(case).normalize(p) - - assert p == exp - assert p.tz == exp_zone.tzinfo - assert p.tz == exp.tz - - p = Period('1/1/2005', freq='A').to_timestamp(freq='3H', tz=case) - exp = Timestamp('1/1/2005', tz='UTC').tz_convert(case) - exp_zone = pytz.timezone(case).normalize(p) - - assert p == exp - assert p.tz == exp_zone.tzinfo - assert p.tz == exp.tz - - def test_timestamp_tz_arg_dateutil(self): + @pytest.mark.parametrize('tzstr', ['Europe/Brussels', + 'Asia/Tokyo', 'US/Pacific']) + def test_timestamp_tz_arg(self, tzstr): + p = Period('1/1/2005', freq='M').to_timestamp(tz=tzstr) + exp = Timestamp('1/1/2005', tz='UTC').tz_convert(tzstr) + exp_zone = pytz.timezone(tzstr).normalize(p) + + assert p == exp + assert p.tz == exp_zone.tzinfo + assert p.tz == exp.tz + + p = Period('1/1/2005', freq='3H').to_timestamp(tz=tzstr) + exp = Timestamp('1/1/2005', tz='UTC').tz_convert(tzstr) + exp_zone = pytz.timezone(tzstr).normalize(p) + + assert p == exp + assert p.tz == exp_zone.tzinfo + assert p.tz == exp.tz + + p = Period('1/1/2005', freq='A').to_timestamp(freq='A', tz=tzstr) + exp = Timestamp('31/12/2005', tz='UTC').tz_convert(tzstr) + exp_zone = pytz.timezone(tzstr).normalize(p) + + assert p == exp + assert p.tz == exp_zone.tzinfo + assert p.tz == exp.tz + + p = Period('1/1/2005', freq='A').to_timestamp(freq='3H', tz=tzstr) + exp = Timestamp('1/1/2005', tz='UTC').tz_convert(tzstr) + exp_zone = pytz.timezone(tzstr).normalize(p) + + assert p == exp + assert p.tz == exp_zone.tzinfo + assert p.tz == exp.tz + + @pytest.mark.parametrize('tzstr', ['dateutil/Europe/Brussels', + 'dateutil/Asia/Tokyo', + 'dateutil/US/Pacific']) + def test_timestamp_tz_arg_dateutil(self, tzstr): from pandas._libs.tslibs.timezones import dateutil_gettz from pandas._libs.tslibs.timezones import maybe_get_tz - for case in ['dateutil/Europe/Brussels', 'dateutil/Asia/Tokyo', - 'dateutil/US/Pacific']: - p = Period('1/1/2005', freq='M').to_timestamp( - tz=maybe_get_tz(case)) - exp = Timestamp('1/1/2005', tz='UTC').tz_convert(case) - assert p == exp - assert p.tz == dateutil_gettz(case.split('/', 1)[1]) - assert p.tz == exp.tz - - p = Period('1/1/2005', - freq='M').to_timestamp(freq='3H', tz=maybe_get_tz(case)) - exp = Timestamp('1/1/2005', tz='UTC').tz_convert(case) - assert p == exp - assert p.tz == dateutil_gettz(case.split('/', 1)[1]) - assert p.tz == exp.tz + tz = maybe_get_tz(tzstr) + p = Period('1/1/2005', freq='M').to_timestamp(tz=tz) + exp = Timestamp('1/1/2005', tz='UTC').tz_convert(tzstr) + assert p == exp + assert p.tz == dateutil_gettz(tzstr.split('/', 1)[1]) + assert p.tz == exp.tz + + p = Period('1/1/2005', freq='M').to_timestamp(freq='3H', tz=tz) + exp = Timestamp('1/1/2005', tz='UTC').tz_convert(tzstr) + assert p == exp + assert p.tz == dateutil_gettz(tzstr.split('/', 1)[1]) + assert p.tz == exp.tz def test_timestamp_tz_arg_dateutil_from_string(self): from pandas._libs.tslibs.timezones import dateutil_gettz @@ -1403,14 +1404,14 @@ def test_sub_offset_nat(self): timedelta(hours=23, minutes=30)]: assert p - o is tslib.NaT - def test_nat_ops(self): - for freq in ['M', '2M', '3M']: - p = Period('NaT', freq=freq) - assert p + 1 is tslib.NaT - assert 
1 + p is tslib.NaT - assert p - 1 is tslib.NaT - assert p - Period('2011-01', freq=freq) is tslib.NaT - assert Period('2011-01', freq=freq) - p is tslib.NaT + @pytest.mark.parametrize('freq', ['M', '2M', '3M']) + def test_nat_ops(self, freq): + p = Period('NaT', freq=freq) + assert p + 1 is tslib.NaT + assert 1 + p is tslib.NaT + assert p - 1 is tslib.NaT + assert p - Period('2011-01', freq=freq) is tslib.NaT + assert Period('2011-01', freq=freq) - p is tslib.NaT def test_period_ops_offset(self): p = Period('2011-04-01', freq='D') From 842d350a17785b8316d6e67311ee1396fa02429a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 21 Feb 2018 19:12:22 -0500 Subject: [PATCH 159/214] DEPR: remove pandas.core.common is_* (#19769) --- doc/source/whatsnew/v0.23.0.txt | 18 ++++++++++ pandas/core/base.py | 10 +++--- pandas/core/common.py | 60 --------------------------------- pandas/core/resample.py | 14 ++++---- pandas/tests/api/test_types.py | 38 --------------------- 5 files changed, 30 insertions(+), 110 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index c9951e0ec4378..f947cacbfde07 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -345,6 +345,23 @@ Convert to an xarray DataArray p.to_xarray() + + +.. _whatsnew_0230.api_breaking.core_common: + +pandas.core.common removals +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The following error & warning messages are removed from ``pandas.core.common`` (:issue:`13634`, :issue:`19769`): + +- ``PerformanceWarning`` +- ``UnsupportedFunctionCall`` +- ``UnsortedIndexError`` +- ``AbstractMethodError`` + +These are available from import from ``pandas.errors`` (since 0.19.0). + + .. _whatsnew_0230.api_breaking.apply: Changes to make output of ``DataFrame.apply`` consistent @@ -644,6 +661,7 @@ Removal of prior version deprecations/changes - The modules ``pandas.tools.hashing`` and ``pandas.util.hashing`` have been removed (:issue:`16223`) - The top-level functions ``pd.rolling_*``, ``pd.expanding_*`` and ``pd.ewm*`` have been removed (Deprecated since v0.18). Instead, use the DataFrame/Series methods :attr:`~DataFrame.rolling`, :attr:`~DataFrame.expanding` and :attr:`~DataFrame.ewm` (:issue:`18723`) +- Imports from ``pandas.core.common`` for functions such as ``is_datetime64_dtype`` are now removed. These are located in ``pandas.api.types``. (:issue:`13634`, :issue:`19769`) .. 
_whatsnew_0230.performance: diff --git a/pandas/core/base.py b/pandas/core/base.py index ebd69a5f9aac1..280b8849792e3 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -17,7 +17,7 @@ is_extension_array_dtype) from pandas.util._validators import validate_bool_kwarg - +from pandas.errors import AbstractMethodError from pandas.core import common as com, algorithms import pandas.core.nanops as nanops import pandas._libs.lib as lib @@ -46,7 +46,7 @@ class StringMixin(object): # Formatting def __unicode__(self): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) def __str__(self): """ @@ -278,10 +278,10 @@ def _gotitem(self, key, ndim, subset=None): subset to act on """ - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) def aggregate(self, func, *args, **kwargs): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) agg = aggregate @@ -1247,4 +1247,4 @@ def duplicated(self, keep='first'): # abstracts def _update_inplace(self, result, **kwargs): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) diff --git a/pandas/core/common.py b/pandas/core/common.py index 77dc1522052d4..c4fbcf28cbcae 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -2,8 +2,6 @@ Misc tools for implementing data structures """ -import sys -import warnings from datetime import datetime, timedelta from functools import partial import inspect @@ -20,66 +18,8 @@ from pandas.core.dtypes.inference import _iterable_not_string from pandas.core.dtypes.missing import isna, isnull, notnull # noqa from pandas.api import types -from pandas.core.dtypes import common from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike -# compat -from pandas.errors import ( # noqa - PerformanceWarning, UnsupportedFunctionCall, UnsortedIndexError, - AbstractMethodError) - -# back-compat of public API -# deprecate these functions -m = sys.modules['pandas.core.common'] -for t in [t for t in dir(types) if not t.startswith('_')]: - - def outer(t=t): - - def wrapper(*args, **kwargs): - warnings.warn("pandas.core.common.{t} is deprecated. " - "import from the public API: " - "pandas.api.types.{t} instead".format(t=t), - DeprecationWarning, stacklevel=3) - return getattr(types, t)(*args, **kwargs) - return wrapper - - setattr(m, t, outer(t)) - -# back-compat for non-public functions -# deprecate these functions -for t in ['is_datetime_arraylike', - 'is_datetime_or_timedelta_dtype', - 'is_datetimelike', - 'is_datetimelike_v_numeric', - 'is_datetimelike_v_object', - 'is_datetimetz', - 'is_int_or_datetime_dtype', - 'is_period_arraylike', - 'is_string_like', - 'is_string_like_dtype']: - - def outer(t=t): - - def wrapper(*args, **kwargs): - warnings.warn("pandas.core.common.{t} is deprecated. 
" - "These are not longer public API functions, " - "but can be imported from " - "pandas.api.types.{t} instead".format(t=t), - DeprecationWarning, stacklevel=3) - return getattr(common, t)(*args, **kwargs) - return wrapper - - setattr(m, t, outer(t)) - - -# deprecate array_equivalent - -def array_equivalent(*args, **kwargs): - warnings.warn("'pandas.core.common.array_equivalent' is deprecated and " - "is no longer public API", DeprecationWarning, stacklevel=2) - from pandas.core.dtypes import missing - return missing.array_equivalent(*args, **kwargs) - class SettingWithCopyError(ValueError): pass diff --git a/pandas/core/resample.py b/pandas/core/resample.py index df656092f476e..772568ee84737 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -16,7 +16,7 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.tseries.offsets import DateOffset, Tick, Day, delta_to_nanoseconds from pandas.core.indexes.period import PeriodIndex -import pandas.core.common as com +from pandas.errors import AbstractMethodError import pandas.core.algorithms as algos from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries @@ -205,10 +205,10 @@ def __setattr__(self, attr, value): def __getitem__(self, key): try: return super(Resampler, self).__getitem__(key) - except (KeyError, com.AbstractMethodError): + except (KeyError, AbstractMethodError): # compat for deprecated - if isinstance(self.obj, com.ABCSeries): + if isinstance(self.obj, ABCSeries): return self._deprecated('__getitem__')[key] raise @@ -233,7 +233,7 @@ def _convert_obj(self, obj): return obj def _get_binner_for_time(self): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) def _set_binner(self): """ @@ -372,10 +372,10 @@ def transform(self, arg, *args, **kwargs): arg, *args, **kwargs) def _downsample(self, f): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) def _upsample(self, f, limit=None, fill_value=None): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) def _gotitem(self, key, ndim, subset=None): """ @@ -464,7 +464,7 @@ def _get_resampler_for_grouping(self, groupby, **kwargs): def _wrap_result(self, result): """ potentially wrap any results """ - if isinstance(result, com.ABCSeries) and self._selection is not None: + if isinstance(result, ABCSeries) and self._selection is not None: result.name = self._selection if isinstance(result, ABCSeries) and result.empty: diff --git a/pandas/tests/api/test_types.py b/pandas/tests/api/test_types.py index 7e6430accc546..bd4891326c751 100644 --- a/pandas/tests/api/test_types.py +++ b/pandas/tests/api/test_types.py @@ -3,10 +3,8 @@ import pytest from warnings import catch_warnings -import numpy as np import pandas -from pandas.core import common as com from pandas.api import types from pandas.util import testing as tm @@ -52,42 +50,6 @@ def check_deprecation(self, fold, fnew): except AttributeError: pytest.raises(AttributeError, lambda: fnew('foo')) - def test_deprecation_core_common(self): - - # test that we are in fact deprecating - # the pandas.core.common introspectors - for t in self.allowed: - self.check_deprecation(getattr(com, t), getattr(types, t)) - - def test_deprecation_core_common_array_equivalent(self): - - with tm.assert_produces_warning(DeprecationWarning): - com.array_equivalent(np.array([1, 2]), np.array([1, 2])) - - def test_deprecation_core_common_moved(self): - - # these are in pandas.core.dtypes.common - l = ['is_datetime_arraylike', - 'is_datetime_or_timedelta_dtype', - 
'is_datetimelike', - 'is_datetimelike_v_numeric', - 'is_datetimelike_v_object', - 'is_datetimetz', - 'is_int_or_datetime_dtype', - 'is_period_arraylike', - 'is_string_like', - 'is_string_like_dtype'] - - from pandas.core.dtypes import common as c - for t in l: - self.check_deprecation(getattr(com, t), getattr(c, t)) - - def test_removed_from_core_common(self): - - for t in ['is_null_datelike_scalar', - 'ensure_float']: - pytest.raises(AttributeError, lambda: getattr(com, t)) - def test_deprecated_from_api_types(self): for t in self.deprecated: From 4ed83131d5f9e84a98d14dc4482ba1ff7e8895e3 Mon Sep 17 00:00:00 2001 From: HagaiHargil Date: Thu, 22 Feb 2018 02:16:21 +0200 Subject: [PATCH 160/214] DOC: Clarify and add fill_value example in arithmetic ops (#19675) --- pandas/core/ops.py | 97 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 90 insertions(+), 7 deletions(-) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 9e80ab3b3da4c..b20f208d14dc5 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -408,8 +408,10 @@ def _get_op_name(op, special): ---------- other : Series or scalar value fill_value : None or float value, default None (NaN) - Fill missing (NaN) values with this value. If both Series are - missing, the result will be missing + Fill existing missing (NaN) values, and any new element needed for + successful Series alignment, with this value before computation. + If data in both corresponding Series locations is missing + the result will be missing level : int or name Broadcast across a level, matching Index values on the passed MultiIndex level @@ -418,6 +420,30 @@ def _get_op_name(op, special): ------- result : Series +Examples +-------- +>>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) +>>> a +a 1.0 +b 1.0 +c 1.0 +d NaN +dtype: float64 +>>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) +>>> b +a 1.0 +b NaN +d 1.0 +e NaN +dtype: float64 +>>> a.add(b, fill_value=0) +a 2.0 +b 1.0 +c 1.0 +d 1.0 +e NaN +dtype: float64 + See also -------- Series.{reverse} @@ -433,8 +459,10 @@ def _get_op_name(op, special): axis : {0, 1, 'index', 'columns'} For Series input, axis to match Series index on fill_value : None or float value, default None - Fill missing (NaN) values with this value. If both DataFrame locations are - missing, the result will be missing + Fill existing missing (NaN) values, and any new element needed for + successful DataFrame alignment, with this value before computation. + If data in both corresponding DataFrame locations is missing + the result will be missing level : int or name Broadcast across a level, matching Index values on the passed MultiIndex level @@ -446,6 +474,33 @@ def _get_op_name(op, special): Returns ------- result : DataFrame + +Examples +-------- +>>> a = pd.DataFrame([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd'], + columns=['one']) +>>> a + one +a 1.0 +b 1.0 +c 1.0 +d NaN +>>> b = pd.DataFrame(dict(one=[1, np.nan, 1, np.nan], + two=[np.nan, 2, np.nan, 2]), + index=['a', 'b', 'd', 'e']) +>>> b + one two +a 1.0 NaN +b NaN 2.0 +d 1.0 NaN +e NaN 2.0 +>>> a.add(b, fill_value=0) + one two +a 2.0 NaN +b 1.0 2.0 +c 1.0 NaN +d 1.0 NaN +e NaN 2.0 """ _flex_doc_FRAME = """ @@ -460,8 +515,10 @@ def _get_op_name(op, special): axis : {{0, 1, 'index', 'columns'}} For Series input, axis to match Series index on fill_value : None or float value, default None - Fill missing (NaN) values with this value. 
If both DataFrame - locations are missing, the result will be missing + Fill existing missing (NaN) values, and any new element needed for + successful DataFrame alignment, with this value before computation. + If data in both corresponding DataFrame locations is missing + the result will be missing level : int or name Broadcast across a level, matching Index values on the passed MultiIndex level @@ -474,6 +531,33 @@ def _get_op_name(op, special): ------- result : DataFrame +Examples +-------- +>>> a = pd.DataFrame([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd'], + columns=['one']) +>>> a + one +a 1.0 +b 1.0 +c 1.0 +d NaN +>>> b = pd.DataFrame(dict(one=[1, np.nan, 1, np.nan], + two=[np.nan, 2, np.nan, 2]), + index=['a', 'b', 'd', 'e']) +>>> b + one two +a 1.0 NaN +b NaN 2.0 +d 1.0 NaN +e NaN 2.0 +>>> a.add(b, fill_value=0) + one two +a 2.0 NaN +b 1.0 2.0 +c 1.0 NaN +d 1.0 NaN +e NaN 2.0 + See also -------- DataFrame.{reverse} @@ -545,7 +629,6 @@ def _make_flex_doc(op_name, typ): base_doc = _flex_doc_PANEL else: raise AssertionError('Invalid typ argument.') - doc = base_doc.format(desc=op_desc['desc'], op_name=op_name, equiv=equiv, reverse=op_desc['reverse']) return doc From aa68c06e1ec7c4ddb887dbf33b799ef395fb4fd1 Mon Sep 17 00:00:00 2001 From: Marco Hemken Date: Wed, 21 Feb 2018 16:18:03 -0800 Subject: [PATCH 161/214] DOC: added plotting module to the api reference docs (#19780) --- doc/source/api.rst | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 3b38f0caa1766..b8aad67e147ba 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -2388,15 +2388,23 @@ Style Export and Import Styler.to_excel Plotting -~~~~~~~~ +-------- -.. currentmodule:: pandas +.. currentmodule:: pandas.plotting + +The following functions are contained in the `pandas.plotting` module. .. autosummary:: :toctree: generated/ - plotting.register_matplotlib_converters - plotting.deregister_matplotlib_converters + andrews_curves + bootstrap_plot + deregister_matplotlib_converters + lag_plot + parallel_coordinates + radviz + register_matplotlib_converters + scatter_matrix .. 
currentmodule:: pandas From 3b135c3c4424cfa10b955a0d505189f0a06e9122 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 21 Feb 2018 18:20:57 -0600 Subject: [PATCH 162/214] API: Validate keyword arguments to fillna (#19684) --- doc/source/whatsnew/v0.23.0.txt | 3 +- pandas/core/arrays/categorical.py | 5 +++- pandas/core/generic.py | 12 ++------ pandas/tests/categorical/test_missing.py | 16 +++++++++++ pandas/util/_validators.py | 36 ++++++++++++++++++++++++ 5 files changed, 61 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index f947cacbfde07..4c1e98b236db7 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -573,6 +573,7 @@ Datetimelike API Changes - Subtracting ``NaT`` from a :class:`Series` with ``dtype='datetime64[ns]'`` returns a ``Series`` with ``dtype='timedelta64[ns]'`` instead of ``dtype='datetime64[ns]'`` (:issue:`18808`) - Operations between a :class:`Series` with dtype ``dtype='datetime64[ns]'`` and a :class:`PeriodIndex` will correctly raises ``TypeError`` (:issue:`18850`) - Subtraction of :class:`Series` with timezone-aware ``dtype='datetime64[ns]'`` with mis-matched timezones will raise ``TypeError`` instead of ``ValueError`` (:issue:`18817`) +- :func:`pandas.merge` provides a more informative error message when trying to merge on timezone-aware and timezone-naive columns (:issue:`15800`) .. _whatsnew_0230.api.other: @@ -592,7 +593,6 @@ Other API Changes - :func:`Series.fillna` now raises a ``TypeError`` instead of a ``ValueError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`) - :func:`pandas.DataFrame.merge` no longer casts a ``float`` column to ``object`` when merging on ``int`` and ``float`` columns (:issue:`16572`) - :func:`pandas.merge` now raises a ``ValueError`` when trying to merge on incompatible data types (:issue:`9780`) -- :func:`pandas.merge` provides a more informative error message when trying to merge on timezone-aware and timezone-naive columns (:issue:`15800`) - The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN``, which impacts methods that mask with NA, such as ``UInt64Index.where()`` (:issue:`18398`) - Refactored ``setup.py`` to use ``find_packages`` instead of explicitly listing out all subpackages (:issue:`18535`) - Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:issue:`16672`) @@ -606,6 +606,7 @@ Other API Changes - :func:`Series.to_csv` now accepts a ``compression`` argument that works in the same way as the ``compression`` argument in :func:`DataFrame.to_csv` (:issue:`18958`) - Set operations (union, difference...) on :class:`IntervalIndex` with incompatible index types will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`19329`) - :class:`DateOffset` objects render more simply, e.g. ```` instead of ```` (:issue:`19403`) +- ``Categorical.fillna`` now validates its ``value`` and ``method`` keyword arguments. It now raises when both or none are specified, matching the behavior of :meth:`Series.fillna` (:issue:`19682`) .. 
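(Editor's aside, not part of the patch: a minimal sketch of the stricter ``Categorical.fillna`` validation described in the entry above, mirroring the new test; assumes ``pd`` is ``pandas`` with this change applied.)

>>> cat = pd.Categorical([1, 2, 3])
>>> cat.fillna(value=1, method='ffill')
Traceback (most recent call last):
    ...
ValueError: Cannot specify both 'value' and 'method'.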
_whatsnew_0230.deprecations: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 7354115f8295e..493b2e5bd899b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -40,7 +40,7 @@ Appender, cache_readonly, deprecate_kwarg, Substitution) from pandas.io.formats.terminal import get_terminal_size -from pandas.util._validators import validate_bool_kwarg +from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs from pandas.core.config import get_option from .base import ExtensionArray @@ -1610,6 +1610,9 @@ def fillna(self, value=None, method=None, limit=None): ------- filled : Categorical with NA/NaN filled """ + value, method = validate_fillna_kwargs( + value, method, validate_scalar_dict_value=False + ) if value is None: value = np.nan diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 297450417e3cf..8034cf89cf8b7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -54,7 +54,7 @@ import pandas.core.nanops as nanops from pandas.util._decorators import (Appender, Substitution, deprecate_kwarg) -from pandas.util._validators import validate_bool_kwarg +from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs from pandas.core import config # goal is to be able to define the docs close to function, while still being @@ -4697,10 +4697,8 @@ def infer_objects(self): def fillna(self, value=None, method=None, axis=None, inplace=False, limit=None, downcast=None): inplace = validate_bool_kwarg(inplace, 'inplace') + value, method = validate_fillna_kwargs(value, method) - if isinstance(value, (list, tuple)): - raise TypeError('"value" parameter must be a scalar or dict, but ' - 'you passed a "{0}"'.format(type(value).__name__)) self._consolidate_inplace() # set the default here, so functions examining the signaure @@ -4711,8 +4709,7 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, method = missing.clean_fill_method(method) from pandas import DataFrame if value is None: - if method is None: - raise ValueError('must specify a fill method or value') + if self._is_mixed_type and axis == 1: if inplace: raise NotImplementedError() @@ -4746,9 +4743,6 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, coerce=True, downcast=downcast) else: - if method is not None: - raise ValueError('cannot specify both a fill method and value') - if len(self._get_axis(axis)) == 0: return self diff --git a/pandas/tests/categorical/test_missing.py b/pandas/tests/categorical/test_missing.py index 79758dee5cfda..fca5573547071 100644 --- a/pandas/tests/categorical/test_missing.py +++ b/pandas/tests/categorical/test_missing.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import numpy as np +import pytest import pandas.util.testing as tm from pandas import (Categorical, Index, isna) @@ -53,3 +54,18 @@ def test_set_item_nan(self): exp = Categorical([1, np.nan, 3], categories=[1, 2, 3]) tm.assert_categorical_equal(cat, exp) + + @pytest.mark.parametrize('fillna_kwargs, msg', [ + (dict(value=1, method='ffill'), + "Cannot specify both 'value' and 'method'."), + (dict(), + "Must specify a fill 'value' or 'method'."), + (dict(method='bad'), + "Invalid fill method. 
Expecting .* bad"), + ]) + def test_fillna_raises(self, fillna_kwargs, msg): + # https://github.com/pandas-dev/pandas/issues/19682 + cat = Categorical([1, 2, 3]) + + with tm.assert_raises_regex(ValueError, msg): + cat.fillna(**fillna_kwargs) diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index b30ffc7416f92..a96563051e7de 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -320,3 +320,39 @@ def validate_axis_style_args(data, args, kwargs, arg_name, method_name): msg = "Cannot specify all of '{}', 'index', 'columns'." raise TypeError(msg.format(arg_name)) return out + + +def validate_fillna_kwargs(value, method, validate_scalar_dict_value=True): + """Validate the keyword arguments to 'fillna'. + + This checks that exactly one of 'value' and 'method' is specified. + If 'method' is specified, this validates that it's a valid method. + + Parameters + ---------- + value, method : object + The 'value' and 'method' keyword arguments for 'fillna'. + validate_scalar_dict_value : bool, default True + Whether to validate that 'value' is a scalar or dict. Specifically, + validate that it is not a list or tuple. + + Returns + ------- + value, method : object + """ + from pandas.core.missing import clean_fill_method + + if value is None and method is None: + raise ValueError("Must specify a fill 'value' or 'method'.") + elif value is None and method is not None: + method = clean_fill_method(method) + + elif value is not None and method is None: + if validate_scalar_dict_value and isinstance(value, (list, tuple)): + raise TypeError('"value" parameter must be a scalar or dict, but ' + 'you passed a "{0}"'.format(type(value).__name__)) + + elif value is not None and method is not None: + raise ValueError("Cannot specify both 'value' and 'method'.") + + return value, method From 3ab862326249979649b0a2611419f2e85a032168 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 21 Feb 2018 17:37:33 -0800 Subject: [PATCH 163/214] Fix Index __mul__-like ops with timedelta scalars (#19333) --- doc/source/whatsnew/v0.23.0.txt | 2 + pandas/core/indexes/base.py | 23 +++++++++-- pandas/core/indexes/range.py | 12 +++++- pandas/tests/indexes/test_numeric.py | 38 ++++++++++++++++++- .../indexes/timedeltas/test_arithmetic.py | 16 +++++++- 5 files changed, 85 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 4c1e98b236db7..76c4fa08fca4d 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -765,6 +765,7 @@ Timedelta - Bug in :func:`Timedelta.__floordiv__`, :func:`Timedelta.__rfloordiv__` where operating with a ``Tick`` object would raise a ``TypeError`` instead of returning a numeric value (:issue:`19738`) - Bug in :func:`Period.asfreq` where periods near ``datetime(1, 1, 1)`` could be converted incorrectly (:issue:`19643`) - Bug in :func:`Timedelta.total_seconds()` causing precision errors i.e. 
``Timedelta('30S').total_seconds()==30.000000000000004`` (:issue:`19458`) +- Multiplication of :class:`TimedeltaIndex` by ``TimedeltaIndex`` will now raise ``TypeError`` instead of raising ``ValueError`` in cases of length mis-match (:issue`19333`) - Timezones @@ -799,6 +800,7 @@ Numeric - Bug in the :class:`DataFrame` constructor in which data containing very large positive or very large negative numbers was causing ``OverflowError`` (:issue:`18584`) - Bug in :class:`Index` constructor with ``dtype='uint64'`` where int-like floats were not coerced to :class:`UInt64Index` (:issue:`18400`) - Bug in :class:`DataFrame` flex arithmetic (e.g. ``df.add(other, fill_value=foo)``) with a ``fill_value`` other than ``None`` failed to raise ``NotImplementedError`` in corner cases where either the frame or ``other`` has length zero (:issue:`19522`) +- Multiplication and division of numeric-dtyped :class:`Index` objects with timedelta-like scalars returns ``TimedeltaIndex`` instead of raising ``TypeError`` (:issue:`19333`) Indexing diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7dfa34bd634ad..59fe4bba649d3 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5,7 +5,7 @@ import numpy as np from pandas._libs import (lib, index as libindex, tslib as libts, algos as libalgos, join as libjoin, - Timestamp) + Timestamp, Timedelta) from pandas._libs.lib import is_datetime_array from pandas.compat import range, u, set_function_name @@ -16,7 +16,7 @@ from pandas.core.dtypes.generic import ( ABCSeries, ABCDataFrame, ABCMultiIndex, - ABCPeriodIndex, + ABCPeriodIndex, ABCTimedeltaIndex, ABCDateOffset) from pandas.core.dtypes.missing import isna, array_equivalent from pandas.core.dtypes.common import ( @@ -3918,7 +3918,21 @@ def dropna(self, how='any'): return self._shallow_copy() def _evaluate_with_timedelta_like(self, other, op, opstr, reversed=False): - raise TypeError("can only perform ops with timedelta like values") + # Timedelta knows how to operate with np.array, so dispatch to that + # operation and then wrap the results + other = Timedelta(other) + values = self.values + if reversed: + values, other = other, values + + with np.errstate(all='ignore'): + result = op(values, other) + + attrs = self._get_attributes_dict() + attrs = self._maybe_update_attributes(attrs) + if op == divmod: + return Index(result[0], **attrs), Index(result[1], **attrs) + return Index(result, **attrs) def _evaluate_with_datetime_like(self, other, op, opstr): raise TypeError("can only perform ops with datetime like values") @@ -4061,6 +4075,9 @@ def _make_evaluate_binop(op, opstr, reversed=False, constructor=Index): def _evaluate_numeric_binop(self, other): if isinstance(other, (ABCSeries, ABCDataFrame)): return NotImplemented + elif isinstance(other, ABCTimedeltaIndex): + # Defer to subclass implementation + return NotImplemented other = self._validate_for_numeric_binop(other, op, opstr) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 0ed92a67c7e14..0ac415ee0b701 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1,5 +1,6 @@ from sys import getsizeof import operator +from datetime import timedelta import numpy as np from pandas._libs import index as libindex @@ -8,7 +9,7 @@ is_integer, is_scalar, is_int64_dtype) -from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.generic import ABCSeries, ABCTimedeltaIndex from pandas import compat from pandas.compat import lrange, range, get_range_parameters @@ 
-587,6 +588,15 @@ def _make_evaluate_binop(op, opstr, reversed=False, step=False): def _evaluate_numeric_binop(self, other): if isinstance(other, ABCSeries): return NotImplemented + elif isinstance(other, ABCTimedeltaIndex): + # Defer to TimedeltaIndex implementation + return NotImplemented + elif isinstance(other, (timedelta, np.timedelta64)): + # GH#19333 is_integer evaluated True on timedelta64, + # so we need to catch these explicitly + if reversed: + return op(other, self._int64index) + return op(self._int64index, other) other = self._validate_for_numeric_binop(other, op, opstr) attrs = self._get_attributes_dict() diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index c6883df7ee91a..bafb6ae2e45f4 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -13,7 +13,7 @@ import pandas.util.testing as tm import pandas as pd -from pandas._libs.tslib import Timestamp +from pandas._libs.tslib import Timestamp, Timedelta from pandas.tests.indexes.common import Base @@ -26,6 +26,42 @@ def full_like(array, value): return ret +class TestIndexArithmeticWithTimedeltaScalar(object): + + @pytest.mark.parametrize('index', [ + Int64Index(range(1, 11)), + UInt64Index(range(1, 11)), + Float64Index(range(1, 11)), + RangeIndex(1, 11)]) + @pytest.mark.parametrize('scalar_td', [Timedelta(days=1), + Timedelta(days=1).to_timedelta64(), + Timedelta(days=1).to_pytimedelta()]) + def test_index_mul_timedelta(self, scalar_td, index): + # GH#19333 + expected = pd.timedelta_range('1 days', '10 days') + + result = index * scalar_td + tm.assert_index_equal(result, expected) + commute = scalar_td * index + tm.assert_index_equal(commute, expected) + + @pytest.mark.parametrize('index', [Int64Index(range(1, 3)), + UInt64Index(range(1, 3)), + Float64Index(range(1, 3)), + RangeIndex(1, 3)]) + @pytest.mark.parametrize('scalar_td', [Timedelta(days=1), + Timedelta(days=1).to_timedelta64(), + Timedelta(days=1).to_pytimedelta()]) + def test_index_rdiv_timedelta(self, scalar_td, index): + expected = pd.TimedeltaIndex(['1 Day', '12 Hours']) + + result = scalar_td / index + tm.assert_index_equal(result, expected) + + with pytest.raises(TypeError): + index / scalar_td + + class Numeric(Base): def test_numeric_compat(self): diff --git a/pandas/tests/indexes/timedeltas/test_arithmetic.py b/pandas/tests/indexes/timedeltas/test_arithmetic.py index 4141d66cb519b..24341b3419859 100644 --- a/pandas/tests/indexes/timedeltas/test_arithmetic.py +++ b/pandas/tests/indexes/timedeltas/test_arithmetic.py @@ -368,7 +368,7 @@ def test_dti_mul_dti_raises(self): def test_dti_mul_too_short_raises(self): idx = self._holder(np.arange(5, dtype='int64')) - with pytest.raises(ValueError): + with pytest.raises(TypeError): idx * self._holder(np.arange(3)) with pytest.raises(ValueError): idx * np.array([1, 2]) @@ -544,6 +544,20 @@ def test_tdi_div_tdlike_scalar_with_nat(self, delta): result = rng / delta tm.assert_index_equal(result, expected) + @pytest.mark.parametrize('other', [np.arange(1, 11), + pd.Int64Index(range(1, 11)), + pd.UInt64Index(range(1, 11)), + pd.Float64Index(range(1, 11)), + pd.RangeIndex(1, 11)]) + def test_tdi_rmul_arraylike(self, other): + tdi = TimedeltaIndex(['1 Day'] * 10) + expected = timedelta_range('1 days', '10 days') + + result = other * tdi + tm.assert_index_equal(result, expected) + commute = tdi * other + tm.assert_index_equal(commute, expected) + def test_subtraction_ops(self): # with datetimes/timedelta and tdi/dti tdi = TimedeltaIndex(['1 days', 
pd.NaT, '2 days'], name='foo') From b27c541b9c4a07726c792ba481496eddd6191cd1 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Thu, 22 Feb 2018 02:05:42 +0000 Subject: [PATCH 164/214] DOC: Improving code quality of doc/make.py, PEP-8, refactoring and removing unused commands (#19631) (#19634) --- ci/lint.sh | 7 + doc/make.py | 567 +++++++++++----------------------- doc/source/index.rst.template | 10 +- 3 files changed, 198 insertions(+), 386 deletions(-) diff --git a/ci/lint.sh b/ci/lint.sh index 49bf9a690b990..b862a3bfcf29e 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -37,6 +37,13 @@ if [ "$LINT" ]; then fi echo "Linting scripts/*.py DONE" + echo "Linting doc script" + flake8 doc/make.py + if [ $? -ne "0" ]; then + RET=1 + fi + echo "Linting doc script DONE" + echo "Linting *.pyx" flake8 pandas --filename=*.pyx --select=E501,E302,E203,E111,E114,E221,E303,E128,E231,E126,E265,E305,E301,E127,E261,E271,E129,W291,E222,E241,E123,F403 if [ $? -ne "0" ]; then diff --git a/doc/make.py b/doc/make.py index acef563f301e4..e3cb29aa3e086 100755 --- a/doc/make.py +++ b/doc/make.py @@ -1,128 +1,62 @@ #!/usr/bin/env python - """ Python script for building documentation. To build the docs you must have all optional dependencies for pandas installed. See the installation instructions for a list of these. -Note: currently latex builds do not work because of table formats that are not -supported in the latex generation. - -2014-01-30: Latex has some issues but 'latex_forced' works ok for 0.13.0-400 or so - Usage ----- -python make.py clean -python make.py html + $ python make.py clean + $ python make.py html + $ python make.py latex """ -from __future__ import print_function - -import io -import glob # noqa +import sys import os import shutil -import sys +import subprocess +import argparse from contextlib import contextmanager +import jinja2 -import sphinx # noqa -import argparse -import jinja2 # noqa -os.environ['PYTHONPATH'] = '..' +DOC_PATH = os.path.dirname(os.path.abspath(__file__)) +SOURCE_PATH = os.path.join(DOC_PATH, 'source') +BUILD_PATH = os.path.join(DOC_PATH, 'build') +BUILD_DIRS = ['doctrees', 'html', 'latex', 'plots', '_static', '_templates'] -SPHINX_BUILD = 'sphinxbuild' +def _generate_index(include_api, single_doc=None): + """Create index.rst file with the specified sections. -def _process_user(user): - if user is None or user is False: - user = '' - else: - user = user + '@' - return user - - -def upload_dev(user=None): - 'push a copy to the pydata dev directory' - user = _process_user(user) - if os.system('cd build/html; rsync -avz . {0}pandas.pydata.org' - ':/usr/share/nginx/pandas/pandas-docs/dev/ -essh'.format(user)): - raise SystemExit('Upload to Pydata Dev failed') - - -def upload_dev_pdf(user=None): - 'push a copy to the pydata dev directory' - user = _process_user(user) - if os.system('cd build/latex; scp pandas.pdf {0}pandas.pydata.org' - ':/usr/share/nginx/pandas/pandas-docs/dev/'.format(user)): - raise SystemExit('PDF upload to Pydata Dev failed') - - -def upload_stable(user=None): - 'push a copy to the pydata stable directory' - user = _process_user(user) - if os.system('cd build/html; rsync -avz . 
{0}pandas.pydata.org' - ':/usr/share/nginx/pandas/pandas-docs/stable/ -essh'.format(user)): - raise SystemExit('Upload to stable failed') - - -def upload_stable_pdf(user=None): - 'push a copy to the pydata dev directory' - user = _process_user(user) - if os.system('cd build/latex; scp pandas.pdf {0}pandas.pydata.org' - ':/usr/share/nginx/pandas/pandas-docs/stable/'.format(user)): - raise SystemExit('PDF upload to stable failed') - - -def upload_prev(ver, doc_root='./', user=None): - 'push a copy of older release to appropriate version directory' - user = _process_user(user) - local_dir = doc_root + 'build/html' - remote_dir = '/usr/share/nginx/pandas/pandas-docs/version/%s/' % ver - cmd = 'cd %s; rsync -avz . %spandas.pydata.org:%s -essh' - cmd = cmd % (local_dir, user, remote_dir) - print(cmd) - if os.system(cmd): - raise SystemExit( - 'Upload to %s from %s failed' % (remote_dir, local_dir)) - - local_dir = doc_root + 'build/latex' - pdf_cmd = 'cd %s; scp pandas.pdf %spandas.pydata.org:%s' - pdf_cmd = pdf_cmd % (local_dir, user, remote_dir) - if os.system(pdf_cmd): - raise SystemExit('Upload PDF to %s from %s failed' % (ver, doc_root)) - -def build_pandas(): - os.chdir('..') - os.system('python setup.py clean') - os.system('python setup.py build_ext --inplace') - os.chdir('doc') - -def build_prev(ver): - if os.system('git checkout v%s' % ver) != 1: - os.chdir('..') - os.system('python setup.py clean') - os.system('python setup.py build_ext --inplace') - os.chdir('doc') - os.system('python make.py clean') - os.system('python make.py html') - os.system('python make.py latex') - os.system('git checkout master') - - -def clean(): - if os.path.exists('build'): - shutil.rmtree('build') - - if os.path.exists('source/generated'): - shutil.rmtree('source/generated') + Parameters + ---------- + include_api : bool + Whether API documentation will be built. + single_doc : str or None + If provided, this single documentation page will be generated. + """ + if single_doc is not None: + single_doc = os.path.splitext(os.path.basename(single_doc))[0] + include_api = False + + with open(os.path.join(SOURCE_PATH, 'index.rst.template')) as f: + t = jinja2.Template(f.read()) + + with open(os.path.join(SOURCE_PATH, 'index.rst'), 'w') as f: + f.write(t.render(include_api=include_api, + single_doc=single_doc)) @contextmanager -def maybe_exclude_notebooks(): - """ - Skip building the notebooks if pandoc is not installed. +def _maybe_exclude_notebooks(): + """Skip building the notebooks if pandoc is not installed. + This assumes that nbsphinx is installed. + + Skip notebook conversion if: + 1. nbconvert isn't installed, or + 2. nbconvert is installed, but pandoc isn't """ base = os.path.dirname(__file__) notebooks = [os.path.join(base, 'source', nb) @@ -135,304 +69,175 @@ def _remove_notebooks(): contents[nb] = f.read() os.remove(nb) - # Skip notebook conversion if - # 1. nbconvert isn't installed, or - # 2. nbconvert is installed, but pandoc isn't try: import nbconvert except ImportError: - print("Warning: nbconvert not installed. Skipping notebooks.") + sys.stderr.write('Warning: nbconvert not installed. ' + 'Skipping notebooks.\n') _remove_notebooks() else: try: nbconvert.utils.pandoc.get_pandoc_version() except nbconvert.utils.pandoc.PandocMissing: - print("Warning: Pandoc is not installed. Skipping notebooks.") + sys.stderr.write('Warning: Pandoc is not installed. 
' + 'Skipping notebooks.\n') _remove_notebooks() yield + for nb, content in contents.items(): with open(nb, 'wt') as f: f.write(content) -def html(): - check_build() - - with maybe_exclude_notebooks(): - if os.system('sphinx-build -P -b html -d build/doctrees ' - 'source build/html'): - raise SystemExit("Building HTML failed.") - try: - # remove stale file - os.remove('build/html/pandas.zip') - except: - pass - - -def zip_html(): - try: - print("\nZipping up HTML docs...") - # just in case the wonky build box doesn't have zip - # don't fail this. - os.system('cd build; rm -f html/pandas.zip; zip html/pandas.zip -r -q html/* ') - print("\n") - except: - pass - -def latex(): - check_build() - if sys.platform != 'win32': - # LaTeX format. - if os.system('sphinx-build -j 2 -b latex -d build/doctrees ' - 'source build/latex'): - raise SystemExit("Building LaTeX failed.") - # Produce pdf. - - os.chdir('build/latex') - - # Call the makefile produced by sphinx... - if os.system('make'): - print("Rendering LaTeX failed.") - print("You may still be able to get a usable PDF file by going into 'build/latex'") - print("and executing 'pdflatex pandas.tex' for the requisite number of passes.") - print("Or using the 'latex_forced' target") - raise SystemExit - - os.chdir('../..') - else: - print('latex build has not been tested on windows') - -def latex_forced(): - check_build() - if sys.platform != 'win32': - # LaTeX format. - if os.system('sphinx-build -j 2 -b latex -d build/doctrees ' - 'source build/latex'): - raise SystemExit("Building LaTeX failed.") - # Produce pdf. - - os.chdir('build/latex') - - # Manually call pdflatex, 3 passes should ensure latex fixes up - # all the required cross-references and such. - os.system('pdflatex -interaction=nonstopmode pandas.tex') - os.system('pdflatex -interaction=nonstopmode pandas.tex') - os.system('pdflatex -interaction=nonstopmode pandas.tex') - raise SystemExit("You should check the file 'build/latex/pandas.pdf' for problems.") - - os.chdir('../..') - else: - print('latex build has not been tested on windows') - - -def check_build(): - build_dirs = [ - 'build', 'build/doctrees', 'build/html', - 'build/latex', 'build/plots', 'build/_static', - 'build/_templates'] - for d in build_dirs: - try: - os.mkdir(d) - except OSError: - pass - - -def all(): - # clean() - html() - - -def auto_dev_build(debug=False): - msg = '' - try: - step = 'clean' - clean() - step = 'html' - html() - step = 'upload dev' - upload_dev() - if not debug: - sendmail(step) - - step = 'latex' - latex() - step = 'upload pdf' - upload_dev_pdf() - if not debug: - sendmail(step) - except (Exception, SystemExit) as inst: - msg = str(inst) + '\n' - sendmail(step, '[ERROR] ' + msg) - - -def sendmail(step=None, err_msg=None): - from_name, to_name = _get_config() - - if step is None: - step = '' - - if err_msg is None or '[ERROR]' not in err_msg: - msgstr = 'Daily docs %s completed successfully' % step - subject = "DOC: %s successful" % step - else: - msgstr = err_msg - subject = "DOC: %s failed" % step - - import smtplib - from email.MIMEText import MIMEText - msg = MIMEText(msgstr) - msg['Subject'] = subject - msg['From'] = from_name - msg['To'] = to_name - - server_str, port, login, pwd = _get_credentials() - server = smtplib.SMTP(server_str, port) - server.ehlo() - server.starttls() - server.ehlo() - - server.login(login, pwd) - try: - server.sendmail(from_name, to_name, msg.as_string()) - finally: - server.close() - - -def _get_dir(subdir=None): - import getpass - USERNAME = getpass.getuser() - 
if sys.platform == 'darwin': - HOME = '/Users/%s' % USERNAME - else: - HOME = '/home/%s' % USERNAME - - if subdir is None: - subdir = '/code/scripts/config' - conf_dir = '%s/%s' % (HOME, subdir) - return conf_dir - - -def _get_credentials(): - tmp_dir = _get_dir() - cred = '%s/credentials' % tmp_dir - with open(cred, 'r') as fh: - server, port, un, domain = fh.read().split(',') - port = int(port) - login = un + '@' + domain + '.com' - - import base64 - with open('%s/cron_email_pwd' % tmp_dir, 'r') as fh: - pwd = base64.b64decode(fh.read()) - - return server, port, login, pwd - - -def _get_config(): - tmp_dir = _get_dir() - with open('%s/addresses' % tmp_dir, 'r') as fh: - from_name, to_name = fh.read().split(',') - return from_name, to_name - -funcd = { - 'html': html, - 'zip_html': zip_html, - 'upload_dev': upload_dev, - 'upload_stable': upload_stable, - 'upload_dev_pdf': upload_dev_pdf, - 'upload_stable_pdf': upload_stable_pdf, - 'latex': latex, - 'latex_forced': latex_forced, - 'clean': clean, - 'auto_dev': auto_dev_build, - 'auto_debug': lambda: auto_dev_build(True), - 'build_pandas': build_pandas, - 'all': all, -} - -small_docs = False - -# current_dir = os.getcwd() -# os.chdir(os.path.dirname(os.path.join(current_dir, __file__))) - -import argparse -argparser = argparse.ArgumentParser(description=""" -pandas documentation builder -""".strip()) - -# argparser.add_argument('-arg_name', '--arg_name', -# metavar='label for arg help', -# type=str|etc, -# nargs='N|*|?|+|argparse.REMAINDER', -# required=False, -# #choices='abc', -# help='help string', -# action='store|store_true') - -# args = argparser.parse_args() - -#print args.accumulate(args.integers) - -def generate_index(api=True, single=False, **kwds): - from jinja2 import Template - with open("source/index.rst.template") as f: - t = Template(f.read()) +class DocBuilder: + """Class to wrap the different commands of this script. - with open("source/index.rst","w") as f: - f.write(t.render(api=api,single=single,**kwds)) + All public methods of this class can be called as parameters of the + script. + """ + def __init__(self, num_jobs=1): + self.num_jobs = num_jobs + + @staticmethod + def _create_build_structure(): + """Create directories required to build documentation.""" + for dirname in BUILD_DIRS: + try: + os.makedirs(os.path.join(BUILD_PATH, dirname)) + except OSError: + pass + + @staticmethod + def _run_os(*args): + """Execute a command as a OS terminal. + + Parameters + ---------- + *args : list of str + Command and parameters to be executed + + Examples + -------- + >>> DocBuilder()._run_os('python', '--version') + """ + subprocess.check_call(args, stderr=subprocess.STDOUT) + + def _sphinx_build(self, kind): + """Call sphinx to build documentation. + + Attribute `num_jobs` from the class is used. 
+ + Parameters + ---------- + kind : {'html', 'latex'} + + Examples + -------- + >>> DocBuilder(num_jobs=4)._sphinx_build('html') + """ + if kind not in ('html', 'latex'): + raise ValueError('kind must be html or latex, not {}'.format(kind)) + + self._run_os('sphinx-build', + '-j{}'.format(self.num_jobs), + '-b{}'.format(kind), + '-d{}'.format(os.path.join(BUILD_PATH, + 'doctrees')), + SOURCE_PATH, + os.path.join(BUILD_PATH, kind)) + + def html(self): + """Build HTML documentation.""" + self._create_build_structure() + with _maybe_exclude_notebooks(): + self._sphinx_build('html') + zip_fname = os.path.join(BUILD_PATH, 'html', 'pandas.zip') + if os.path.exists(zip_fname): + os.remove(zip_fname) + + def latex(self, force=False): + """Build PDF documentation.""" + self._create_build_structure() + if sys.platform == 'win32': + sys.stderr.write('latex build has not been tested on windows\n') + else: + self._sphinx_build('latex') + os.chdir(os.path.join(BUILD_PATH, 'latex')) + if force: + for i in range(3): + self._run_os('pdflatex', + '-interaction=nonstopmode', + 'pandas.tex') + raise SystemExit('You should check the file ' + '"build/latex/pandas.pdf" for problems.') + else: + self._run_os('make') + + def latex_forced(self): + """Build PDF documentation with retries to find missing references.""" + self.latex(force=True) + + @staticmethod + def clean(): + """Clean documentation generated files.""" + shutil.rmtree(BUILD_PATH, ignore_errors=True) + shutil.rmtree(os.path.join(SOURCE_PATH, 'generated'), + ignore_errors=True) + + def zip_html(self): + """Compress HTML documentation into a zip file.""" + zip_fname = os.path.join(BUILD_PATH, 'html', 'pandas.zip') + if os.path.exists(zip_fname): + os.remove(zip_fname) + dirname = os.path.join(BUILD_PATH, 'html') + fnames = os.listdir(dirname) + os.chdir(dirname) + self._run_os('zip', + zip_fname, + '-r', + '-q', + *fnames) -import argparse -argparser = argparse.ArgumentParser(description="pandas documentation builder", - epilog="Targets : %s" % funcd.keys()) - -argparser.add_argument('--no-api', - default=False, - help='Ommit api and autosummary', - action='store_true') -argparser.add_argument('--single', - metavar='FILENAME', - type=str, - default=False, - help='filename of section to compile, e.g. 
"indexing"') -argparser.add_argument('--user', - type=str, - default=False, - help='Username to connect to the pydata server') def main(): - args, unknown = argparser.parse_known_args() - sys.argv = [sys.argv[0]] + unknown - if args.single: - args.single = os.path.basename(args.single).split(".rst")[0] - - if 'clean' in unknown: - args.single=False - - generate_index(api=not args.no_api and not args.single, single=args.single) - - if len(sys.argv) > 2: - ftype = sys.argv[1] - ver = sys.argv[2] - - if ftype == 'build_previous': - build_prev(ver, user=args.user) - if ftype == 'upload_previous': - upload_prev(ver, user=args.user) - elif len(sys.argv) == 2: - for arg in sys.argv[1:]: - func = funcd.get(arg) - if func is None: - raise SystemExit('Do not know how to handle %s; valid args are %s' % ( - arg, list(funcd.keys()))) - if args.user: - func(user=args.user) - else: - func() - else: - small_docs = False - all() -# os.chdir(current_dir) + cmds = [method for method in dir(DocBuilder) if not method.startswith('_')] + + argparser = argparse.ArgumentParser( + description='pandas documentation builder', + epilog='Commands: {}'.format(','.join(cmds))) + argparser.add_argument('command', + nargs='?', + default='html', + help='command to run: {}'.format(', '.join(cmds))) + argparser.add_argument('--num-jobs', + type=int, + default=1, + help='number of jobs used by sphinx-build') + argparser.add_argument('--no-api', + default=False, + help='ommit api and autosummary', + action='store_true') + argparser.add_argument('--single', + metavar='FILENAME', + type=str, + default=None, + help=('filename of section to compile, ' + 'e.g. "indexing"')) + argparser.add_argument('--python-path', + type=str, + default=os.path.join(DOC_PATH, '..'), + help='path') + args = argparser.parse_args() + + if args.command not in cmds: + raise ValueError('Unknown command {}. Available options: {}'.format( + args.command, ', '.join(cmds))) + + os.environ['PYTHONPATH'] = args.python_path + _generate_index(not args.no_api, args.single) + getattr(DocBuilder(args.num_jobs), args.command)() + if __name__ == '__main__': - import sys sys.exit(main()) diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index 7c7457df8ea93..eff1227e98994 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -109,10 +109,10 @@ See the package overview for more detail about what's in the library. .. toctree:: :maxdepth: 4 - {% if single -%} - {{ single }} + {% if single_doc -%} + {{ single_doc }} {% endif -%} - {%if not single -%} + {% if not single_doc -%} whatsnew install contributing @@ -146,10 +146,10 @@ See the package overview for more detail about what's in the library. 
comparison_with_sql comparison_with_sas {% endif -%} - {% if api -%} + {% if include_api -%} api {% endif -%} - {%if not single -%} + {% if not single_doc -%} developer internals release From 820b4d05c392d2d984a89f1902ab0f51b7f1d1ae Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+maxim-lian@users.noreply.github.com> Date: Wed, 21 Feb 2018 21:17:53 -0500 Subject: [PATCH 165/214] DOC: RangeIndex as default index (#19781) --- pandas/core/frame.py | 4 ++-- pandas/core/series.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d81d22173bfbd..c607f1fa1c24c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -251,11 +251,11 @@ class DataFrame(NDFrame): data : numpy ndarray (structured or homogeneous), dict, or DataFrame Dict can contain Series, arrays, constants, or list-like objects index : Index or array-like - Index to use for resulting frame. Will default to np.arange(n) if + Index to use for resulting frame. Will default to RangeIndex if no indexing information part of input data and no index provided columns : Index or array-like Column labels to use for resulting frame. Will default to - np.arange(n) if no column labels are provided + RangeIndex (0, 1, 2, ..., n) if no column labels are provided dtype : dtype, default None Data type to force. Only a single dtype is allowed. If None, infer copy : boolean, default False diff --git a/pandas/core/series.py b/pandas/core/series.py index 79ffb8be65838..5f2194bda870c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -131,7 +131,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): index : array-like or Index (1d) Values must be hashable and have the same length as `data`. Non-unique index values are allowed. Will default to - RangeIndex(len(data)) if not provided. If both a dict and index + RangeIndex (0, 1, 2, ..., n) if not provided. If both a dict and index sequence are used, the index will override the keys found in the dict. dtype : numpy.dtype or None From ca05d7c10cb0e4641b702002b2d0281f0ba37a42 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Wed, 21 Feb 2018 21:20:05 -0500 Subject: [PATCH 166/214] Update df.to_stata() docstring (#19818) --- pandas/core/frame.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c607f1fa1c24c..c7e9cd9411633 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1612,7 +1612,7 @@ def to_stata(self, fname, convert_dates=None, write_index=True, time_stamp : datetime A datetime to use as file creation date. Default is the current time. - dataset_label : str + data_label : str A label for the data set. Must be 80 characters or smaller. 
variable_labels : dict Dictionary containing columns as keys and variable labels as @@ -1635,10 +1635,18 @@ def to_stata(self, fname, convert_dates=None, write_index=True, Examples -------- + >>> data.to_stata('./data_file.dta') + + Or with dates + + >>> data.to_stata('./date_data_file.dta', {2 : 'tw'}) + + Alternatively you can create an instance of the StataWriter class + >>> writer = StataWriter('./data_file.dta', data) >>> writer.write_file() - Or with dates + With dates: >>> writer = StataWriter('./date_data_file.dta', data, {2 : 'tw'}) >>> writer.write_file() From a6183a2d9c8d1c5a83940e891475ac23682be75a Mon Sep 17 00:00:00 2001 From: ZhuBaohe Date: Thu, 22 Feb 2018 10:23:11 +0800 Subject: [PATCH 167/214] DOC: correct Series.reset_index example (#19832) --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 5f2194bda870c..6fcd54ecc6118 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1015,7 +1015,7 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): >>> s = pd.Series([1, 2, 3, 4], index=pd.Index(['a', 'b', 'c', 'd'], ... name = 'idx')) >>> s.reset_index() - index 0 + idx 0 0 0 1 1 1 2 2 2 3 From b585e3b5145a73cc7480db9a3b0c3ce7bd55511e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 21 Feb 2018 19:08:50 -0800 Subject: [PATCH 168/214] implement add_offset_array for PeriodIndex (#19826) --- pandas/core/indexes/period.py | 23 ++++++++ .../tests/indexes/period/test_arithmetic.py | 54 ++++++++++++++----- 2 files changed, 63 insertions(+), 14 deletions(-) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 60798e6d77e37..88f9297652ebf 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -44,6 +44,7 @@ from pandas.util._decorators import (Appender, Substitution, cache_readonly, deprecate_kwarg) from pandas.compat import zip, u +from pandas.errors import PerformanceWarning import pandas.core.indexes.base as ibase _index_doc_kwargs = dict(ibase._index_doc_kwargs) @@ -746,6 +747,28 @@ def _sub_period(self, other): # result must be Int64Index or Float64Index return Index(new_data) + def _add_offset_array(self, other): + # Array/Index of DateOffset objects + if len(other) == 1: + return self + other[0] + else: + warnings.warn("Adding/subtracting array of DateOffsets to " + "{cls} not vectorized" + .format(cls=type(self).__name__), PerformanceWarning) + res_values = self.astype('O').values + np.array(other) + return self.__class__(res_values) + + def _sub_offset_array(self, other): + # Array/Index of DateOffset objects + if len(other) == 1: + return self - other[0] + else: + warnings.warn("Adding/subtracting array of DateOffsets to " + "{cls} not vectorized" + .format(cls=type(self).__name__), PerformanceWarning) + res_values = self.astype('O').values - np.array(other) + return self.__class__(res_values) + def shift(self, n): """ Specialized shift which produces an PeriodIndex diff --git a/pandas/tests/indexes/period/test_arithmetic.py b/pandas/tests/indexes/period/test_arithmetic.py index e16d346542b9e..0c06e6a4963b4 100644 --- a/pandas/tests/indexes/period/test_arithmetic.py +++ b/pandas/tests/indexes/period/test_arithmetic.py @@ -9,6 +9,7 @@ period_range, Period, PeriodIndex, _np_version_under1p10) import pandas.core.indexes.period as period +from pandas.errors import PerformanceWarning _common_mismatch = [pd.offsets.YearBegin(2), @@ -254,32 +255,57 @@ def test_comp_nat(self, dtype): class 
TestPeriodIndexArithmetic(object): - def test_pi_add_offset_array(self): + @pytest.mark.parametrize('box', [np.array, pd.Index]) + def test_pi_add_offset_array(self, box): # GH#18849 pi = pd.PeriodIndex([pd.Period('2015Q1'), pd.Period('2016Q2')]) - offs = np.array([pd.offsets.QuarterEnd(n=1, startingMonth=12), - pd.offsets.QuarterEnd(n=-2, startingMonth=12)]) - res = pi + offs + offs = box([pd.offsets.QuarterEnd(n=1, startingMonth=12), + pd.offsets.QuarterEnd(n=-2, startingMonth=12)]) expected = pd.PeriodIndex([pd.Period('2015Q2'), pd.Period('2015Q4')]) + + with tm.assert_produces_warning(PerformanceWarning): + res = pi + offs tm.assert_index_equal(res, expected) + with tm.assert_produces_warning(PerformanceWarning): + res2 = offs + pi + tm.assert_index_equal(res2, expected) + unanchored = np.array([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)]) + # addition/subtraction ops with incompatible offsets should issue + # a PerformanceWarning and _then_ raise a TypeError. with pytest.raises(period.IncompatibleFrequency): - pi + unanchored - with pytest.raises(TypeError): - unanchored + pi + with tm.assert_produces_warning(PerformanceWarning): + pi + unanchored + with pytest.raises(period.IncompatibleFrequency): + with tm.assert_produces_warning(PerformanceWarning): + unanchored + pi - @pytest.mark.xfail(reason='GH#18824 radd doesnt implement this case') - def test_pi_radd_offset_array(self): - # GH#18849 + @pytest.mark.parametrize('box', [np.array, pd.Index]) + def test_pi_sub_offset_array(self, box): + # GH#18824 pi = pd.PeriodIndex([pd.Period('2015Q1'), pd.Period('2016Q2')]) - offs = np.array([pd.offsets.QuarterEnd(n=1, startingMonth=12), - pd.offsets.QuarterEnd(n=-2, startingMonth=12)]) - res = offs + pi - expected = pd.PeriodIndex([pd.Period('2015Q2'), pd.Period('2015Q4')]) + other = box([pd.offsets.QuarterEnd(n=1, startingMonth=12), + pd.offsets.QuarterEnd(n=-2, startingMonth=12)]) + + expected = PeriodIndex([pi[n] - other[n] for n in range(len(pi))]) + + with tm.assert_produces_warning(PerformanceWarning): + res = pi - other tm.assert_index_equal(res, expected) + anchored = box([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) + + # addition/subtraction ops with anchored offsets should issue + # a PerformanceWarning and _then_ raise a TypeError. 
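# Editor's illustrative sketch (not part of the patch) of the vectorised
# behaviour these tests exercise, assuming pandas with this change applied:
# adding an array of compatible DateOffsets to a PeriodIndex now operates
# element-wise, emitting a PerformanceWarning on the non-vectorised path.
import warnings
import numpy as np
import pandas as pd

pi = pd.PeriodIndex([pd.Period('2015Q1'), pd.Period('2016Q2')])
offs = np.array([pd.offsets.QuarterEnd(n=1, startingMonth=12),
                 pd.offsets.QuarterEnd(n=-2, startingMonth=12)])
with warnings.catch_warnings():
    warnings.simplefilter('ignore')   # silence the PerformanceWarning
    result = pi + offs
# result equals pd.PeriodIndex([pd.Period('2015Q2'), pd.Period('2015Q4')])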
+ with pytest.raises(period.IncompatibleFrequency): + with tm.assert_produces_warning(PerformanceWarning): + pi - anchored + with pytest.raises(period.IncompatibleFrequency): + with tm.assert_produces_warning(PerformanceWarning): + anchored - pi + def test_pi_add_iadd_pi_raises(self): rng = pd.period_range('1/1/2000', freq='D', periods=5) other = pd.period_range('1/6/2000', freq='D', periods=5) From 02f630883f2ebe3365863c914d6898a06b51e4b6 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Thu, 22 Feb 2018 10:34:46 +0000 Subject: [PATCH 169/214] ENH: Add columns parameter to from_dict (#19802) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/frame.py | 15 ++++++++++++--- pandas/tests/frame/test_constructors.py | 19 +++++++++++++++++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 76c4fa08fca4d..1ae15f363a2d0 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -295,6 +295,7 @@ Other Enhancements - ``IntervalIndex.astype`` now supports conversions between subtypes when passed an ``IntervalDtype`` (:issue:`19197`) - :class:`IntervalIndex` and its associated constructor methods (``from_arrays``, ``from_breaks``, ``from_tuples``) have gained a ``dtype`` parameter (:issue:`19262`) - Added :func:`SeriesGroupBy.is_monotonic_increasing` and :func:`SeriesGroupBy.is_monotonic_decreasing` (:issue:`17015`) +- :func:`DataFrame.from_dict` now accepts a ``columns`` argument that can be used to specify the column names when ``orient='index'`` is used (:issue:`18529`) .. _whatsnew_0230.api_breaking: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c7e9cd9411633..2aae4dffbeaaf 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -876,7 +876,7 @@ def dot(self, other): # IO methods (to / from other formats) @classmethod - def from_dict(cls, data, orient='columns', dtype=None): + def from_dict(cls, data, orient='columns', dtype=None, columns=None): """ Construct DataFrame from dict of array-like or dicts @@ -890,12 +890,17 @@ def from_dict(cls, data, orient='columns', dtype=None): (default). Otherwise if the keys should be rows, pass 'index'. dtype : dtype, default None Data type to force, otherwise infer + columns: list, default None + Column labels to use when orient='index'. Raises a ValueError + if used with orient='columns' + + .. 
versionadded:: 0.23.0 Returns ------- DataFrame """ - index, columns = None, None + index = None orient = orient.lower() if orient == 'index': if len(data) > 0: @@ -904,7 +909,11 @@ def from_dict(cls, data, orient='columns', dtype=None): data = _from_nested_dict(data) else: data, index = list(data.values()), list(data.keys()) - elif orient != 'columns': # pragma: no cover + elif orient == 'columns': + if columns is not None: + raise ValueError("cannot use columns parameter with " + "orient='columns'") + else: # pragma: no cover raise ValueError('only recognize index or columns for orient') return cls(data, index=index, columns=columns, dtype=dtype) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 8abd88d8a379c..394997201f320 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1091,6 +1091,25 @@ def test_constructor_orient(self): xp = DataFrame.from_dict(a).T.reindex(list(a.keys())) tm.assert_frame_equal(rs, xp) + def test_from_dict_columns_parameter(self): + # GH 18529 + # Test new columns parameter for from_dict that was added to make + # from_items(..., orient='index', columns=[...]) easier to replicate + result = DataFrame.from_dict(OrderedDict([('A', [1, 2]), + ('B', [4, 5])]), + orient='index', columns=['one', 'two']) + expected = DataFrame([[1, 2], [4, 5]], index=['A', 'B'], + columns=['one', 'two']) + tm.assert_frame_equal(result, expected) + + msg = "cannot use columns parameter with orient='columns'" + with tm.assert_raises_regex(ValueError, msg): + DataFrame.from_dict(dict([('A', [1, 2]), ('B', [4, 5])]), + orient='columns', columns=['one', 'two']) + with tm.assert_raises_regex(ValueError, msg): + DataFrame.from_dict(dict([('A', [1, 2]), ('B', [4, 5])]), + columns=['one', 'two']) + def test_constructor_Series_named(self): a = Series([1, 2, 3], index=['a', 'b', 'c'], name='x') df = DataFrame(a) From 399a96b609802c5259c4f471efad3b9843abaacc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 22 Feb 2018 03:12:52 -0800 Subject: [PATCH 170/214] fix Timedelta.__mul__(NaT) (#19819) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/_libs/tslibs/timedeltas.pyx | 2 +- pandas/tests/scalar/timedelta/test_arithmetic.py | 10 ++++++++++ pandas/tests/scalar/timedelta/test_timedelta.py | 5 ----- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 1ae15f363a2d0..b2ac6ecc7e011 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -757,6 +757,7 @@ Datetimelike Timedelta ^^^^^^^^^ +- Bug in :func:`Timedelta.__mul__` where multiplying by ``NaT`` returned ``NaT`` instead of raising a ``TypeError`` (:issue:`19819`) - Bug in :class:`Series` with ``dtype='timedelta64[ns]`` where addition or subtraction of ``TimedeltaIndex`` had results cast to ``dtype='int64'`` (:issue:`17250`) - Bug in :class:`Series` with ``dtype='timedelta64[ns]`` where addition or subtraction of ``TimedeltaIndex`` could return a ``Series`` with an incorrect name (:issue:`19043`) - Bug in :func:`Timedelta.__floordiv__` and :func:`Timedelta.__rfloordiv__` dividing by many incompatible numpy objects was incorrectly allowed (:issue:`18846`) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 78fdeb988e0f2..1285cbb9ff62b 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1057,7 +1057,7 @@ class Timedelta(_Timedelta): return other * 
self.to_timedelta64() elif other is NaT: - return NaT + raise TypeError('Cannot multiply Timedelta with NaT') elif not (is_integer_object(other) or is_float_object(other)): # only integers and floats allowed diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index 48da23f3575ab..8460633febba9 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -216,6 +216,16 @@ class TestTimedeltaMultiplicationDivision(object): # --------------------------------------------------------------- # Timedelta.__mul__, __rmul__ + @pytest.mark.parametrize('td_nat', [pd.NaT, + np.timedelta64('NaT', 'ns'), + np.timedelta64('NaT')]) + @pytest.mark.parametrize('op', [operator.mul, ops.rmul]) + def test_td_mul_nat(self, op, td_nat): + # GH#19819 + td = Timedelta(10, unit='d') + with pytest.raises(TypeError): + op(td, td_nat) + @pytest.mark.parametrize('op', [operator.mul, ops.rmul]) def test_td_mul_scalar(self, op): # GH#19738 diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 4257c610fb960..a80c5d6611b8a 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -62,11 +62,6 @@ def test_unary_ops(self): assert abs(-td) == td assert abs(-td) == Timedelta('10d') - def test_binary_ops_nat(self): - td = Timedelta(10, unit='d') - # FIXME: The next test is wrong: td * NaT should raise - assert (td * pd.NaT) is pd.NaT - class TestTimedeltaComparison(object): def test_comparison_object_array(self): From ca27ee9d4f1ac83e0b4a66072277c070557299ff Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 22 Feb 2018 03:15:40 -0800 Subject: [PATCH 171/214] Fix rfloordiv return type, un-xfail Timedelta tests (#19820) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/_libs/tslibs/timedeltas.pyx | 12 ++++++++++-- pandas/tests/scalar/timedelta/test_arithmetic.py | 6 ------ 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index b2ac6ecc7e011..f0bd6fe4a0bc2 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -767,6 +767,7 @@ Timedelta - Bug in :func:`Timedelta.__floordiv__`, :func:`Timedelta.__rfloordiv__` where operating with a ``Tick`` object would raise a ``TypeError`` instead of returning a numeric value (:issue:`19738`) - Bug in :func:`Period.asfreq` where periods near ``datetime(1, 1, 1)`` could be converted incorrectly (:issue:`19643`) - Bug in :func:`Timedelta.total_seconds()` causing precision errors i.e. 
``Timedelta('30S').total_seconds()==30.000000000000004`` (:issue:`19458`) +- Bug in :func: `Timedelta.__rmod__` where operating with a ``numpy.timedelta64`` returned a ``timedelta64`` object instead of a ``Timedelta`` (:issue:`19820`) - Multiplication of :class:`TimedeltaIndex` by ``TimedeltaIndex`` will now raise ``TypeError`` instead of raising ``ValueError`` in cases of length mis-match (:issue`19333`) - diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 1285cbb9ff62b..c4578a289b020 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1109,7 +1109,11 @@ class Timedelta(_Timedelta): return self // other.delta return NotImplemented - if hasattr(other, 'dtype'): + elif is_timedelta64_object(other): + # convert to Timedelta below + pass + + elif hasattr(other, 'dtype'): if other.dtype.kind == 'm': # also timedelta-like return _broadcast_floordiv_td64(self.value, other, _floordiv) @@ -1144,7 +1148,11 @@ class Timedelta(_Timedelta): return other.delta // self return NotImplemented - if hasattr(other, 'dtype'): + elif is_timedelta64_object(other): + # convert to Timedelta below + pass + + elif hasattr(other, 'dtype'): if other.dtype.kind == 'm': # also timedelta-like return _broadcast_floordiv_td64(self.value, other, _rfloordiv) diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index 8460633febba9..179768fcc6709 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -451,7 +451,6 @@ def test_mod_timedeltalike(self): result = td % NaT assert result is NaT - @pytest.mark.xfail(reason='GH#19378 floordiv td64 returns td64') def test_mod_timedelta64_nat(self): # GH#19365 td = Timedelta(hours=37) @@ -459,7 +458,6 @@ def test_mod_timedelta64_nat(self): result = td % np.timedelta64('NaT', 'ns') assert result is NaT - @pytest.mark.xfail(reason='GH#19378 floordiv td64 returns td64') def test_mod_timedelta64(self): # GH#19365 td = Timedelta(hours=37) @@ -468,7 +466,6 @@ def test_mod_timedelta64(self): assert isinstance(result, Timedelta) assert result == Timedelta(hours=1) - @pytest.mark.xfail(reason='GH#19378 floordiv by Tick not implemented') def test_mod_offset(self): # GH#19365 td = Timedelta(hours=37) @@ -515,7 +512,6 @@ def test_rmod_pytimedelta(self): assert isinstance(result, Timedelta) assert result == Timedelta(minutes=1) - @pytest.mark.xfail(reason='GH#19378 floordiv by Tick not implemented') def test_rmod_timedelta64(self): # GH#19365 td = Timedelta(minutes=3) @@ -574,7 +570,6 @@ def test_divmod(self): assert np.isnan(result[0]) assert result[1] is pd.NaT - @pytest.mark.xfail(reason='GH#19378 floordiv by Tick not implemented') def test_divmod_offset(self): # GH#19365 td = Timedelta(days=2, hours=6) @@ -598,7 +593,6 @@ def test_rdivmod_pytimedelta(self): assert isinstance(result[1], Timedelta) assert result[1] == Timedelta(hours=6) - @pytest.mark.xfail(reason='GH#19378 floordiv by Tick not implemented') def test_rdivmod_offset(self): result = divmod(pd.offsets.Hour(54), Timedelta(hours=-4)) assert result[0] == -14 From 8768876a51a3dfa0625bab6925caca8b3e9fcceb Mon Sep 17 00:00:00 2001 From: jschendel Date: Thu, 22 Feb 2018 04:39:39 -0700 Subject: [PATCH 172/214] BUG: Fix qcut with NaT present (#19833) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/reshape/tile.py | 10 +++++++--- pandas/tests/reshape/test_tile.py | 15 ++++++++++++++- 3 files changed, 22 insertions(+), 4 deletions(-) 
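A minimal sketch of the behavior this patch enables (it mirrors the new test
added below and assumes a build with this fix):

    import pandas as pd

    s = pd.Series(pd.DatetimeIndex(['20180101', pd.NaT, '20180103']))
    pd.qcut(s, 2)   # no longer raises ValueError; the NaT row comes back as NaN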
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index f0bd6fe4a0bc2..ed93503388893 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -901,6 +901,7 @@ Reshaping - Bug in :func:`DataFrame.join` which does an ``outer`` instead of a ``left`` join when being called with multiple DataFrames and some have non-unique indices (:issue:`19624`) - :func:`Series.rename` now accepts ``axis`` as a kwarg (:issue:`18589`) - Comparisons between :class:`Series` and :class:`Index` would return a ``Series`` with an incorrect name, ignoring the ``Index``'s name attribute (:issue:`19582`) +- Bug in :func:`qcut` where datetime and timedelta data with ``NaT`` present raised a ``ValueError`` (:issue:`19768`) Other ^^^^^ diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 777f08bd9db2b..359c030157bd3 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -279,18 +279,22 @@ def _trim_zeros(x): def _coerce_to_type(x): """ if the passed data is of datetime/timedelta type, - this method converts it to integer so that cut method can + this method converts it to numeric so that cut method can handle it """ dtype = None if is_timedelta64_dtype(x): - x = to_timedelta(x).view(np.int64) + x = to_timedelta(x) dtype = np.timedelta64 elif is_datetime64_dtype(x): - x = to_datetime(x).view(np.int64) + x = to_datetime(x) dtype = np.datetime64 + if dtype is not None: + # GH 19768: force NaT to NaN during integer conversion + x = np.where(x.notna(), x.view(np.int64), np.nan) + return x, dtype diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py index f7262a2f0da63..ff914273d47b1 100644 --- a/pandas/tests/reshape/test_tile.py +++ b/pandas/tests/reshape/test_tile.py @@ -6,7 +6,8 @@ from pandas import (Series, isna, to_datetime, DatetimeIndex, Timestamp, Interval, IntervalIndex, Categorical, - cut, qcut, date_range) + cut, qcut, date_range, NaT, TimedeltaIndex) +from pandas.tseries.offsets import Nano, Day import pandas.util.testing as tm from pandas.api.types import CategoricalDtype as CDT @@ -250,6 +251,18 @@ def test_qcut_nas(self): result = qcut(arr, 4) assert isna(result[:20]).all() + @pytest.mark.parametrize('s', [ + Series(DatetimeIndex(['20180101', NaT, '20180103'])), + Series(TimedeltaIndex(['0 days', NaT, '2 days']))], + ids=lambda x: str(x.dtype)) + def test_qcut_nat(self, s): + # GH 19768 + intervals = IntervalIndex.from_tuples( + [(s[0] - Nano(), s[2] - Day()), np.nan, (s[2] - Day(), s[2])]) + expected = Series(Categorical(intervals, ordered=True)) + result = qcut(s, 2) + tm.assert_series_equal(result, expected) + def test_qcut_index(self): result = qcut([0, 2], 2) intervals = [Interval(-0.001, 1), Interval(1, 2)] From 5af06f0587549ce460b20560c1bbe76396705c26 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 22 Feb 2018 14:55:02 +0100 Subject: [PATCH 173/214] CI: Align pep8speaks config with setup.cfg (#19841) --- .pep8speaks.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.pep8speaks.yml b/.pep8speaks.yml index 299b76c8922cc..fda26d87bf7f6 100644 --- a/.pep8speaks.yml +++ b/.pep8speaks.yml @@ -6,5 +6,7 @@ scanner: pycodestyle: max-line-length: 79 ignore: # Errors and warnings to ignore - - E731 - - E402 + - E402, # module level import not at top of file + - E731, # do not assign a lambda expression, use a def + - E741, # do not use variables named 'l', 'O', or 'I' + - W503 # line break before binary operator From 
abc4ef9132d8bcc8ad987c18fdd757595fd65d8c Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Thu, 22 Feb 2018 14:06:08 +0000 Subject: [PATCH 174/214] DOC: Making doc/source/conf.py pass PEP-8, and added to lint (#19839) --- ci/lint.sh | 6 ++--- doc/source/conf.py | 65 ++++++++++++++++++++++++---------------------- 2 files changed, 37 insertions(+), 34 deletions(-) diff --git a/ci/lint.sh b/ci/lint.sh index b862a3bfcf29e..e3a39668885f0 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -37,12 +37,12 @@ if [ "$LINT" ]; then fi echo "Linting scripts/*.py DONE" - echo "Linting doc script" - flake8 doc/make.py + echo "Linting doc scripts" + flake8 doc/make.py doc/source/conf.py if [ $? -ne "0" ]; then RET=1 fi - echo "Linting doc script DONE" + echo "Linting doc scripts DONE" echo "Linting *.pyx" flake8 pandas --filename=*.pyx --select=E501,E302,E203,E111,E114,E221,E303,E128,E231,E126,E265,E305,E301,E127,E261,E271,E129,W291,E222,E241,E123,F403 diff --git a/doc/source/conf.py b/doc/source/conf.py index 7c4edd0486636..b5fbf096f2626 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -2,7 +2,8 @@ # # pandas documentation build configuration file, created by # -# This file is execfile()d with the current directory set to its containing dir. +# This file is execfile()d with the current directory set to its containing +# dir. # # Note that not all possible configuration values are present in this # autogenerated file. @@ -49,8 +50,9 @@ # -- General configuration ----------------------------------------------- -# Add any Sphinx extension module names here, as strings. They can be extensions -# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. sphinxext. +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +# sphinxext. extensions = ['sphinx.ext.autodoc', 'sphinx.ext.autosummary', @@ -60,7 +62,8 @@ 'numpydoc', 'ipython_sphinxext.ipython_directive', 'ipython_sphinxext.ipython_console_highlighting', - 'IPython.sphinxext.ipython_console_highlighting', # lowercase didn't work + # lowercase didn't work + 'IPython.sphinxext.ipython_console_highlighting', 'sphinx.ext.intersphinx', 'sphinx.ext.coverage', 'sphinx.ext.mathjax', @@ -95,22 +98,24 @@ files_to_delete.append(f) if files_to_delete: - print("I'm about to DELETE the following:\n%s\n" % list(sorted(files_to_delete))) - sys.stdout.write("WARNING: I'd like to delete those to speed up processing (yes/no)? ") + print("I'm about to DELETE the following:\n{}\n".format( + list(sorted(files_to_delete)))) + sys.stdout.write("WARNING: I'd like to delete those " + "to speed up processing (yes/no)? ") if PY3: answer = input() else: answer = raw_input() - if answer.lower().strip() in ('y','yes'): + if answer.lower().strip() in ('y', 'yes'): for f in files_to_delete: - f = os.path.join(os.path.join(os.path.dirname(__file__),f)) - f= os.path.abspath(f) + f = os.path.join(os.path.join(os.path.dirname(__file__), f)) + f = os.path.abspath(f) try: - print("Deleting %s" % f) + print("Deleting {}".format(f)) os.unlink(f) except: - print("Error deleting %s" % f) + print("Error deleting {}".format(f)) pass # Add any paths that contain templates here, relative to this directory. @@ -137,7 +142,7 @@ import pandas # version = '%s r%s' % (pandas.__version__, svn_version()) -version = '%s' % (pandas.__version__) +version = str(pandas.__version__) # The full version, including alpha/beta/rc tags. release = version @@ -159,8 +164,8 @@ # for source files. 
exclude_trees = [] -# The reST default role (used for this markup: `text`) to use for all documents. -# default_role = None +# The reST default role (used for this markup: `text`) to use for all +# documents. default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. # add_function_parentheses = True @@ -334,8 +339,8 @@ # The font size ('10pt', '11pt' or '12pt'). # latex_font_size = '10pt' -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, author, documentclass [howto/manual]). +# Grouping the document tree into LaTeX files. List of tuples (source start +# file, target name, title, author, documentclass [howto/manual]). latex_documents = [ ('index', 'pandas.tex', u('pandas: powerful Python data analysis toolkit'), @@ -392,7 +397,7 @@ # wherever the docs are built. The docs' target is the browser, not # the console, so this is fine. 'pd.options.display.encoding="utf8"' - ] +] # Add custom Documenter to handle attributes/methods of an AccessorProperty @@ -400,7 +405,8 @@ import sphinx from sphinx.util import rpartition -from sphinx.ext.autodoc import Documenter, MethodDocumenter, AttributeDocumenter +from sphinx.ext.autodoc import ( + Documenter, MethodDocumenter, AttributeDocumenter) from sphinx.ext.autosummary import Autosummary @@ -408,7 +414,6 @@ class AccessorDocumenter(MethodDocumenter): """ Specialized Documenter subclass for accessors. """ - objtype = 'accessor' directivetype = 'method' @@ -426,7 +431,6 @@ class AccessorLevelDocumenter(Documenter): Specialized Documenter subclass for objects on accessor level (methods, attributes). """ - # This is the simple straightforward version # modname is None, base the last elements (eg 'hour') # and path the part before (eg 'Series.dt') @@ -436,7 +440,6 @@ class AccessorLevelDocumenter(Documenter): # mod_cls = mod_cls.split('.') # # return modname, mod_cls + [base] - def resolve_name(self, modname, parents, path, base): if modname is None: if path: @@ -471,16 +474,17 @@ def resolve_name(self, modname, parents, path, base): return modname, parents + [base] -class AccessorAttributeDocumenter(AccessorLevelDocumenter, AttributeDocumenter): - +class AccessorAttributeDocumenter(AccessorLevelDocumenter, + AttributeDocumenter): objtype = 'accessorattribute' directivetype = 'attribute' - # lower than AttributeDocumenter so this is not chosen for normal attributes + # lower than AttributeDocumenter so this is not chosen for normal + # attributes priority = 0.6 -class AccessorMethodDocumenter(AccessorLevelDocumenter, MethodDocumenter): +class AccessorMethodDocumenter(AccessorLevelDocumenter, MethodDocumenter): objtype = 'accessormethod' directivetype = 'method' @@ -508,7 +512,6 @@ class PandasAutosummary(Autosummary): This alternative autosummary class lets us override the table summary for Series.plot and DataFrame.plot in the API docs. 
""" - def _replace_pandas_items(self, display_name, sig, summary, real_name): # this a hack: ideally we should extract the signature from the # .__call__ method instead of hard coding this @@ -561,18 +564,18 @@ def linkcode_resolve(domain, info): lineno = None if lineno: - linespec = "#L%d-L%d" % (lineno, lineno + len(source) - 1) + linespec = "#L{:d}-L{:d}".format(lineno, lineno + len(source) - 1) else: linespec = "" fn = os.path.relpath(fn, start=os.path.dirname(pandas.__file__)) if '+' in pandas.__version__: - return "http://github.com/pandas-dev/pandas/blob/master/pandas/%s%s" % ( - fn, linespec) + return ("http://github.com/pandas-dev/pandas/blob/master/pandas/" + "{}{}".format(fn, linespec)) else: - return "http://github.com/pandas-dev/pandas/blob/v%s/pandas/%s%s" % ( - pandas.__version__, fn, linespec) + return ("http://github.com/pandas-dev/pandas/blob/" + "v{}/pandas/{}{}".format(pandas.__version__, fn, linespec)) # remove the docstring of the flags attribute (inherited from numpy ndarray) From 4871b48f7b314c96cfb5bfd60afad8d8312937a4 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Fri, 23 Feb 2018 01:22:34 +0000 Subject: [PATCH 175/214] Let initialisation from dicts use insertion order for py>=36, part I (#19830) --- pandas/tests/frame/test_apply.py | 2 +- pandas/tests/frame/test_block_internals.py | 4 ++-- pandas/tests/frame/test_constructors.py | 21 ++++++++++---------- pandas/tests/frame/test_dtypes.py | 4 ++-- pandas/tests/frame/test_indexing.py | 4 ++-- pandas/tests/frame/test_mutate_columns.py | 12 +++++------ pandas/tests/frame/test_nonunique_indexes.py | 8 ++++---- pandas/tests/frame/test_reshape.py | 5 +++-- pandas/tests/frame/test_to_csv.py | 6 +++--- 9 files changed, 33 insertions(+), 33 deletions(-) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index d1ad9f71e6350..a057ca0879cac 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -646,7 +646,7 @@ def test_infer_output_shape_columns(self): 'datetime': [pd.Timestamp('2017-11-29 03:30:00'), pd.Timestamp('2017-11-29 03:45:00')]}) result = df.apply(lambda row: (row.number, row.string), axis=1) - expected = Series([t[2:] for t in df.itertuples()]) + expected = Series([(t.number, t.string) for t in df.itertuples()]) assert_series_equal(result, expected) def test_infer_output_shape_listlike_columns(self): diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 8b1fd7d50cb4d..8e012922d25f1 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -405,8 +405,8 @@ def test_get_numeric_data(self): result = df.get_dtype_counts() expected = Series({'int64': 1, 'float64': 1, datetime64name: 1, objectname: 1}) - result.sort_index() - expected.sort_index() + result = result.sort_index() + expected = expected.sort_index() assert_series_equal(result, expected) df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 394997201f320..e0b94815878dd 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1381,9 +1381,8 @@ def test_constructor_with_datetimes(self): expected['float64'] = 1 expected[floatname] = 1 - result.sort_index() - expected = Series(expected) - expected.sort_index() + result = result.sort_index() + expected = Series(expected).sort_index() tm.assert_series_equal(result, expected) # check 
with ndarray construction ndim>0 @@ -1392,7 +1391,7 @@ def test_constructor_with_datetimes(self): intname: np.array([1] * 10, dtype=intname)}, index=np.arange(10)) result = df.get_dtype_counts() - result.sort_index() + result = result.sort_index() tm.assert_series_equal(result, expected) # GH 2809 @@ -1403,8 +1402,8 @@ def test_constructor_with_datetimes(self): df = DataFrame({'datetime_s': datetime_s}) result = df.get_dtype_counts() expected = Series({datetime64name: 1}) - result.sort_index() - expected.sort_index() + result = result.sort_index() + expected = expected.sort_index() tm.assert_series_equal(result, expected) # GH 2810 @@ -1414,8 +1413,8 @@ def test_constructor_with_datetimes(self): df = DataFrame({'datetimes': datetimes, 'dates': dates}) result = df.get_dtype_counts() expected = Series({datetime64name: 1, objectname: 1}) - result.sort_index() - expected.sort_index() + result = result.sort_index() + expected = expected.sort_index() tm.assert_series_equal(result, expected) # GH 7594 @@ -1538,8 +1537,8 @@ def test_constructor_for_list_with_dtypes(self): result = df.get_dtype_counts() expected = Series( {'int64': 1, 'float64': 2, datetime64name: 1, objectname: 1}) - result.sort_index() - expected.sort_index() + result = result.sort_index() + expected = expected.sort_index() tm.assert_series_equal(result, expected) def test_constructor_frame_copy(self): @@ -1851,7 +1850,7 @@ def test_from_records_misc_brokenness(self): rows.append([datetime(2010, 1, 1), 1]) rows.append([datetime(2010, 1, 2), 1]) df2_obj = DataFrame.from_records(rows, columns=['date', 'test']) - results = df2_obj.get_dtype_counts() + results = df2_obj.get_dtype_counts().sort_index() expected = Series({'datetime64[ns]': 1, 'int64': 1}) tm.assert_series_equal(results, expected) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 38bdecc9eb88f..e9e5b2a447a4a 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -725,9 +725,9 @@ def test_timedeltas(self): df = DataFrame(dict(A=Series(date_range('2012-1-1', periods=3, freq='D')), B=Series([timedelta(days=i) for i in range(3)]))) - result = df.get_dtype_counts().sort_values() + result = df.get_dtype_counts().sort_index() expected = Series( - {'datetime64[ns]': 1, 'timedelta64[ns]': 1}).sort_values() + {'datetime64[ns]': 1, 'timedelta64[ns]': 1}).sort_index() assert_series_equal(result, expected) df['C'] = df['A'] + df['B'] diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 882fa634d167d..a8b81b1b03552 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -2430,8 +2430,8 @@ def _check_get(df, cond, check_dtypes=True): # upcasting case (GH # 2794) df = DataFrame(dict((c, Series([1] * 3, dtype=c)) - for c in ['int64', 'int32', - 'float32', 'float64'])) + for c in ['float32', 'float64', + 'int32', 'int64'])) df.iloc[1, :] = 0 result = df.where(df >= 0).get_dtype_counts() diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 8236a41d00243..4c560129bfa45 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -166,17 +166,17 @@ def test_insert(self): # new item df['x'] = df['a'].astype('float32') - result = Series(dict(float64=5, float32=1)) - assert (df.get_dtype_counts() == result).all() + result = Series(dict(float32=1, float64=5)) + assert (df.get_dtype_counts().sort_index() == result).all() # replacing current (in 
different block) df['a'] = df['a'].astype('float32') - result = Series(dict(float64=4, float32=2)) - assert (df.get_dtype_counts() == result).all() + result = Series(dict(float32=2, float64=4)) + assert (df.get_dtype_counts().sort_index() == result).all() df['y'] = df['a'].astype('int32') - result = Series(dict(float64=4, float32=2, int32=1)) - assert (df.get_dtype_counts() == result).all() + result = Series(dict(float32=2, float64=4, int32=1)) + assert (df.get_dtype_counts().sort_index() == result).all() with tm.assert_raises_regex(ValueError, 'already exists'): df.insert(1, 'a', df['b']) diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index 36465db78361f..0b32ec89d3909 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -155,14 +155,14 @@ def check(result, expected=None): # rename, GH 4403 df4 = DataFrame( - {'TClose': [22.02], - 'RT': [0.0454], + {'RT': [0.0454], + 'TClose': [22.02], 'TExg': [0.0422]}, index=MultiIndex.from_tuples([(600809, 20130331)], names=['STK_ID', 'RPT_Date'])) - df5 = DataFrame({'STK_ID': [600809] * 3, - 'RPT_Date': [20120930, 20121231, 20130331], + df5 = DataFrame({'RPT_Date': [20120930, 20121231, 20130331], + 'STK_ID': [600809] * 3, 'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')], 'TClose': [38.05, 41.66, 30.01]}, index=MultiIndex.from_tuples( diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 7907486c7c98d..68df0982a1e3e 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -719,9 +719,10 @@ def verify(df): assert_frame_equal(left, right) # GH7401 - df = pd.DataFrame({'A': list('aaaaabbbbb'), 'C': np.arange(10), + df = pd.DataFrame({'A': list('aaaaabbbbb'), 'B': (date_range('2012-01-01', periods=5) - .tolist() * 2)}) + .tolist() * 2), + 'C': np.arange(10)}) df.iloc[3, 1] = np.NaN left = df.set_index(['A', 'B']).unstack() diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index a3ba34ae92283..dda5cdea52cac 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -1054,10 +1054,10 @@ def test_to_csv_with_dst_transitions(self): def test_to_csv_quoting(self): df = DataFrame({ - 'c_string': ['a', 'b,c'], - 'c_int': [42, np.nan], - 'c_float': [1.0, 3.2], 'c_bool': [True, False], + 'c_float': [1.0, 3.2], + 'c_int': [42, np.nan], + 'c_string': ['a', 'b,c'], }) expected = """\ From 572476f0a3652222c17458d418a107554580eaa5 Mon Sep 17 00:00:00 2001 From: Kate Surta Date: Fri, 23 Feb 2018 04:40:19 +0300 Subject: [PATCH 176/214] BUG: Fix MultiIndex .loc with all numpy arrays (#19772) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/indexing.py | 3 +-- pandas/tests/indexing/test_loc.py | 43 ++++++++++++++++++++++++++++++- 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index ed93503388893..603e4e6ce0522 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -836,6 +836,7 @@ MultiIndex - Bug in :func:`MultiIndex.get_loc` which would cast boolean to integer labels (:issue:`19086`) - Bug in :func:`MultiIndex.get_loc` which would fail to locate keys containing ``NaN`` (:issue:`18485`) - Bug in :func:`MultiIndex.get_loc` in large :class:`MultiIndex`, would fail when levels had different dtypes (:issue:`18520`) +- Bug in indexing where nested indexers having only numpy arrays are handled incorrectly (:issue:`19686`) 
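For example, a short sketch of the fixed indexing behavior (based on the new
test added below):

    import numpy as np
    import pandas as pd

    idx = pd.MultiIndex.from_product([[10, 20, 30], [1, 2, 3]])
    df = pd.DataFrame(np.arange(9, dtype='int64'), index=idx, columns=['Data'])
    # numpy arrays inside a nested indexer now behave like lists/tuples
    df.loc[(np.array([10, 20]), np.array([2, 3])), 'Data']   # values 1, 2, 4, 5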
I/O diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 352ce921d1d44..eb3aeda7902fc 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2107,10 +2107,9 @@ def is_nested_tuple(tup, labels): if not isinstance(tup, tuple): return False - # are we nested tuple of: tuple,list,slice for i, k in enumerate(tup): - if isinstance(k, (tuple, list, slice)): + if is_list_like(k) or isinstance(k, slice): return isinstance(labels, MultiIndex) return False diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 433b0d87ac005..86a5a82441ee8 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -8,7 +8,7 @@ import pandas as pd from pandas.compat import lrange, StringIO -from pandas import Series, DataFrame, Timestamp, date_range, MultiIndex +from pandas import Series, DataFrame, Timestamp, date_range, MultiIndex, Index from pandas.util import testing as tm from pandas.tests.indexing.common import Base @@ -711,3 +711,44 @@ def test_identity_slice_returns_new_object(self): original_series[:3] = [7, 8, 9] assert all(sliced_series[:3] == [7, 8, 9]) + + @pytest.mark.parametrize( + 'indexer_type_1', + (list, tuple, set, slice, np.ndarray, Series, Index)) + @pytest.mark.parametrize( + 'indexer_type_2', + (list, tuple, set, slice, np.ndarray, Series, Index)) + def test_loc_getitem_nested_indexer(self, indexer_type_1, indexer_type_2): + # GH #19686 + # .loc should work with nested indexers which can be + # any list-like objects (see `pandas.api.types.is_list_like`) or slices + + def convert_nested_indexer(indexer_type, keys): + if indexer_type == np.ndarray: + return np.array(keys) + if indexer_type == slice: + return slice(*keys) + return indexer_type(keys) + + a = [10, 20, 30] + b = [1, 2, 3] + index = pd.MultiIndex.from_product([a, b]) + df = pd.DataFrame( + np.arange(len(index), dtype='int64'), + index=index, columns=['Data']) + + keys = ([10, 20], [2, 3]) + types = (indexer_type_1, indexer_type_2) + + # check indexers with all the combinations of nested objects + # of all the valid types + indexer = tuple( + convert_nested_indexer(indexer_type, k) + for indexer_type, k in zip(types, keys)) + + result = df.loc[indexer, 'Data'] + expected = pd.Series( + [1, 2, 4, 5], name='Data', + index=pd.MultiIndex.from_product(keys)) + + tm.assert_series_equal(result, expected) From 7e9ac660bf47ab520ba6052d2aba4cd67ea45024 Mon Sep 17 00:00:00 2001 From: ZhuBaohe Date: Fri, 23 Feb 2018 19:24:23 +0800 Subject: [PATCH 177/214] DOC: correct min_count param docstring (#19836) --- pandas/core/generic.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8034cf89cf8b7..85e2ce475ffa2 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7855,7 +7855,7 @@ def _doc_parms(cls): >>> pd.Series([np.nan]).prod() 1.0 ->>> pd.Series([np.nan]).sum(min_count=1) +>>> pd.Series([np.nan]).prod(min_count=1) nan """ @@ -7867,8 +7867,9 @@ def _doc_parms(cls): .. versionadded :: 0.22.0 - Added with the default being 1. This means the sum or product - of an all-NA or empty series is ``NaN``. + Added with the default being 0. This means the sum of an all-NA + or empty Series is 0, and the product of an all-NA or empty + Series is 1. 
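For instance (a brief sketch of these defaults, consistent with the examples
shown earlier in this docstring):

>>> pd.Series([]).sum()         # empty Series, default min_count=0
0.0
>>> pd.Series([np.nan]).prod()  # all-NA Series, default min_count=0
1.0
>>> pd.Series([np.nan]).prod(min_count=1)
nan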
""" From c3e35a0318e8b187e8dba98e72f2a8f6e260ce64 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 23 Feb 2018 03:35:00 -0800 Subject: [PATCH 178/214] Continue porting period_helper; fix leftover asfreq bug (#19834) --- doc/source/whatsnew/v0.23.0.txt | 2 +- pandas/_libs/src/period_helper.c | 120 +++++++------- pandas/_libs/src/period_helper.h | 24 --- pandas/_libs/tslibs/period.pyx | 152 +++++++----------- pandas/tests/scalar/period/test_period.py | 7 + .../tests/scalar/period/test_period_asfreq.py | 10 ++ 6 files changed, 128 insertions(+), 187 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 603e4e6ce0522..ca5749afd11bc 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -765,7 +765,7 @@ Timedelta - Bug in :class:`TimedeltaIndex` where division by a ``Series`` would return a ``TimedeltaIndex`` instead of a ``Series`` (:issue:`19042`) - Bug in :func:`Timedelta.__add__`, :func:`Timedelta.__sub__` where adding or subtracting a ``np.timedelta64`` object would return another ``np.timedelta64`` instead of a ``Timedelta`` (:issue:`19738`) - Bug in :func:`Timedelta.__floordiv__`, :func:`Timedelta.__rfloordiv__` where operating with a ``Tick`` object would raise a ``TypeError`` instead of returning a numeric value (:issue:`19738`) -- Bug in :func:`Period.asfreq` where periods near ``datetime(1, 1, 1)`` could be converted incorrectly (:issue:`19643`) +- Bug in :func:`Period.asfreq` where periods near ``datetime(1, 1, 1)`` could be converted incorrectly (:issue:`19643`, :issue:`19834`) - Bug in :func:`Timedelta.total_seconds()` causing precision errors i.e. ``Timedelta('30S').total_seconds()==30.000000000000004`` (:issue:`19458`) - Bug in :func: `Timedelta.__rmod__` where operating with a ``numpy.timedelta64`` returned a ``timedelta64`` object instead of a ``Timedelta`` (:issue:`19820`) - Multiplication of :class:`TimedeltaIndex` by ``TimedeltaIndex`` will now raise ``TypeError`` instead of raising ``ValueError`` in cases of length mis-match (:issue`19333`) diff --git a/pandas/_libs/src/period_helper.c b/pandas/_libs/src/period_helper.c index a812ed2e7e2b3..e3d250aa44f17 100644 --- a/pandas/_libs/src/period_helper.c +++ b/pandas/_libs/src/period_helper.c @@ -42,10 +42,10 @@ static int floordiv(int x, int divisor) { static int monthToQuarter(int month) { return ((month - 1) / 3) + 1; } -/* Find the absdate (days elapsed since datetime(1, 1, 1) +/* Find the unix_date (days elapsed since datetime(1970, 1, 1) * for the given year/month/day. 
* Assumes GREGORIAN_CALENDAR */ -npy_int64 absdate_from_ymd(int year, int month, int day) { +npy_int64 unix_date_from_ymd(int year, int month, int day) { /* Calculate the absolute date */ pandas_datetimestruct dts; npy_int64 unix_date; @@ -55,16 +55,16 @@ npy_int64 absdate_from_ymd(int year, int month, int day) { dts.month = month; dts.day = day; unix_date = pandas_datetimestruct_to_datetime(PANDAS_FR_D, &dts); - return ORD_OFFSET + unix_date; + return unix_date; } /* Sets the date part of the date_info struct Assumes GREGORIAN_CALENDAR */ static int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, - npy_int64 absdate) { + npy_int64 unix_date) { pandas_datetimestruct dts; - pandas_datetime_to_datetimestruct(absdate - ORD_OFFSET, PANDAS_FR_D, &dts); + pandas_datetime_to_datetimestruct(unix_date, PANDAS_FR_D, &dts); dinfo->year = dts.year; dinfo->month = dts.month; dinfo->day = dts.day; @@ -137,26 +137,26 @@ PANDAS_INLINE npy_int64 transform_via_day(npy_int64 ordinal, return result; } -static npy_int64 DtoB_weekday(npy_int64 absdate) { - return floordiv(absdate, 7) * 5 + mod_compat(absdate, 7) - BDAY_OFFSET; +static npy_int64 DtoB_weekday(npy_int64 unix_date) { + return floordiv(unix_date + 4, 7) * 5 + mod_compat(unix_date + 4, 7) - 4; } static npy_int64 DtoB(struct date_info *dinfo, - int roll_back, npy_int64 absdate) { + int roll_back, npy_int64 unix_date) { int day_of_week = dayofweek(dinfo->year, dinfo->month, dinfo->day); if (roll_back == 1) { if (day_of_week > 4) { // change to friday before weekend - absdate -= (day_of_week - 4); + unix_date -= (day_of_week - 4); } } else { if (day_of_week > 4) { // change to Monday after weekend - absdate += (7 - day_of_week); + unix_date += (7 - day_of_week); } } - return DtoB_weekday(absdate); + return DtoB_weekday(unix_date); } @@ -165,18 +165,19 @@ static npy_int64 DtoB(struct date_info *dinfo, static npy_int64 asfreq_DTtoA(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; ordinal = downsample_daytime(ordinal, af_info); - dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET); + dInfoCalc_SetFromAbsDate(&dinfo, ordinal); if (dinfo.month > af_info->to_a_year_end) { - return (npy_int64)(dinfo.year + 1 - BASE_YEAR); + return (npy_int64)(dinfo.year + 1 - 1970); } else { - return (npy_int64)(dinfo.year - BASE_YEAR); + return (npy_int64)(dinfo.year - 1970); } } -static npy_int64 DtoQ_yq(npy_int64 ordinal, asfreq_info *af_info, int *year, - int *quarter) { +static int DtoQ_yq(npy_int64 ordinal, asfreq_info *af_info, int *year) { struct date_info dinfo; - dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET); + int quarter; + + dInfoCalc_SetFromAbsDate(&dinfo, ordinal); if (af_info->to_q_year_end != 12) { dinfo.month -= af_info->to_q_year_end; if (dinfo.month <= 0) { @@ -187,9 +188,8 @@ static npy_int64 DtoQ_yq(npy_int64 ordinal, asfreq_info *af_info, int *year, } *year = dinfo.year; - *quarter = monthToQuarter(dinfo.month); - - return 0; + quarter = monthToQuarter(dinfo.month); + return quarter; } static npy_int64 asfreq_DTtoQ(npy_int64 ordinal, asfreq_info *af_info) { @@ -197,8 +197,8 @@ static npy_int64 asfreq_DTtoQ(npy_int64 ordinal, asfreq_info *af_info) { ordinal = downsample_daytime(ordinal, af_info); - DtoQ_yq(ordinal, af_info, &year, &quarter); - return (npy_int64)((year - BASE_YEAR) * 4 + quarter - 1); + quarter = DtoQ_yq(ordinal, af_info, &year); + return (npy_int64)((year - 1970) * 4 + quarter - 1); } static npy_int64 asfreq_DTtoM(npy_int64 ordinal, asfreq_info *af_info) { @@ -206,28 +206,25 @@ static npy_int64 
asfreq_DTtoM(npy_int64 ordinal, asfreq_info *af_info) { ordinal = downsample_daytime(ordinal, af_info); - dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET); - return (npy_int64)((dinfo.year - BASE_YEAR) * 12 + dinfo.month - 1); + dInfoCalc_SetFromAbsDate(&dinfo, ordinal); + return (npy_int64)((dinfo.year - 1970) * 12 + dinfo.month - 1); } static npy_int64 asfreq_DTtoW(npy_int64 ordinal, asfreq_info *af_info) { ordinal = downsample_daytime(ordinal, af_info); - return (ordinal + ORD_OFFSET - (1 + af_info->to_week_end)) / 7 + 1 - - WEEK_OFFSET; + return floordiv(ordinal + 3 - af_info->to_week_end, 7) + 1; } static npy_int64 asfreq_DTtoB(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; - npy_int64 absdate; int roll_back; ordinal = downsample_daytime(ordinal, af_info); - absdate = ordinal + ORD_OFFSET; - dInfoCalc_SetFromAbsDate(&dinfo, absdate); + dInfoCalc_SetFromAbsDate(&dinfo, ordinal); // This usage defines roll_back the opposite way from the others roll_back = 1 - af_info->is_end; - return DtoB(&dinfo, roll_back, absdate); + return DtoB(&dinfo, roll_back, ordinal); } // all intra day calculations are now done within one function @@ -243,10 +240,7 @@ static npy_int64 asfreq_UpsampleWithinDay(npy_int64 ordinal, //************ FROM BUSINESS *************** static npy_int64 asfreq_BtoDT(npy_int64 ordinal, asfreq_info *af_info) { - ordinal += BDAY_OFFSET; - ordinal = - (floordiv(ordinal - 1, 5) * 7 + mod_compat(ordinal - 1, 5) + 1 - - ORD_OFFSET); + ordinal = floordiv(ordinal + 3, 5) * 7 + mod_compat(ordinal + 3, 5) - 3; return upsample_daytime(ordinal, af_info); } @@ -270,8 +264,7 @@ static npy_int64 asfreq_BtoW(npy_int64 ordinal, asfreq_info *af_info) { //************ FROM WEEKLY *************** static npy_int64 asfreq_WtoDT(npy_int64 ordinal, asfreq_info *af_info) { - ordinal = (ordinal + WEEK_OFFSET) * 7 + - af_info->from_week_end - ORD_OFFSET + + ordinal = ordinal * 7 + af_info->from_week_end - 4 + (7 - 1) * (af_info->is_end - 1); return upsample_daytime(ordinal, af_info); } @@ -294,30 +287,29 @@ static npy_int64 asfreq_WtoW(npy_int64 ordinal, asfreq_info *af_info) { static npy_int64 asfreq_WtoB(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; - npy_int64 absdate = asfreq_WtoDT(ordinal, af_info) + ORD_OFFSET; + npy_int64 unix_date = asfreq_WtoDT(ordinal, af_info); int roll_back = af_info->is_end; - dInfoCalc_SetFromAbsDate(&dinfo, absdate); + dInfoCalc_SetFromAbsDate(&dinfo, unix_date); - return DtoB(&dinfo, roll_back, absdate); + return DtoB(&dinfo, roll_back, unix_date); } //************ FROM MONTHLY *************** static void MtoD_ym(npy_int64 ordinal, int *y, int *m) { - *y = floordiv(ordinal, 12) + BASE_YEAR; + *y = floordiv(ordinal, 12) + 1970; *m = mod_compat(ordinal, 12) + 1; } static npy_int64 asfreq_MtoDT(npy_int64 ordinal, asfreq_info *af_info) { - npy_int64 absdate; + npy_int64 unix_date; int y, m; ordinal += af_info->is_end; MtoD_ym(ordinal, &y, &m); - absdate = absdate_from_ymd(y, m, 1); - ordinal = absdate - ORD_OFFSET; + unix_date = unix_date_from_ymd(y, m, 1); - ordinal -= af_info->is_end; - return upsample_daytime(ordinal, af_info); + unix_date -= af_info->is_end; + return upsample_daytime(unix_date, af_info); } static npy_int64 asfreq_MtoA(npy_int64 ordinal, asfreq_info *af_info) { @@ -334,18 +326,18 @@ static npy_int64 asfreq_MtoW(npy_int64 ordinal, asfreq_info *af_info) { static npy_int64 asfreq_MtoB(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; - npy_int64 absdate = asfreq_MtoDT(ordinal, af_info) + 
ORD_OFFSET; + npy_int64 unix_date = asfreq_MtoDT(ordinal, af_info); int roll_back = af_info->is_end; - dInfoCalc_SetFromAbsDate(&dinfo, absdate); + dInfoCalc_SetFromAbsDate(&dinfo, unix_date); - return DtoB(&dinfo, roll_back, absdate); + return DtoB(&dinfo, roll_back, unix_date); } //************ FROM QUARTERLY *************** static void QtoD_ym(npy_int64 ordinal, int *y, int *m, asfreq_info *af_info) { - *y = floordiv(ordinal, 4) + BASE_YEAR; + *y = floordiv(ordinal, 4) + 1970; *m = mod_compat(ordinal, 4) * 3 + 1; if (af_info->from_q_year_end != 12) { @@ -359,16 +351,16 @@ static void QtoD_ym(npy_int64 ordinal, int *y, int *m, asfreq_info *af_info) { } static npy_int64 asfreq_QtoDT(npy_int64 ordinal, asfreq_info *af_info) { - npy_int64 absdate; + npy_int64 unix_date; int y, m; ordinal += af_info->is_end; QtoD_ym(ordinal, &y, &m, af_info); - absdate = absdate_from_ymd(y, m, 1); + unix_date = unix_date_from_ymd(y, m, 1); - absdate -= af_info->is_end; - return upsample_daytime(absdate - ORD_OFFSET, af_info); + unix_date -= af_info->is_end; + return upsample_daytime(unix_date, af_info); } static npy_int64 asfreq_QtoQ(npy_int64 ordinal, asfreq_info *af_info) { @@ -389,21 +381,21 @@ static npy_int64 asfreq_QtoW(npy_int64 ordinal, asfreq_info *af_info) { static npy_int64 asfreq_QtoB(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; - npy_int64 absdate = asfreq_QtoDT(ordinal, af_info) + ORD_OFFSET; + npy_int64 unix_date = asfreq_QtoDT(ordinal, af_info); int roll_back = af_info->is_end; - dInfoCalc_SetFromAbsDate(&dinfo, absdate); + dInfoCalc_SetFromAbsDate(&dinfo, unix_date); - return DtoB(&dinfo, roll_back, absdate); + return DtoB(&dinfo, roll_back, unix_date); } //************ FROM ANNUAL *************** static npy_int64 asfreq_AtoDT(npy_int64 ordinal, asfreq_info *af_info) { - npy_int64 absdate; + npy_int64 unix_date; // start from 1970 - npy_int64 year = ordinal + BASE_YEAR; + npy_int64 year = ordinal + 1970; int month = (af_info->from_a_year_end % 12) + 1; if (af_info->from_a_year_end != 12) { @@ -411,10 +403,10 @@ static npy_int64 asfreq_AtoDT(npy_int64 ordinal, asfreq_info *af_info) { } year += af_info->is_end; - absdate = absdate_from_ymd(year, month, 1); + unix_date = unix_date_from_ymd(year, month, 1); - absdate -= af_info->is_end; - return upsample_daytime(absdate - ORD_OFFSET, af_info); + unix_date -= af_info->is_end; + return upsample_daytime(unix_date, af_info); } static npy_int64 asfreq_AtoA(npy_int64 ordinal, asfreq_info *af_info) { @@ -435,11 +427,11 @@ static npy_int64 asfreq_AtoW(npy_int64 ordinal, asfreq_info *af_info) { static npy_int64 asfreq_AtoB(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; - npy_int64 absdate = asfreq_AtoDT(ordinal, af_info) + ORD_OFFSET; + npy_int64 unix_date = asfreq_AtoDT(ordinal, af_info); int roll_back = af_info->is_end; - dInfoCalc_SetFromAbsDate(&dinfo, absdate); + dInfoCalc_SetFromAbsDate(&dinfo, unix_date); - return DtoB(&dinfo, roll_back, absdate); + return DtoB(&dinfo, roll_back, unix_date); } static npy_int64 nofunc(npy_int64 ordinal, asfreq_info *af_info) { diff --git a/pandas/_libs/src/period_helper.h b/pandas/_libs/src/period_helper.h index 1573b1eeec74b..7163dc960d152 100644 --- a/pandas/_libs/src/period_helper.h +++ b/pandas/_libs/src/period_helper.h @@ -20,32 +20,8 @@ frequency conversion routines. 
#include "limits.h" #include "numpy/ndarraytypes.h" -/* - * declarations from period here - */ - -#define Py_Error(errortype, errorstr) \ - { \ - PyErr_SetString(errortype, errorstr); \ - goto onError; \ - } - /*** FREQUENCY CONSTANTS ***/ -// HIGHFREQ_ORIG is the datetime ordinal from which to begin the second -// frequency ordinal sequence - -// #define HIGHFREQ_ORIG 62135683200LL -#define BASE_YEAR 1970 -#define ORD_OFFSET 719163LL // days until 1970-01-01 -#define BDAY_OFFSET 513689LL // days until 1970-01-01 -#define WEEK_OFFSET 102737LL -#define BASE_WEEK_TO_DAY_OFFSET \ - 1 // difference between day 0 and end of week in days -#define DAYS_PER_WEEK 7 -#define BUSINESS_DAYS_PER_WEEK 5 -#define HIGHFREQ_ORIG 0 // ORD_OFFSET * 86400LL // days until 1970-01-01 - #define FR_ANN 1000 /* Annual */ #define FR_ANNDEC FR_ANN /* Annual - December year end*/ #define FR_ANNJAN 1001 /* Annual - January year end*/ diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index e1c783ac9fa54..f1a193706144f 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -75,10 +75,6 @@ cdef extern from "period_helper.h": int FR_BUS int FR_UND - int ORD_OFFSET - int WEEK_OFFSET - int BDAY_OFFSET - ctypedef struct date_info: double second int minute @@ -181,7 +177,7 @@ cdef int64_t get_period_ordinal(int year, int month, int day, period_ordinal : int64_t """ cdef: - int64_t absdays, unix_date, seconds, delta + int64_t unix_date, seconds, delta int64_t weeks int64_t day_adj int freq_group, fmonth, mdiff @@ -215,8 +211,7 @@ cdef int64_t get_period_ordinal(int year, int month, int day, elif freq == FR_MTH: return (year - 1970) * 12 + month - 1 - absdays = absdate_from_ymd(year, month, day) - unix_date = absdays - ORD_OFFSET + unix_date = unix_date_from_ymd(year, month, day) if freq >= FR_SEC: seconds = unix_date * 86400 + hour * 3600 + minute * 60 + second @@ -247,48 +242,48 @@ cdef int64_t get_period_ordinal(int year, int month, int day, return unix_date elif freq == FR_BUS: - # calculate the current week assuming sunday as last day of a week - # Jan 1 0001 is a Monday, so subtract 1 to get to end-of-week - weeks = (unix_date + ORD_OFFSET - 1) // 7 + # calculate the current week (counting from 1970-01-01) treating + # sunday as last day of a week + weeks = (unix_date + 3) // 7 # calculate the current weekday (in range 1 .. 
7) - delta = (unix_date + ORD_OFFSET - 1) % 7 + 1 + delta = (unix_date + 3) % 7 + 1 # return the number of business days in full weeks plus the business # days in the last - possible partial - week if delta <= 5: - return (weeks * 5) + delta - BDAY_OFFSET + return (5 * weeks) + delta - 4 else: - return (weeks * 5) + (5 + 1) - BDAY_OFFSET + return (5 * weeks) + (5 + 1) - 4 elif freq_group == FR_WK: day_adj = freq - FR_WK - return (unix_date + ORD_OFFSET - (1 + day_adj)) // 7 + 1 - WEEK_OFFSET + return (unix_date + 3 - day_adj) // 7 + 1 # raise ValueError cdef void get_date_info(int64_t ordinal, int freq, date_info *dinfo) nogil: cdef: - int64_t absdate + int64_t unix_date double abstime - absdate = get_python_ordinal(ordinal, freq); - abstime = get_abs_time(freq, absdate - ORD_OFFSET, ordinal) + unix_date = get_unix_date(ordinal, freq) + abstime = get_abs_time(freq, unix_date, ordinal) while abstime < 0: abstime += 86400 - absdate -= 1 + unix_date -= 1 while abstime >= 86400: abstime -= 86400 - absdate += 1 + unix_date += 1 - dInfoCalc_SetFromAbsDateTime(dinfo, absdate, abstime) + date_info_from_days_and_time(dinfo, unix_date, abstime) -cdef int64_t get_python_ordinal(int64_t period_ordinal, int freq) nogil: +cdef int64_t get_unix_date(int64_t period_ordinal, int freq) nogil: """ Returns the proleptic Gregorian ordinal of the date, as an integer. - This corresponds to the number of days since Jan., 1st, 1AD. + This corresponds to the number of days since Jan., 1st, 1970 AD. When the instance has a frequency less than daily, the proleptic date is calculated for the last day of the period. @@ -299,92 +294,56 @@ cdef int64_t get_python_ordinal(int64_t period_ordinal, int freq) nogil: Returns ------- - absdate : int64_t number of days since datetime(1, 1, 1) + unix_date : int64_t number of days since datetime(1970, 1, 1) """ cdef: asfreq_info af_info freq_conv_func toDaily = NULL if freq == FR_DAY: - return period_ordinal + ORD_OFFSET + return period_ordinal toDaily = get_asfreq_func(freq, FR_DAY) get_asfreq_info(freq, FR_DAY, 'E', &af_info) - return toDaily(period_ordinal, &af_info) + ORD_OFFSET + return toDaily(period_ordinal, &af_info) -cdef void dInfoCalc_SetFromAbsDateTime(date_info *dinfo, - int64_t absdate, double abstime) nogil: +@cython.cdivision +cdef void date_info_from_days_and_time(date_info *dinfo, + int64_t unix_date, + double abstime) nogil: """ Set the instance's value using the given date and time. - Assumes GREGORIAN_CALENDAR. 
Parameters ---------- dinfo : date_info* - absdate : int64_t - days elapsed since datetime(1, 1, 1) + unix_date : int64_t + days elapsed since datetime(1970, 1, 1) abstime : double - seconds elapsed since beginning of day described by absdate + seconds elapsed since beginning of day described by unix_date Notes ----- Updates dinfo inplace """ + cdef: + pandas_datetimestruct dts + int inttime + int hour, minute + double second + # Bounds check # The calling function is responsible for ensuring that # abstime >= 0.0 and abstime <= 86400 # Calculate the date - dInfoCalc_SetFromAbsDate(dinfo, absdate) - - # Calculate the time - dInfoCalc_SetFromAbsTime(dinfo, abstime) - - -cdef void dInfoCalc_SetFromAbsDate(date_info *dinfo, int64_t absdate) nogil: - """ - Sets the date part of the date_info struct - Assumes GREGORIAN_CALENDAR - - Parameters - ---------- - dinfo : date_info* - unix_date : int64_t - - Notes - ----- - Updates dinfo inplace - """ - cdef: - pandas_datetimestruct dts - - pandas_datetime_to_datetimestruct(absdate - ORD_OFFSET, PANDAS_FR_D, &dts) + pandas_datetime_to_datetimestruct(unix_date, PANDAS_FR_D, &dts) dinfo.year = dts.year dinfo.month = dts.month dinfo.day = dts.day - -@cython.cdivision -cdef void dInfoCalc_SetFromAbsTime(date_info *dinfo, double abstime) nogil: - """ - Sets the time part of the DateTime object. - - Parameters - ---------- - dinfo : date_info* - abstime : double - seconds elapsed since beginning of day described by absdate - - Notes - ----- - Updates dinfo inplace - """ - cdef: - int inttime - int hour, minute - double second - + # Calculate the time inttime = abstime hour = inttime / 3600 minute = (inttime % 3600) / 60 @@ -396,8 +355,7 @@ cdef void dInfoCalc_SetFromAbsTime(date_info *dinfo, double abstime) nogil: @cython.cdivision -cdef double get_abs_time(int freq, int64_t date_ordinal, - int64_t ordinal) nogil: +cdef double get_abs_time(int freq, int64_t unix_date, int64_t ordinal) nogil: cdef: int freq_index, day_index, base_index int64_t per_day, start_ord @@ -416,16 +374,15 @@ cdef double get_abs_time(int freq, int64_t date_ordinal, if base_index < freq_index: unit = 1 / unit - start_ord = date_ordinal * per_day + start_ord = unix_date * per_day result = (unit * (ordinal - start_ord)) return result -cdef int64_t absdate_from_ymd(int year, int month, int day) nogil: +cdef int64_t unix_date_from_ymd(int year, int month, int day) nogil: """ - Find the absdate (days elapsed since datetime(1, 1, 1) + Find the unix_date (days elapsed since datetime(1970, 1, 1) for the given year/month/day. 
- Assumes GREGORIAN_CALENDAR Parameters ---------- @@ -435,11 +392,9 @@ cdef int64_t absdate_from_ymd(int year, int month, int day) nogil: Returns ------- - absdate : int - days elapsed since datetime(1, 1, 1) + unix_date : int + days elapsed since datetime(1970, 1, 1) """ - - # /* Calculate the absolute date cdef: pandas_datetimestruct dts int64_t unix_date @@ -449,7 +404,7 @@ cdef int64_t absdate_from_ymd(int year, int month, int day) nogil: dts.month = month dts.day = day unix_date = pandas_datetimestruct_to_datetime(PANDAS_FR_D, &dts) - return ORD_OFFSET + unix_date + return unix_date cdef int get_yq(int64_t ordinal, int freq, int *quarter, int *year): @@ -475,9 +430,9 @@ cdef int get_yq(int64_t ordinal, int freq, int *quarter, int *year): cdef: asfreq_info af_info int qtr_freq - int64_t daily_ord + int64_t unix_date - daily_ord = get_python_ordinal(ordinal, freq) - ORD_OFFSET + unix_date = get_unix_date(ordinal, freq) if get_freq_group(freq) == FR_QTR: qtr_freq = freq @@ -486,16 +441,16 @@ cdef int get_yq(int64_t ordinal, int freq, int *quarter, int *year): get_asfreq_info(FR_DAY, qtr_freq, 'E', &af_info) - DtoQ_yq(daily_ord, &af_info, year, quarter) + quarter[0] = DtoQ_yq(unix_date, &af_info, year) return qtr_freq -cdef void DtoQ_yq(int64_t ordinal, asfreq_info *af_info, - int *year, int *quarter): +cdef int DtoQ_yq(int64_t unix_date, asfreq_info *af_info, int *year): cdef: date_info dinfo + int quarter - dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET) + date_info_from_days_and_time(&dinfo, unix_date, 0) if af_info.to_q_year_end != 12: dinfo.month -= af_info.to_q_year_end @@ -505,10 +460,11 @@ cdef void DtoQ_yq(int64_t ordinal, asfreq_info *af_info, dinfo.year += 1 year[0] = dinfo.year - quarter[0] = monthToQuarter(dinfo.month) + quarter = month_to_quarter(dinfo.month) + return quarter -cdef inline int monthToQuarter(int month): +cdef inline int month_to_quarter(int month): return (month - 1) // 3 + 1 @@ -678,7 +634,7 @@ def period_format(int64_t value, int freq, object fmt=None): return repr(NaT) if fmt is None: - freq_group = (freq // 1000) * 1000 + freq_group = get_freq_group(freq) if freq_group == 1000: # FR_ANN fmt = b'%Y' elif freq_group == 2000: # FR_QTR @@ -1620,8 +1576,8 @@ class Period(_Period): return cls._from_ordinal(ordinal, freq) -def _ordinal_from_fields(year, month, quarter, day, - hour, minute, second, freq): +cdef int64_t _ordinal_from_fields(year, month, quarter, day, + hour, minute, second, freq): base, mult = get_freq_code(freq) if quarter is not None: year, month = _quarter_to_myear(year, quarter, freq) diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index dff5433adcf79..f43ab0704f0f4 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -1440,3 +1440,10 @@ def test_period_immutable(): freq = per.freq with pytest.raises(AttributeError): per.freq = 2 * freq + + +@pytest.mark.xfail(reason='GH#19834 Period parsing error') +def test_small_year_parsing(): + per1 = Period('0001-01-07', 'D') + assert per1.year == 1 + assert per1.day == 7 diff --git a/pandas/tests/scalar/period/test_period_asfreq.py b/pandas/tests/scalar/period/test_period_asfreq.py index 9f8b2562e9e20..474d19809b03c 100644 --- a/pandas/tests/scalar/period/test_period_asfreq.py +++ b/pandas/tests/scalar/period/test_period_asfreq.py @@ -21,6 +21,16 @@ def test_asfreq_near_zero(self, freq): tup2 = (prev.year, prev.month, prev.day) assert tup2 < tup1 + def test_asfreq_near_zero_weekly(self): + # 
GH#19834 + per1 = Period('0001-01-01', 'D') + 6 + per2 = Period('0001-01-01', 'D') - 6 + week1 = per1.asfreq('W') + week2 = per2.asfreq('W') + assert week1 != week2 + assert week1.asfreq('D', 'E') >= per1 + assert week2.asfreq('D', 'S') <= per2 + @pytest.mark.xfail(reason='GH#19643 period_helper asfreq functions fail ' 'to check for overflows') def test_to_timestamp_out_of_bounds(self): From 0ffc4b56c57e914b85d5b5b0033e21ee0525307f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 23 Feb 2018 03:36:42 -0800 Subject: [PATCH 179/214] BUG: fix index op names and pinning (#19723) --- pandas/core/indexes/base.py | 267 +++++++++++++--------------- pandas/core/indexes/datetimelike.py | 26 ++- pandas/core/indexes/datetimes.py | 5 +- pandas/core/indexes/period.py | 16 +- pandas/core/indexes/range.py | 58 ++---- pandas/core/indexes/timedeltas.py | 12 +- pandas/tests/indexes/common.py | 5 +- pandas/tests/indexes/test_base.py | 22 ++- 8 files changed, 204 insertions(+), 207 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 59fe4bba649d3..c343126db0ea1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,11 +1,11 @@ -import datetime +from datetime import datetime, timedelta import warnings import operator import numpy as np from pandas._libs import (lib, index as libindex, tslib as libts, algos as libalgos, join as libjoin, - Timestamp, Timedelta) + Timedelta) from pandas._libs.lib import is_datetime_array from pandas.compat import range, u, set_function_name @@ -47,6 +47,7 @@ from pandas.core.base import PandasObject, IndexOpsMixin import pandas.core.common as com import pandas.core.base as base +from pandas.core import ops from pandas.util._decorators import ( Appender, Substitution, cache_readonly, deprecate_kwarg) from pandas.core.indexes.frozen import FrozenList @@ -55,7 +56,7 @@ import pandas.core.algorithms as algos import pandas.core.sorting as sorting from pandas.io.formats.printing import pprint_thing -from pandas.core.ops import _comp_method_OBJECT_ARRAY, make_invalid_op +from pandas.core.ops import make_invalid_op from pandas.core.config import get_option from pandas.core.strings import StringMethods @@ -82,6 +83,74 @@ def _try_get_item(x): return x +def _make_comparison_op(op, cls): + def cmp_method(self, other): + if isinstance(other, (np.ndarray, Index, ABCSeries)): + if other.ndim > 0 and len(self) != len(other): + raise ValueError('Lengths must match to compare') + + # we may need to directly compare underlying + # representations + if needs_i8_conversion(self) and needs_i8_conversion(other): + return self._evaluate_compare(other, op) + + if is_object_dtype(self) and self.nlevels == 1: + # don't pass MultiIndex + with np.errstate(all='ignore'): + result = ops._comp_method_OBJECT_ARRAY(op, self.values, other) + else: + with np.errstate(all='ignore'): + result = op(self.values, np.asarray(other)) + + # technically we could support bool dtyped Index + # for now just return the indexing array directly + if is_bool_dtype(result): + return result + try: + return Index(result) + except TypeError: + return result + + name = '__{name}__'.format(name=op.__name__) + # TODO: docstring? 
+ return set_function_name(cmp_method, name, cls) + + +def _make_arithmetic_op(op, cls): + def index_arithmetic_method(self, other): + if isinstance(other, (ABCSeries, ABCDataFrame)): + return NotImplemented + elif isinstance(other, ABCTimedeltaIndex): + # Defer to subclass implementation + return NotImplemented + + other = self._validate_for_numeric_binop(other, op) + + # handle time-based others + if isinstance(other, (ABCDateOffset, np.timedelta64, timedelta)): + return self._evaluate_with_timedelta_like(other, op) + elif isinstance(other, (datetime, np.datetime64)): + return self._evaluate_with_datetime_like(other, op) + + values = self.values + with np.errstate(all='ignore'): + result = op(values, other) + + result = missing.dispatch_missing(op, values, other, result) + + attrs = self._get_attributes_dict() + attrs = self._maybe_update_attributes(attrs) + if op is divmod: + result = (Index(result[0], **attrs), Index(result[1], **attrs)) + else: + result = Index(result, **attrs) + return result + + name = '__{name}__'.format(name=op.__name__) + # TODO: docstring? + return set_function_name(index_arithmetic_method, name, cls) + + class InvalidIndexError(Exception): pass @@ -2175,11 +2244,13 @@ def __add__(self, other): def __radd__(self, other): return Index(other + np.array(self)) - __iadd__ = __add__ + def __iadd__(self, other): + # alias for __add__ + return self + other def __sub__(self, other): raise TypeError("cannot perform __sub__ with this index type: " - "{typ}".format(typ=type(self))) + "{typ}".format(typ=type(self).__name__)) def __and__(self, other): return self.intersection(other) @@ -3917,13 +3988,11 @@ def dropna(self, how='any'): return self._shallow_copy(self.values[~self._isnan]) return self._shallow_copy() - def _evaluate_with_timedelta_like(self, other, op, opstr, reversed=False): + def _evaluate_with_timedelta_like(self, other, op): # Timedelta knows how to operate with np.array, so dispatch to that # operation and then wrap the results other = Timedelta(other) values = self.values - if reversed: - values, other = other, values with np.errstate(all='ignore'): result = op(values, other) @@ -3934,7 +4003,7 @@ def _evaluate_with_timedelta_like(self, other, op, opstr, reversed=False): return Index(result[0], **attrs), Index(result[1], **attrs) return Index(result, **attrs) - def _evaluate_with_datetime_like(self, other, op, opstr): + def _evaluate_with_datetime_like(self, other, op): raise TypeError("can only perform ops with datetime like values") def _evaluate_compare(self, other, op): @@ -3943,64 +4012,39 @@ def _evaluate_compare(self, other, op): @classmethod def _add_comparison_methods(cls): """ add in comparison methods """ - - def _make_compare(op): - def _evaluate_compare(self, other): - if isinstance(other, (np.ndarray, Index, ABCSeries)): - if other.ndim > 0 and len(self) != len(other): - raise ValueError('Lengths must match to compare') - - # we may need to directly compare underlying - # representations - if needs_i8_conversion(self) and needs_i8_conversion(other): - return self._evaluate_compare(other, op) - - if (is_object_dtype(self) and - self.nlevels == 1): - - # don't pass MultiIndex - with np.errstate(all='ignore'): - result = _comp_method_OBJECT_ARRAY( - op, self.values, other) - else: - with np.errstate(all='ignore'): - result = op(self.values, np.asarray(other)) - - # technically we could support bool dtyped Index - # for now just return the indexing array directly - if is_bool_dtype(result): - return result - try: - return Index(result) - except 
TypeError: - return result - - name = '__{name}__'.format(name=op.__name__) - return set_function_name(_evaluate_compare, name, cls) - - cls.__eq__ = _make_compare(operator.eq) - cls.__ne__ = _make_compare(operator.ne) - cls.__lt__ = _make_compare(operator.lt) - cls.__gt__ = _make_compare(operator.gt) - cls.__le__ = _make_compare(operator.le) - cls.__ge__ = _make_compare(operator.ge) + cls.__eq__ = _make_comparison_op(operator.eq, cls) + cls.__ne__ = _make_comparison_op(operator.ne, cls) + cls.__lt__ = _make_comparison_op(operator.lt, cls) + cls.__gt__ = _make_comparison_op(operator.gt, cls) + cls.__le__ = _make_comparison_op(operator.le, cls) + cls.__ge__ = _make_comparison_op(operator.ge, cls) @classmethod def _add_numeric_methods_add_sub_disabled(cls): """ add in the numeric add/sub methods to disable """ - cls.__add__ = cls.__radd__ = __iadd__ = make_invalid_op('__add__') # noqa - cls.__sub__ = __isub__ = make_invalid_op('__sub__') # noqa + cls.__add__ = make_invalid_op('__add__') + cls.__radd__ = make_invalid_op('__radd__') + cls.__iadd__ = make_invalid_op('__iadd__') + cls.__sub__ = make_invalid_op('__sub__') + cls.__rsub__ = make_invalid_op('__rsub__') + cls.__isub__ = make_invalid_op('__isub__') @classmethod def _add_numeric_methods_disabled(cls): """ add in numeric methods to disable other than add/sub """ cls.__pow__ = make_invalid_op('__pow__') cls.__rpow__ = make_invalid_op('__rpow__') - cls.__mul__ = cls.__rmul__ = make_invalid_op('__mul__') - cls.__floordiv__ = cls.__rfloordiv__ = make_invalid_op('__floordiv__') - cls.__truediv__ = cls.__rtruediv__ = make_invalid_op('__truediv__') + cls.__mul__ = make_invalid_op('__mul__') + cls.__rmul__ = make_invalid_op('__rmul__') + cls.__floordiv__ = make_invalid_op('__floordiv__') + cls.__rfloordiv__ = make_invalid_op('__rfloordiv__') + cls.__truediv__ = make_invalid_op('__truediv__') + cls.__rtruediv__ = make_invalid_op('__rtruediv__') if not compat.PY3: - cls.__div__ = cls.__rdiv__ = make_invalid_op('__div__') + cls.__div__ = make_invalid_op('__div__') + cls.__rdiv__ = make_invalid_op('__rdiv__') + cls.__mod__ = make_invalid_op('__mod__') + cls.__divmod__ = make_invalid_op('__divmod__') cls.__neg__ = make_invalid_op('__neg__') cls.__pos__ = make_invalid_op('__pos__') cls.__abs__ = make_invalid_op('__abs__') @@ -4015,34 +4059,29 @@ def _validate_for_numeric_unaryop(self, op, opstr): if not self._is_numeric_dtype: raise TypeError("cannot evaluate a numeric op " - "{opstr} for type: {typ}".format( - opstr=opstr, - typ=type(self)) - ) + "{opstr} for type: {typ}" + .format(opstr=opstr, typ=type(self).__name__)) - def _validate_for_numeric_binop(self, other, op, opstr): + def _validate_for_numeric_binop(self, other, op): """ return valid other, evaluate or raise TypeError if we are not of the appropriate type internal method called by ops """ + opstr = '__{opname}__'.format(opname=op.__name__) # if we are an inheritor of numeric, # but not actually numeric (e.g. 
DatetimeIndex/PeriodIndex) if not self._is_numeric_dtype: raise TypeError("cannot evaluate a numeric op {opstr} " - "for type: {typ}".format( - opstr=opstr, - typ=type(self)) - ) + "for type: {typ}" + .format(opstr=opstr, typ=type(self).__name__)) if isinstance(other, Index): if not other._is_numeric_dtype: raise TypeError("cannot evaluate a numeric op " - "{opstr} with type: {typ}".format( - opstr=type(self), - typ=type(other)) - ) + "{opstr} with type: {typ}" + .format(opstr=opstr, typ=type(other))) elif isinstance(other, np.ndarray) and not other.ndim: other = other.item() @@ -4054,11 +4093,10 @@ def _validate_for_numeric_binop(self, other, op, opstr): if other.dtype.kind not in ['f', 'i', 'u']: raise TypeError("cannot evaluate a numeric op " "with a non-numeric dtype") - elif isinstance(other, (ABCDateOffset, np.timedelta64, - datetime.timedelta)): + elif isinstance(other, (ABCDateOffset, np.timedelta64, timedelta)): # higher up to handle pass - elif isinstance(other, (Timestamp, np.datetime64)): + elif isinstance(other, (datetime, np.datetime64)): # higher up to handle pass else: @@ -4070,73 +4108,24 @@ def _validate_for_numeric_binop(self, other, op, opstr): @classmethod def _add_numeric_methods_binary(cls): """ add in numeric methods """ - - def _make_evaluate_binop(op, opstr, reversed=False, constructor=Index): - def _evaluate_numeric_binop(self, other): - if isinstance(other, (ABCSeries, ABCDataFrame)): - return NotImplemented - elif isinstance(other, ABCTimedeltaIndex): - # Defer to subclass implementation - return NotImplemented - - other = self._validate_for_numeric_binop(other, op, opstr) - - # handle time-based others - if isinstance(other, (ABCDateOffset, np.timedelta64, - datetime.timedelta)): - return self._evaluate_with_timedelta_like(other, op, opstr, - reversed) - elif isinstance(other, (Timestamp, np.datetime64)): - return self._evaluate_with_datetime_like(other, op, opstr) - - # if we are a reversed non-commutative op - values = self.values - if reversed: - values, other = other, values - - attrs = self._get_attributes_dict() - attrs = self._maybe_update_attributes(attrs) - with np.errstate(all='ignore'): - result = op(values, other) - - result = missing.dispatch_missing(op, values, other, result) - return constructor(result, **attrs) - - return _evaluate_numeric_binop - - cls.__add__ = cls.__radd__ = _make_evaluate_binop( - operator.add, '__add__') - cls.__sub__ = _make_evaluate_binop( - operator.sub, '__sub__') - cls.__rsub__ = _make_evaluate_binop( - operator.sub, '__sub__', reversed=True) - cls.__mul__ = cls.__rmul__ = _make_evaluate_binop( - operator.mul, '__mul__') - cls.__rpow__ = _make_evaluate_binop( - operator.pow, '__pow__', reversed=True) - cls.__pow__ = _make_evaluate_binop( - operator.pow, '__pow__') - cls.__mod__ = _make_evaluate_binop( - operator.mod, '__mod__') - cls.__floordiv__ = _make_evaluate_binop( - operator.floordiv, '__floordiv__') - cls.__rfloordiv__ = _make_evaluate_binop( - operator.floordiv, '__floordiv__', reversed=True) - cls.__truediv__ = _make_evaluate_binop( - operator.truediv, '__truediv__') - cls.__rtruediv__ = _make_evaluate_binop( - operator.truediv, '__truediv__', reversed=True) + cls.__add__ = _make_arithmetic_op(operator.add, cls) + cls.__radd__ = _make_arithmetic_op(ops.radd, cls) + cls.__sub__ = _make_arithmetic_op(operator.sub, cls) + cls.__rsub__ = _make_arithmetic_op(ops.rsub, cls) + cls.__mul__ = _make_arithmetic_op(operator.mul, cls) + cls.__rmul__ = _make_arithmetic_op(ops.rmul, cls) + cls.__rpow__ = 
_make_arithmetic_op(ops.rpow, cls) + cls.__pow__ = _make_arithmetic_op(operator.pow, cls) + cls.__mod__ = _make_arithmetic_op(operator.mod, cls) + cls.__floordiv__ = _make_arithmetic_op(operator.floordiv, cls) + cls.__rfloordiv__ = _make_arithmetic_op(ops.rfloordiv, cls) + cls.__truediv__ = _make_arithmetic_op(operator.truediv, cls) + cls.__rtruediv__ = _make_arithmetic_op(ops.rtruediv, cls) if not compat.PY3: - cls.__div__ = _make_evaluate_binop( - operator.div, '__div__') - cls.__rdiv__ = _make_evaluate_binop( - operator.div, '__div__', reversed=True) + cls.__div__ = _make_arithmetic_op(operator.div, cls) + cls.__rdiv__ = _make_arithmetic_op(ops.rdiv, cls) - cls.__divmod__ = _make_evaluate_binop( - divmod, - '__divmod__', - constructor=lambda result, **attrs: (Index(result[0], **attrs), - Index(result[1], **attrs))) + cls.__divmod__ = _make_arithmetic_op(divmod, cls) @classmethod def _add_numeric_methods_unary(cls): @@ -4153,8 +4142,8 @@ def _evaluate_numeric_unary(self): return _evaluate_numeric_unary - cls.__neg__ = _make_evaluate_unary(lambda x: -x, '__neg__') - cls.__pos__ = _make_evaluate_unary(lambda x: x, '__pos__') + cls.__neg__ = _make_evaluate_unary(operator.neg, '__neg__') + cls.__pos__ = _make_evaluate_unary(operator.pos, '__pos__') cls.__abs__ = _make_evaluate_unary(np.abs, '__abs__') cls.__inv__ = _make_evaluate_unary(lambda x: -x, '__inv__') diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 187f9fcf52dd4..ac75e5ae5e2a0 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -669,6 +669,7 @@ def __add__(self, other): result = self._add_offset_array(other) elif isinstance(self, TimedeltaIndex) and isinstance(other, Index): if hasattr(other, '_add_delta'): + # i.e. DatetimeIndex, TimedeltaIndex, or PeriodIndex result = other._add_delta(self) else: raise TypeError("cannot add TimedeltaIndex and {typ}" @@ -693,7 +694,11 @@ def __add__(self, other): return result cls.__add__ = __add__ - cls.__radd__ = __add__ + + def __radd__(self, other): + # alias for __add__ + return self.__add__(other) + cls.__radd__ = __radd__ def __sub__(self, other): from pandas.core.index import Index @@ -712,10 +717,10 @@ def __sub__(self, other): # Array/Index of DateOffset objects result = self._sub_offset_array(other) elif isinstance(self, TimedeltaIndex) and isinstance(other, Index): - if not isinstance(other, TimedeltaIndex): - raise TypeError("cannot subtract TimedeltaIndex and {typ}" - .format(typ=type(other).__name__)) - result = self._add_delta(-other) + # We checked above for timedelta64_dtype(other) so this + # must be invalid. 
+ raise TypeError("cannot subtract TimedeltaIndex and {typ}" + .format(typ=type(other).__name__)) elif isinstance(other, DatetimeIndex): result = self._sub_datelike(other) elif is_integer(other): @@ -747,8 +752,15 @@ def __rsub__(self, other): return -(self - other) cls.__rsub__ = __rsub__ - cls.__iadd__ = __add__ - cls.__isub__ = __sub__ + def __iadd__(self, other): + # alias for __add__ + return self.__add__(other) + cls.__iadd__ = __iadd__ + + def __isub__(self, other): + # alias for __sub__ + return self.__sub__(other) + cls.__isub__ = __isub__ def _add_delta(self, other): return NotImplemented diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index debeabf9bae23..17f92339e4205 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -100,10 +100,11 @@ def f(self): return property(f) -def _dt_index_cmp(opname, cls, nat_result=False): +def _dt_index_cmp(opname, cls): """ Wrap comparison operations to convert datetime-like to datetime64 """ + nat_result = True if opname == '__ne__' else False def wrapper(self, other): func = getattr(super(DatetimeIndex, self), opname) @@ -291,7 +292,7 @@ def _join_i8_wrapper(joinf, **kwargs): def _add_comparison_methods(cls): """ add in comparison methods """ cls.__eq__ = _dt_index_cmp('__eq__', cls) - cls.__ne__ = _dt_index_cmp('__ne__', cls, nat_result=True) + cls.__ne__ = _dt_index_cmp('__ne__', cls) cls.__lt__ = _dt_index_cmp('__lt__', cls) cls.__gt__ = _dt_index_cmp('__gt__', cls) cls.__le__ = _dt_index_cmp('__le__', cls) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 88f9297652ebf..4c14cbffcd813 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -76,26 +76,25 @@ def dt64arr_to_periodarr(data, freq, tz): _DIFFERENT_FREQ_INDEX = period._DIFFERENT_FREQ_INDEX -def _period_index_cmp(opname, cls, nat_result=False): +def _period_index_cmp(opname, cls): """ - Wrap comparison operations to convert datetime-like to datetime64 + Wrap comparison operations to convert Period-like to PeriodDtype """ + nat_result = True if opname == '__ne__' else False def wrapper(self, other): + op = getattr(self._ndarray_values, opname) if isinstance(other, Period): - func = getattr(self._ndarray_values, opname) - other_base, _ = _gfc(other.freq) if other.freq != self.freq: msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - result = func(other.ordinal) + result = op(other.ordinal) elif isinstance(other, PeriodIndex): if other.freq != self.freq: msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - op = getattr(self._ndarray_values, opname) result = op(other._ndarray_values) mask = self._isnan | other._isnan @@ -108,8 +107,7 @@ def wrapper(self, other): result.fill(nat_result) else: other = Period(other, freq=self.freq) - func = getattr(self._ndarray_values, opname) - result = func(other.ordinal) + result = op(other.ordinal) if self.hasnans: result[self._isnan] = nat_result @@ -231,7 +229,7 @@ class PeriodIndex(DatelikeOps, DatetimeIndexOpsMixin, Int64Index): def _add_comparison_methods(cls): """ add in comparison methods """ cls.__eq__ = _period_index_cmp('__eq__', cls) - cls.__ne__ = _period_index_cmp('__ne__', cls, nat_result=True) + cls.__ne__ = _period_index_cmp('__ne__', cls) cls.__lt__ = _period_index_cmp('__lt__', cls) cls.__gt__ = _period_index_cmp('__gt__', cls) cls.__le__ = _period_index_cmp('__le__', cls) diff --git a/pandas/core/indexes/range.py 
b/pandas/core/indexes/range.py index 0ac415ee0b701..9d770cffb0059 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -16,6 +16,7 @@ from pandas.compat.numpy import function as nv import pandas.core.common as com +from pandas.core import ops from pandas.core.indexes.base import Index, _index_shared_docs from pandas.util._decorators import Appender, cache_readonly import pandas.core.dtypes.concat as _concat @@ -570,16 +571,12 @@ def __floordiv__(self, other): def _add_numeric_methods_binary(cls): """ add in numeric methods, specialized to RangeIndex """ - def _make_evaluate_binop(op, opstr, reversed=False, step=False): + def _make_evaluate_binop(op, step=False): """ Parameters ---------- op : callable that accepts 2 parms perform the binary op - opstr : string - string name of ops - reversed : boolean, default False - if this is a reversed op, e.g. radd step : callable, optional, default to False op to apply to the step parm if not None if False, use the existing step @@ -594,17 +591,13 @@ def _evaluate_numeric_binop(self, other): elif isinstance(other, (timedelta, np.timedelta64)): # GH#19333 is_integer evaluated True on timedelta64, # so we need to catch these explicitly - if reversed: - return op(other, self._int64index) return op(self._int64index, other) - other = self._validate_for_numeric_binop(other, op, opstr) + other = self._validate_for_numeric_binop(other, op) attrs = self._get_attributes_dict() attrs = self._maybe_update_attributes(attrs) left, right = self, other - if reversed: - left, right = right, left try: # apply if we have an override @@ -638,43 +631,26 @@ def _evaluate_numeric_binop(self, other): return result - except (ValueError, TypeError, AttributeError, - ZeroDivisionError): + except (ValueError, TypeError, ZeroDivisionError): # Defer to Int64Index implementation - if reversed: - return op(other, self._int64index) return op(self._int64index, other) + # TODO: Do attrs get handled reliably? 
return _evaluate_numeric_binop - cls.__add__ = cls.__radd__ = _make_evaluate_binop( - operator.add, '__add__') - cls.__sub__ = _make_evaluate_binop(operator.sub, '__sub__') - cls.__rsub__ = _make_evaluate_binop( - operator.sub, '__sub__', reversed=True) - cls.__mul__ = cls.__rmul__ = _make_evaluate_binop( - operator.mul, - '__mul__', - step=operator.mul) - cls.__truediv__ = _make_evaluate_binop( - operator.truediv, - '__truediv__', - step=operator.truediv) - cls.__rtruediv__ = _make_evaluate_binop( - operator.truediv, - '__truediv__', - reversed=True, - step=operator.truediv) + cls.__add__ = _make_evaluate_binop(operator.add) + cls.__radd__ = _make_evaluate_binop(ops.radd) + cls.__sub__ = _make_evaluate_binop(operator.sub) + cls.__rsub__ = _make_evaluate_binop(ops.rsub) + cls.__mul__ = _make_evaluate_binop(operator.mul, step=operator.mul) + cls.__rmul__ = _make_evaluate_binop(ops.rmul, step=ops.rmul) + cls.__truediv__ = _make_evaluate_binop(operator.truediv, + step=operator.truediv) + cls.__rtruediv__ = _make_evaluate_binop(ops.rtruediv, + step=ops.rtruediv) if not compat.PY3: - cls.__div__ = _make_evaluate_binop( - operator.div, - '__div__', - step=operator.div) - cls.__rdiv__ = _make_evaluate_binop( - operator.div, - '__div__', - reversed=True, - step=operator.div) + cls.__div__ = _make_evaluate_binop(operator.div, step=operator.div) + cls.__rdiv__ = _make_evaluate_binop(ops.rdiv, step=ops.rdiv) RangeIndex._add_numeric_methods() diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 6b61db53d9a11..3542a24290f89 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -53,10 +53,11 @@ def f(self): return property(f) -def _td_index_cmp(opname, cls, nat_result=False): +def _td_index_cmp(opname, cls): """ Wrap comparison operations to convert timedelta-like to timedelta64 """ + nat_result = True if opname == '__ne__' else False def wrapper(self, other): msg = "cannot compare a TimedeltaIndex with type {0}" @@ -184,7 +185,7 @@ def _join_i8_wrapper(joinf, **kwargs): def _add_comparison_methods(cls): """ add in comparison methods """ cls.__eq__ = _td_index_cmp('__eq__', cls) - cls.__ne__ = _td_index_cmp('__ne__', cls, nat_result=True) + cls.__ne__ = _td_index_cmp('__ne__', cls) cls.__lt__ = _td_index_cmp('__lt__', cls) cls.__gt__ = _td_index_cmp('__gt__', cls) cls.__le__ = _td_index_cmp('__le__', cls) @@ -383,11 +384,12 @@ def _add_delta(self, delta): return TimedeltaIndex(new_values, freq='infer') - def _evaluate_with_timedelta_like(self, other, op, opstr, reversed=False): + def _evaluate_with_timedelta_like(self, other, op): if isinstance(other, ABCSeries): # GH#19042 return NotImplemented + opstr = '__{opname}__'.format(opname=op.__name__).replace('__r', '__') # allow division by a timedelta if opstr in ['__div__', '__truediv__', '__floordiv__']: if _is_convertible_to_td(other): @@ -398,11 +400,9 @@ def _evaluate_with_timedelta_like(self, other, op, opstr, reversed=False): i8 = self.asi8 left, right = i8, other.value - if reversed: - left, right = right, left if opstr in ['__floordiv__']: - result = left // right + result = op(left, right) else: result = op(left, np.float64(right)) result = self._maybe_mask_results(result, convert='float64') diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 1162662bf9a08..8f51dbabd5b71 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -127,16 +127,17 @@ def test_numeric_compat(self): idx = self.create_index() 
tm.assert_raises_regex(TypeError, "cannot perform __mul__", lambda: idx * 1) - tm.assert_raises_regex(TypeError, "cannot perform __mul__", + tm.assert_raises_regex(TypeError, "cannot perform __rmul__", lambda: 1 * idx) div_err = "cannot perform __truediv__" if PY3 \ else "cannot perform __div__" tm.assert_raises_regex(TypeError, div_err, lambda: idx / 1) + div_err = div_err.replace(' __', ' __r') tm.assert_raises_regex(TypeError, div_err, lambda: 1 / idx) tm.assert_raises_regex(TypeError, "cannot perform __floordiv__", lambda: idx // 1) - tm.assert_raises_regex(TypeError, "cannot perform __floordiv__", + tm.assert_raises_regex(TypeError, "cannot perform __rfloordiv__", lambda: 1 // idx) def test_logical_compat(self): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 90edcb526bb2e..d7f185853ca45 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -7,6 +7,7 @@ from collections import defaultdict import pandas.util.testing as tm +from pandas.core.dtypes.generic import ABCIndex from pandas.core.dtypes.common import is_unsigned_integer_dtype from pandas.core.indexes.api import Index, MultiIndex from pandas.tests.indexes.common import Base @@ -1988,6 +1989,17 @@ def test_addsub_arithmetic(self, dtype, delta): tm.assert_index_equal(idx - idx, 0 * idx) assert not (idx - idx).empty + def test_iadd_preserves_name(self): + # GH#17067, GH#19723 __iadd__ and __isub__ should preserve index name + ser = pd.Series([1, 2, 3]) + ser.index.name = 'foo' + + ser.index += 1 + assert ser.index.name == "foo" + + ser.index -= 1 + assert ser.index.name == "foo" + class TestMixedIntIndex(Base): # Mostly the tests from common.py for which the results differ @@ -2301,9 +2313,17 @@ def test_ensure_index_from_sequences(self, data, names, expected): tm.assert_index_equal(result, expected) -@pytest.mark.parametrize('opname', ['eq', 'ne', 'le', 'lt', 'ge', 'gt']) +@pytest.mark.parametrize('opname', ['eq', 'ne', 'le', 'lt', 'ge', 'gt', + 'add', 'radd', 'sub', 'rsub', + 'mul', 'rmul', 'truediv', 'rtruediv', + 'floordiv', 'rfloordiv', + 'pow', 'rpow', 'mod', 'divmod']) def test_generated_op_names(opname, indices): index = indices + if isinstance(index, ABCIndex) and opname == 'rsub': + # pd.Index.__rsub__ does not exist; though the method does exist + # for subclasses. see GH#19723 + return opname = '__{name}__'.format(name=opname) method = getattr(index, opname) assert method.__name__ == opname From 0176f6e146d5eef3c1a7557e71bfdfbf939b6ffa Mon Sep 17 00:00:00 2001 From: Tommy <10076072+tommyod@users.noreply.github.com> Date: Fri, 23 Feb 2018 12:46:11 +0100 Subject: [PATCH 180/214] DOC: Spellcheck of gotchas.rst (FAQ page) (#19747) --- ci/lint.sh | 1 + doc/source/gotchas.rst | 114 +++++++++++++++++++++-------------------- 2 files changed, 60 insertions(+), 55 deletions(-) diff --git a/ci/lint.sh b/ci/lint.sh index e3a39668885f0..fcd65fc5aba5e 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -156,6 +156,7 @@ if [ "$LINT" ]; then RET=1 fi echo "Check for deprecated messages without sphinx directive DONE" + else echo "NOT Linting" fi diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst index bc490877e190d..b7042ef390018 100644 --- a/doc/source/gotchas.rst +++ b/doc/source/gotchas.rst @@ -22,22 +22,22 @@ Frequently Asked Questions (FAQ) DataFrame memory usage ---------------------- -The memory usage of a dataframe (including the index) -is shown when accessing the ``info`` method of a dataframe. 
A -configuration option, ``display.memory_usage`` (see :ref:`options`), -specifies if the dataframe's memory usage will be displayed when -invoking the ``df.info()`` method. +The memory usage of a ``DataFrame`` (including the index) is shown when calling +the :meth:`~DataFrame.info`. A configuration option, ``display.memory_usage`` +(see :ref:`the list of options `), specifies if the +``DataFrame``'s memory usage will be displayed when invoking the ``df.info()`` +method. -For example, the memory usage of the dataframe below is shown -when calling ``df.info()``: +For example, the memory usage of the ``DataFrame`` below is shown +when calling :meth:`~DataFrame.info`: .. ipython:: python dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]', 'complex128', 'object', 'bool'] n = 5000 - data = dict([ (t, np.random.randint(100, size=n).astype(t)) - for t in dtypes]) + data = dict([(t, np.random.randint(100, size=n).astype(t)) + for t in dtypes]) df = pd.DataFrame(data) df['categorical'] = df['object'].astype('category') @@ -48,7 +48,7 @@ pandas does not count the memory used by values in columns with ``dtype=object``. Passing ``memory_usage='deep'`` will enable a more accurate memory usage report, -that accounts for the full usage of the contained objects. This is optional +accounting for the full usage of the contained objects. This is optional as it can be expensive to do this deeper introspection. .. ipython:: python @@ -58,11 +58,11 @@ as it can be expensive to do this deeper introspection. By default the display option is set to ``True`` but can be explicitly overridden by passing the ``memory_usage`` argument when invoking ``df.info()``. -The memory usage of each column can be found by calling the ``memory_usage`` -method. This returns a Series with an index represented by column names -and memory usage of each column shown in bytes. For the dataframe above, -the memory usage of each column and the total memory usage of the -dataframe can be found with the memory_usage method: +The memory usage of each column can be found by calling the +:meth:`~DataFrame.memory_usage` method. This returns a ``Series`` with an index +represented by column names and memory usage of each column shown in bytes. For +the ``DataFrame`` above, the memory usage of each column and the total memory +usage can be found with the ``memory_usage`` method: .. ipython:: python @@ -71,18 +71,18 @@ dataframe can be found with the memory_usage method: # total memory usage of dataframe df.memory_usage().sum() -By default the memory usage of the dataframe's index is shown in the -returned Series, the memory usage of the index can be suppressed by passing +By default the memory usage of the ``DataFrame``'s index is shown in the +returned ``Series``, the memory usage of the index can be suppressed by passing the ``index=False`` argument: .. ipython:: python df.memory_usage(index=False) -The memory usage displayed by the ``info`` method utilizes the -``memory_usage`` method to determine the memory usage of a dataframe -while also formatting the output in human-readable units (base-2 -representation; i.e., 1KB = 1024 bytes). +The memory usage displayed by the :meth:`~DataFrame.info` method utilizes the +:meth:`~DataFrame.memory_usage` method to determine the memory usage of a +``DataFrame`` while also formatting the output in human-readable units (base-2 +representation; i.e. 1KB = 1024 bytes). See also :ref:`Categorical Memory Usage `. @@ -91,17 +91,18 @@ See also :ref:`Categorical Memory Usage `. 
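For ``object`` columns in particular, the shallow and deep measurements can differ by a
large factor, since only the deep variant follows the pointers to the underlying Python
objects. A rough sketch of the comparison (the column names below are purely illustrative):

.. code-block:: python

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'ints': np.arange(1000),
                       'strings': ['value-{}'.format(i) for i in range(1000)]})

    # shallow: the object column only counts its array of pointers
    df.memory_usage()

    # deep: additionally counts each underlying Python string object
    df.memory_usage(deep=True)
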
Using If/Truth Statements with pandas ------------------------------------- -pandas follows the NumPy convention of raising an error when you try to convert something to a ``bool``. -This happens in a ``if`` or when using the boolean operations, ``and``, ``or``, or ``not``. It is not clear -what the result of +pandas follows the NumPy convention of raising an error when you try to convert +something to a ``bool``. This happens in an ``if``-statement or when using the +boolean operations: ``and``, ``or``, and ``not``. It is not clear what the result +of the following code should be: .. code-block:: python >>> if pd.Series([False, True, False]): ... -should be. Should it be ``True`` because it's not zero-length? ``False`` because there are ``False`` values? -It is unclear, so instead, pandas raises a ``ValueError``: +Should it be ``True`` because it's not zero-length, or ``False`` because there +are ``False`` values? It is unclear, so instead, pandas raises a ``ValueError``: .. code-block:: python @@ -111,9 +112,9 @@ It is unclear, so instead, pandas raises a ``ValueError``: ... ValueError: The truth value of an array is ambiguous. Use a.empty, a.any() or a.all(). - -If you see that, you need to explicitly choose what you want to do with it (e.g., use `any()`, `all()` or `empty`). -or, you might want to compare if the pandas object is ``None`` +You need to explicitly choose what you want to do with the ``DataFrame``, e.g. +use :meth:`~DataFrame.any`, :meth:`~DataFrame.all` or :meth:`~DataFrame.empty`. +Alternatively, you might want to compare if the pandas object is ``None``: .. code-block:: python @@ -122,7 +123,7 @@ or, you might want to compare if the pandas object is ``None`` >>> I was not None -or return if ``any`` value is ``True``. +Below is how to check if any of the values are ``True``: .. code-block:: python @@ -130,7 +131,8 @@ or return if ``any`` value is ``True``. print("I am any") >>> I am any -To evaluate single-element pandas objects in a boolean context, use the method ``.bool()``: +To evaluate single-element pandas objects in a boolean context, use the method +:meth:`~DataFrame.bool`: .. ipython:: python @@ -161,25 +163,25 @@ See :ref:`boolean comparisons` for more examples. Using the ``in`` operator ~~~~~~~~~~~~~~~~~~~~~~~~~ -Using the Python ``in`` operator on a Series tests for membership in the +Using the Python ``in`` operator on a ``Series`` tests for membership in the index, not membership among the values. -.. ipython:: +.. ipython:: python s = pd.Series(range(5), index=list('abcde')) 2 in s 'b' in s If this behavior is surprising, keep in mind that using ``in`` on a Python -dictionary tests keys, not values, and Series are dict-like. -To test for membership in the values, use the method :func:`~pandas.Series.isin`: +dictionary tests keys, not values, and ``Series`` are dict-like. +To test for membership in the values, use the method :meth:`~pandas.Series.isin`: -.. ipython:: +.. ipython:: python s.isin([2]) s.isin([2]).any() -For DataFrames, likewise, ``in`` applies to the column axis, +For ``DataFrames``, likewise, ``in`` applies to the column axis, testing for membership in the list of column names. 
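The same asymmetry applies to a ``DataFrame``: ``in`` looks at the column labels, so
testing for a value means going through a column explicitly. A small sketch (the frame
below is only illustrative):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({'A': [1, 2, 3], 'B': list('xyz')})

    'A' in df                  # True, 'A' is a column label
    1 in df                    # False, values are not searched
    df['A'].isin([1]).any()    # True, membership tested against the values
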
``NaN``, Integer ``NA`` values and ``NA`` type promotions @@ -189,12 +191,12 @@ Choice of ``NA`` representation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ For lack of ``NA`` (missing) support from the ground up in NumPy and Python in -general, we were given the difficult choice between either +general, we were given the difficult choice between either: - A *masked array* solution: an array of data and an array of boolean values - indicating whether a value is there or is missing + indicating whether a value is there or is missing. - Using a special sentinel value, bit pattern, or set of sentinel values to - denote ``NA`` across the dtypes + denote ``NA`` across the dtypes. For many reasons we chose the latter. After years of production use it has proven, at least in my opinion, to be the best decision given the state of @@ -226,15 +228,16 @@ arrays. For example: s2.dtype This trade-off is made largely for memory and performance reasons, and also so -that the resulting Series continues to be "numeric". One possibility is to use -``dtype=object`` arrays instead. +that the resulting ``Series`` continues to be "numeric". One possibility is to +use ``dtype=object`` arrays instead. ``NA`` type promotions ~~~~~~~~~~~~~~~~~~~~~~ -When introducing NAs into an existing Series or DataFrame via ``reindex`` or -some other means, boolean and integer types will be promoted to a different -dtype in order to store the NAs. These are summarized by this table: +When introducing NAs into an existing ``Series`` or ``DataFrame`` via +:meth:`~Series.reindex` or some other means, boolean and integer types will be +promoted to a different dtype in order to store the NAs. The promotions are +summarized in this table: .. csv-table:: :header: "Typeclass","Promotion dtype for storing NAs" @@ -289,19 +292,19 @@ integer arrays to floating when NAs must be introduced. Differences with NumPy ---------------------- -For Series and DataFrame objects, ``var`` normalizes by ``N-1`` to produce -unbiased estimates of the sample variance, while NumPy's ``var`` normalizes -by N, which measures the variance of the sample. Note that ``cov`` -normalizes by ``N-1`` in both pandas and NumPy. +For ``Series`` and ``DataFrame`` objects, :meth:`~DataFrame.var` normalizes by +``N-1`` to produce unbiased estimates of the sample variance, while NumPy's +``var`` normalizes by N, which measures the variance of the sample. Note that +:meth:`~DataFrame.cov` normalizes by ``N-1`` in both pandas and NumPy. Thread-safety ------------- As of pandas 0.11, pandas is not 100% thread safe. The known issues relate to -the ``DataFrame.copy`` method. If you are doing a lot of copying of DataFrame -objects shared among threads, we recommend holding locks inside the threads -where the data copying occurs. +the :meth:`~DataFrame.copy` method. If you are doing a lot of copying of +``DataFrame`` objects shared among threads, we recommend holding locks inside +the threads where the data copying occurs. See `this link `__ for more information. @@ -310,7 +313,8 @@ for more information. Byte-Ordering Issues -------------------- Occasionally you may have to deal with data that were created on a machine with -a different byte order than the one on which you are running Python. A common symptom of this issue is an error like +a different byte order than the one on which you are running Python. A common +symptom of this issue is an error like: .. code-block:: python @@ -320,8 +324,8 @@ a different byte order than the one on which you are running Python. 
A common sy To deal with this issue you should convert the underlying NumPy array to the native -system byte order *before* passing it to Series/DataFrame/Panel constructors -using something similar to the following: +system byte order *before* passing it to ``Series`` or ``DataFrame`` +constructors using something similar to the following: .. ipython:: python From 01e99decf14b55409cea0789ffcc615afed45bac Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 23 Feb 2018 05:47:23 -0600 Subject: [PATCH 181/214] ENH: Allow storing ExtensionArrays in containers (#19520) * ENH: non-interval changes * COMPAT: py2 Super * BUG: Use original object for extension array * Consistent boxing / unboxing NumPy compat * 32-bit compat * Add a test array * linting * Default __iter__ * Tests for value_counts * Implement value_counts * Py2 compat * Fixed dropna * Test fixups * Started setitem * REF/Clean: Internal / External values * Move to index base * Setitem tests, decimal example * Compat * Fixed extension block tests. The only "API change" was that you can't just inherit from NonConsolidatableMixin, which is OK since 1. it's a mixin 2. geopandas also inherits from Block * Clarify binop tests Make it clearer which bit might raise * TST: Removed ops tests * Cleanup unique handling * Simplify object concat * Use values for intersection I think eventually we'll want to ndarray_values for this, but it'll require a bit more work to support. Currently, using ndarary_values causes occasional failures on categorical. * hmm * More failing tests * remove bad test * better setitem * Dropna works. * Restore xfail test * Test Categorical * Xfail setitem tests * TST: Skip JSON tests on py2 * Additional testing * More tests * ndarray_values * API: Default ExtensionArray.astype (cherry picked from commit 943a915562b72bed147c857de927afa0daf31c1a) (cherry picked from commit fbf0a0672380e210d3cb3c527fa8045a204d81be) * Simplify concat_as_object * Py2 compat (cherry picked from commit b20e12cae68dd86ff51597464045656763d369f7) * Set-ops ugliness * better docstrings * tolist * linting * Moved dtypes (cherry picked from commit d1362271bca8a7b183f3241e5c2f040c422118b8) * clean * cleanup * NumPy compat * Use base _values for CategoricalIndex * Update dev docs * cleanup * cleanup (cherry picked from commit 242562108b099b4e7a205541ee15b9272dcb5265) * cleanup * Linting * Precision in tests * Linting * Move to extension * Push _ndarray_values to ExtensionArray Now IndexOpsMixin._ndarray_values will dispatch all the way down to the EA. Subclasses like Categorical can override it as they see fit. * Clean up tolist * Move test locations * Fixed test * REF: Update per comments * lint * REF: Use _values for size and shape * PERF: Implement size, shape for IntervalIndex * PERF: Avoid materializing values for PeriodIndex shape, size * Cleanup * Override nbytes * Remove unused change * Docs * Test cleanpu * Always set PANDAS_TESTING_MODE * Revert "Always set PANDAS_TESTING_MODE" This reverts commit a312ba5c59c2e96854a286bde74d7fd4562afbf8. * Explicitly catch warnings or not * fastparquet warnings * Unicode literals strikes again. Only catch fp warning for newer numpy * Restore circle env var * More parquet test catching * No stacklevel * Lower bound on FP * Exact bound for FP * Don't use fastpath for ExtensionBlock make_block * Consistently use _values * TST: Additional constructor tests * CLN: de-nested a bit * _fill_value handling * Handle user provided dtype in constructors. When the dtype matches, we allow it to proceed. 
When the dtype would require coercion, we raise. * Document ExtensionBlock._maybe_coerce_values Also changes to use _values as we should * Created ABCExtensionArray * TST: Tests for is_object_dtype and is_string_dtype and EAs * fixup! Handle user provided dtype in constructors. * Doc for setitem * Split base tests * Revert test_parquet changes * API: Removed _fill_value from the interface * Push coercion to extension dtype till later * Linting * ERR: Better error message for coercion to 3rd party dtypes * CLN: Make take_nd EA aware * Revert sparse changes * Other _typ for ABCExtensionArray * Test cleanup and expansion. Tests for concating and aligning frames * Copy if copy * TST: remove self param for fixture * Remove unnescessary EA handling in Series ctor * API: Removed value_counts Moved setitem notes to comment * More doc notes * Handle expanding a DataFrame with an EA * Added ExtensionDtype.__eq__ Support for astype * linting * REF: is_dtype_equal refactor Moved from PandasExtensionDtype to ExtensionDtype with one modification: catch TypeError explicitly. * Remove reference to dtype being a class * move * Moved sparse check to take_nd * Docstring * Split tests * Revert index change * Copy changes * Simplify EA implementation names comments for object vs. str missing values * Linting --- pandas/core/algorithms.py | 26 ++- pandas/core/arrays/base.py | 90 +++++++--- pandas/core/dtypes/base.py | 57 +++++-- pandas/core/dtypes/common.py | 2 +- pandas/core/dtypes/dtypes.py | 25 --- pandas/core/dtypes/generic.py | 2 + pandas/core/dtypes/missing.py | 53 +++--- pandas/core/frame.py | 21 ++- pandas/core/indexes/base.py | 4 +- pandas/core/indexing.py | 3 + pandas/core/internals.py | 66 ++++++-- pandas/core/series.py | 54 ++++-- pandas/tests/categorical/test_missing.py | 3 +- pandas/tests/extension/base/__init__.py | 42 +++++ pandas/tests/extension/base/casting.py | 11 ++ pandas/tests/extension/base/constructors.py | 43 +++++ pandas/tests/extension/base/dtype.py | 46 ++++++ pandas/tests/extension/base/getitem.py | 119 ++++++++++++++ pandas/tests/extension/base/interface.py | 53 ++++++ pandas/tests/extension/base/methods.py | 32 ++++ pandas/tests/extension/base/missing.py | 45 +++++ pandas/tests/extension/base/reshaping.py | 61 +++++++ pandas/tests/extension/category/__init__.py | 0 .../extension/category/test_categorical.py | 84 ++++++++++ pandas/tests/extension/conftest.py | 48 ++++++ pandas/tests/extension/decimal/__init__.py | 0 pandas/tests/extension/decimal/array.py | 86 ++++++++++ .../tests/extension/decimal/test_decimal.py | 154 ++++++++++++++++++ pandas/tests/extension/json/__init__.py | 0 pandas/tests/extension/json/array.py | 99 +++++++++++ pandas/tests/extension/json/test_json.py | 73 +++++++++ .../test_external_block.py | 4 +- 32 files changed, 1276 insertions(+), 130 deletions(-) create mode 100644 pandas/tests/extension/base/__init__.py create mode 100644 pandas/tests/extension/base/casting.py create mode 100644 pandas/tests/extension/base/constructors.py create mode 100644 pandas/tests/extension/base/dtype.py create mode 100644 pandas/tests/extension/base/getitem.py create mode 100644 pandas/tests/extension/base/interface.py create mode 100644 pandas/tests/extension/base/methods.py create mode 100644 pandas/tests/extension/base/missing.py create mode 100644 pandas/tests/extension/base/reshaping.py create mode 100644 pandas/tests/extension/category/__init__.py create mode 100644 pandas/tests/extension/category/test_categorical.py create mode 100644 pandas/tests/extension/conftest.py 
create mode 100644 pandas/tests/extension/decimal/__init__.py create mode 100644 pandas/tests/extension/decimal/array.py create mode 100644 pandas/tests/extension/decimal/test_decimal.py create mode 100644 pandas/tests/extension/json/__init__.py create mode 100644 pandas/tests/extension/json/array.py create mode 100644 pandas/tests/extension/json/test_json.py rename pandas/tests/{internals => extension}/test_external_block.py (94%) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 624045a3d64bc..d616e3f92aa4d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -15,11 +15,12 @@ is_unsigned_integer_dtype, is_signed_integer_dtype, is_integer_dtype, is_complex_dtype, is_object_dtype, + is_extension_array_dtype, is_categorical_dtype, is_sparse, is_period_dtype, is_numeric_dtype, is_float_dtype, is_bool_dtype, needs_i8_conversion, - is_categorical, is_datetimetz, + is_datetimetz, is_datetime64_any_dtype, is_datetime64tz_dtype, is_timedelta64_dtype, is_interval_dtype, is_scalar, is_list_like, @@ -547,7 +548,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, if is_categorical_dtype(values) or is_sparse(values): # handle Categorical and sparse, - result = Series(values).values.value_counts(dropna=dropna) + result = Series(values)._values.value_counts(dropna=dropna) result.name = name counts = result.values @@ -1292,10 +1293,13 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, """ Specialized Cython take which sets NaN values in one pass + This dispatches to ``take`` defined on ExtensionArrays. It does not + currently dispatch to ``SparseArray.take`` for sparse ``arr``. + Parameters ---------- - arr : ndarray - Input array + arr : array-like + Input array. indexer : ndarray 1-D array of indices to take, subarrays corresponding to -1 value indicies are filed with fill_value @@ -1315,17 +1319,25 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, If False, indexer is assumed to contain no -1 values so no filling will be done. This short-circuits computation of a mask. Result is undefined if allow_fill == False and -1 is present in indexer. + + Returns + ------- + subarray : array-like + May be the same type as the input, or cast to an ndarray. """ + # TODO(EA): Remove these if / elifs as datetimeTZ, interval, become EAs # dispatch to internal type takes - if is_categorical(arr): - return arr.take_nd(indexer, fill_value=fill_value, - allow_fill=allow_fill) + if is_extension_array_dtype(arr): + return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) elif is_datetimetz(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) elif is_interval_dtype(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) + if is_sparse(arr): + arr = arr.get_values() + if indexer is None: indexer = np.arange(arr.shape[axis], dtype=np.int64) dtype, fill_value = arr.dtype, arr.dtype.type() diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index e618dc6b69b2d..cec881394a021 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -25,14 +25,13 @@ class ExtensionArray(object): * isna * take * copy - * _formatting_values * _concat_same_type - Some additional methods are required to satisfy pandas' internal, private + Some additional methods are available to satisfy pandas' internal, private block API. 
- * _concat_same_type * _can_hold_na + * _formatting_values This class does not inherit from 'abc.ABCMeta' for performance reasons. Methods and properties required by the interface raise @@ -53,13 +52,14 @@ class ExtensionArray(object): Extension arrays should be able to be constructed with instances of the class, i.e. ``ExtensionArray(extension_array)`` should return an instance, not error. - - Additionally, certain methods and interfaces are required for proper - this array to be properly stored inside a ``DataFrame`` or ``Series``. """ + # '_typ' is for pandas.core.dtypes.generic.ABCExtensionArray. + # Don't override this. + _typ = 'extension' # ------------------------------------------------------------------------ # Must be a Sequence # ------------------------------------------------------------------------ + def __getitem__(self, item): # type (Any) -> Any """Select a subset of self. @@ -92,7 +92,46 @@ def __getitem__(self, item): raise AbstractMethodError(self) def __setitem__(self, key, value): - # type: (Any, Any) -> None + # type: (Union[int, np.ndarray], Any) -> None + """Set one or more values inplace. + + This method is not required to satisfy the pandas extension array + interface. + + Parameters + ---------- + key : int, ndarray, or slice + When called from, e.g. ``Series.__setitem__``, ``key`` will be + one of + + * scalar int + * ndarray of integers. + * boolean ndarray + * slice object + + value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object + value or values to be set of ``key``. + + Returns + ------- + None + """ + # Some notes to the ExtensionArray implementor who may have ended up + # here. While this method is not required for the interface, if you + # *do* choose to implement __setitem__, then some semantics should be + # observed: + # + # * Setting multiple values : ExtensionArrays should support setting + # multiple values at once, 'key' will be a sequence of integers and + # 'value' will be a same-length sequence. + # + # * Broadcasting : For a sequence 'key' and a scalar 'value', + # each position in 'key' should be set to 'value'. + # + # * Coercion : Most users will expect basic coercion to work. For + # example, a string like '2018-01-01' is coerced to a datetime + # when setting on a datetime64ns array. In general, if the + # __init__ method coerces that value, then so should __setitem__ raise NotImplementedError(_not_implemented_message.format( type(self), '__setitem__') ) @@ -107,6 +146,16 @@ def __len__(self): # type: () -> int raise AbstractMethodError(self) + def __iter__(self): + """Iterate over elements of the array. + + """ + # This needs to be implemented so that pandas recognizes extension + # arrays as list-like. The default implementation makes successive + # calls to ``__getitem__``, which may be slower than necessary. + for i in range(len(self)): + yield self[i] + # ------------------------------------------------------------------------ # Required attributes # ------------------------------------------------------------------------ @@ -132,9 +181,9 @@ def nbytes(self): # type: () -> int """The number of bytes needed to store this object in memory. - If this is expensive to compute, return an approximate lower bound - on the number of bytes needed. """ + # If this is expensive to compute, return an approximate lower bound + # on the number of bytes needed. 
raise AbstractMethodError(self) # ------------------------------------------------------------------------ @@ -184,8 +233,8 @@ def take(self, indexer, allow_fill=True, fill_value=None): will be done. This short-circuits computation of a mask. Result is undefined if allow_fill == False and -1 is present in indexer. fill_value : any, default None - Fill value to replace -1 values with. By default, this uses - the missing value sentinel for this type, ``self._fill_value``. + Fill value to replace -1 values with. If applicable, this should + use the sentinel missing value for this type. Notes ----- @@ -198,17 +247,20 @@ def take(self, indexer, allow_fill=True, fill_value=None): Examples -------- - Suppose the extension array somehow backed by a NumPy structured array - and that the underlying structured array is stored as ``self.data``. - Then ``take`` may be written as + Suppose the extension array is backed by a NumPy array stored as + ``self.data``. Then ``take`` may be written as .. code-block:: python def take(self, indexer, allow_fill=True, fill_value=None): mask = indexer == -1 result = self.data.take(indexer) - result[mask] = self._fill_value + result[mask] = np.nan # NA for this type return type(self)(result) + + See Also + -------- + numpy.take """ raise AbstractMethodError(self) @@ -230,17 +282,12 @@ def copy(self, deep=False): # ------------------------------------------------------------------------ # Block-related methods # ------------------------------------------------------------------------ - @property - def _fill_value(self): - # type: () -> Any - """The missing value for this type, e.g. np.nan""" - return None def _formatting_values(self): # type: () -> np.ndarray # At the moment, this has to be an array since we use result.dtype """An array of values to be printed in, e.g. the Series repr""" - raise AbstractMethodError(self) + return np.array(self) @classmethod def _concat_same_type(cls, to_concat): @@ -257,6 +304,7 @@ def _concat_same_type(cls, to_concat): """ raise AbstractMethodError(cls) + @property def _can_hold_na(self): # type: () -> bool """Whether your array can hold missing values. True by default. diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index c7c5378801f02..d54d980d02ffa 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -1,4 +1,7 @@ """Extend pandas with custom array types""" +import numpy as np + +from pandas import compat from pandas.errors import AbstractMethodError @@ -23,6 +26,32 @@ class ExtensionDtype(object): def __str__(self): return self.name + def __eq__(self, other): + """Check whether 'other' is equal to self. + + By default, 'other' is considered equal if + + * it's a string matching 'self.name'. + * it's an instance of this type. + + Parameters + ---------- + other : Any + + Returns + ------- + bool + """ + if isinstance(other, compat.string_types): + return other == self.name + elif isinstance(other, type(self)): + return True + else: + return False + + def __ne__(self, other): + return not self.__eq__(other) + @property def type(self): # type: () -> type @@ -102,11 +131,12 @@ def construct_from_string(cls, string): @classmethod def is_dtype(cls, dtype): - """Check if we match 'dtype' + """Check if we match 'dtype'. Parameters ---------- - dtype : str or dtype + dtype : object + The object to check. Returns ------- @@ -118,12 +148,19 @@ def is_dtype(cls, dtype): 1. ``cls.construct_from_string(dtype)`` is an instance of ``cls``. - 2. 'dtype' is ``cls`` or a subclass of ``cls``. + 2. 
``dtype`` is an object and is an instance of ``cls`` + 3. ``dtype`` has a ``dtype`` attribute, and any of the above + conditions is true for ``dtype.dtype``. """ - if isinstance(dtype, str): - try: - return isinstance(cls.construct_from_string(dtype), cls) - except TypeError: - return False - else: - return issubclass(dtype, cls) + dtype = getattr(dtype, 'dtype', dtype) + + if isinstance(dtype, np.dtype): + return False + elif dtype is None: + return False + elif isinstance(dtype, cls): + return True + try: + return cls.construct_from_string(dtype) is not None + except TypeError: + return False diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index c2b71bc316fe8..197b35de88896 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1708,9 +1708,9 @@ def is_extension_array_dtype(arr_or_dtype): """ from pandas.core.arrays import ExtensionArray - # we want to unpack series, anything else? if isinstance(arr_or_dtype, (ABCIndexClass, ABCSeries)): arr_or_dtype = arr_or_dtype._values + return isinstance(arr_or_dtype, (ExtensionDtype, ExtensionArray)) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 99e4033f104db..d262a71933915 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -66,13 +66,6 @@ def __hash__(self): raise NotImplementedError("sub-classes should implement an __hash__ " "method") - def __eq__(self, other): - raise NotImplementedError("sub-classes should implement an __eq__ " - "method") - - def __ne__(self, other): - return not self.__eq__(other) - def __getstate__(self): # pickle support; we don't want to pickle the cache return {k: getattr(self, k, None) for k in self._metadata} @@ -82,24 +75,6 @@ def reset_cache(cls): """ clear the cache """ cls._cache = {} - @classmethod - def is_dtype(cls, dtype): - """ Return a boolean if the passed type is an actual dtype that - we can match (via string or type) - """ - if hasattr(dtype, 'dtype'): - dtype = dtype.dtype - if isinstance(dtype, np.dtype): - return False - elif dtype is None: - return False - elif isinstance(dtype, cls): - return True - try: - return cls.construct_from_string(dtype) is not None - except: - return False - class CategoricalDtypeType(type): """ diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index b032cb6f14d4c..cb54c94d29205 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -57,6 +57,8 @@ def _check(cls, inst): ABCDateOffset = create_pandas_abc_type("ABCDateOffset", "_typ", ("dateoffset",)) ABCInterval = create_pandas_abc_type("ABCInterval", "_typ", ("interval", )) +ABCExtensionArray = create_pandas_abc_type("ABCExtensionArray", "_typ", + ("extension", "categorical",)) class _ABCGeneric(type): diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index ffac702476af1..01c88c269e7e0 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -5,14 +5,16 @@ from pandas._libs import lib, missing as libmissing from pandas._libs.tslib import NaT, iNaT from .generic import (ABCMultiIndex, ABCSeries, - ABCIndexClass, ABCGeneric) + ABCIndexClass, ABCGeneric, + ABCExtensionArray) from .common import (is_string_dtype, is_datetimelike, is_datetimelike_v_numeric, is_float_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_timedelta64_dtype, is_interval_dtype, - is_complex_dtype, is_categorical_dtype, + is_complex_dtype, is_string_like_dtype, is_bool_dtype, is_integer_dtype, is_dtype_equal, + is_extension_array_dtype, 
needs_i8_conversion, _ensure_object, pandas_dtype, is_scalar, @@ -57,7 +59,8 @@ def _isna_new(obj): # hack (for now) because MI registers as ndarray elif isinstance(obj, ABCMultiIndex): raise NotImplementedError("isna is not defined for MultiIndex") - elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)): + elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass, + ABCExtensionArray)): return _isna_ndarraylike(obj) elif isinstance(obj, ABCGeneric): return obj._constructor(obj._data.isna(func=isna)) @@ -124,30 +127,31 @@ def _use_inf_as_na(key): def _isna_ndarraylike(obj): - values = getattr(obj, 'values', obj) dtype = values.dtype - if is_string_dtype(dtype): - if is_categorical_dtype(values): - from pandas import Categorical - if not isinstance(values, Categorical): - values = values.values - result = values.isna() - elif is_interval_dtype(values): - from pandas import IntervalIndex - result = IntervalIndex(obj).isna() + if is_extension_array_dtype(obj): + if isinstance(obj, (ABCIndexClass, ABCSeries)): + values = obj._values else: + values = obj + result = values.isna() + elif is_interval_dtype(values): + # TODO(IntervalArray): remove this if block + from pandas import IntervalIndex + result = IntervalIndex(obj).isna() + elif is_string_dtype(dtype): + # Working around NumPy ticket 1542 + shape = values.shape - # Working around NumPy ticket 1542 - shape = values.shape - - if is_string_like_dtype(dtype): - result = np.zeros(values.shape, dtype=bool) - else: - result = np.empty(shape, dtype=bool) - vec = libmissing.isnaobj(values.ravel()) - result[...] = vec.reshape(shape) + if is_string_like_dtype(dtype): + # object array of strings + result = np.zeros(values.shape, dtype=bool) + else: + # object array of non-strings + result = np.empty(shape, dtype=bool) + vec = libmissing.isnaobj(values.ravel()) + result[...] = vec.reshape(shape) elif needs_i8_conversion(obj): # this is the NaT pattern @@ -406,4 +410,7 @@ def remove_na_arraylike(arr): """ Return array-like containing only true/non-NaN values, possibly empty. 
""" - return arr[notna(lib.values_from_object(arr))] + if is_extension_array_dtype(arr): + return arr[notna(arr)] + else: + return arr[notna(lib.values_from_object(arr))] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2aae4dffbeaaf..1c5cf87d6b39b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -39,6 +39,7 @@ is_categorical_dtype, is_object_dtype, is_extension_type, + is_extension_array_dtype, is_datetimetz, is_datetime64_any_dtype, is_datetime64tz_dtype, @@ -71,7 +72,7 @@ create_block_manager_from_arrays, create_block_manager_from_blocks) from pandas.core.series import Series -from pandas.core.arrays import Categorical +from pandas.core.arrays import Categorical, ExtensionArray import pandas.core.algorithms as algorithms from pandas.compat import (range, map, zip, lrange, lmap, lzip, StringIO, u, OrderedDict, raise_with_traceback) @@ -511,7 +512,7 @@ def _get_axes(N, K, index=index, columns=columns): index, columns = _get_axes(len(values), 1) return _arrays_to_mgr([values], columns, index, columns, dtype=dtype) - elif is_datetimetz(values): + elif (is_datetimetz(values) or is_extension_array_dtype(values)): # GH19157 if columns is None: columns = [0] @@ -2837,7 +2838,7 @@ def reindexer(value): # now align rows value = reindexer(value).T - elif isinstance(value, Categorical): + elif isinstance(value, ExtensionArray): value = value.copy() elif isinstance(value, Index) or is_sequence(value): @@ -2867,7 +2868,7 @@ def reindexer(value): value = maybe_cast_to_datetime(value, value.dtype) # return internal types directly - if is_extension_type(value): + if is_extension_type(value) or is_extension_array_dtype(value): return value # broadcast across multiple columns if necessary @@ -3404,12 +3405,8 @@ class max type new_obj = self.copy() def _maybe_casted_values(index, labels=None): - if isinstance(index, PeriodIndex): - values = index.astype(object).values - elif isinstance(index, DatetimeIndex) and index.tz is not None: - values = index - else: - values = index.values + values = index._values + if not isinstance(index, (PeriodIndex, DatetimeIndex)): if values.dtype == np.object_: values = lib.maybe_convert_objects(values) @@ -5621,7 +5618,9 @@ def count(self, axis=0, level=None, numeric_only=False): if len(frame._get_axis(axis)) == 0: result = Series(0, index=frame._get_agg_axis(axis)) else: - if frame._is_mixed_type: + if frame._is_mixed_type or frame._data.any_extension_types: + # the or any_extension_types is really only hit for single- + # column frames with an extension array result = notna(frame).sum(axis=axis) else: counts = notna(frame.values).sum(axis=axis) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c343126db0ea1..0813c12d573d5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -13,6 +13,7 @@ from pandas import compat from pandas.core.accessor import CachedAccessor +from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.generic import ( ABCSeries, ABCDataFrame, ABCMultiIndex, @@ -2051,6 +2052,7 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs): if is_categorical_dtype(values.dtype): values = np.array(values) + elif is_object_dtype(values.dtype): values = lib.maybe_convert_objects(values, safe=1) @@ -2652,7 +2654,7 @@ def get_value(self, series, key): # if we have something that is Index-like, then # use this, e.g. 
DatetimeIndex s = getattr(series, '_values', None) - if isinstance(s, Index) and is_scalar(key): + if isinstance(s, (ExtensionArray, Index)) and is_scalar(key): try: return s[key] except (IndexError, ValueError): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index eb3aeda7902fc..2aa490cd02afb 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -618,6 +618,9 @@ def can_do_equal_len(): return if isinstance(value, (ABCSeries, dict)): + # TODO(EA): ExtensionBlock.setitem this causes issues with + # setting for extensionarrays that store dicts. Need to decide + # if it's worth supporting that. value = self._align_series(indexer, Series(value)) elif isinstance(value, ABCDataFrame): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index dd5feefc49fe3..bad0626206e80 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -56,7 +56,11 @@ is_null_datelike_scalar) import pandas.core.dtypes.concat as _concat -from pandas.core.dtypes.generic import ABCSeries, ABCDatetimeIndex +from pandas.core.dtypes.generic import ( + ABCSeries, + ABCDatetimeIndex, + ABCExtensionArray, + ABCIndexClass) import pandas.core.common as com import pandas.core.algorithms as algos @@ -99,6 +103,7 @@ class Block(PandasObject): is_object = False is_categorical = False is_sparse = False + is_extension = False _box_to_block_values = True _can_hold_na = False _can_consolidate = True @@ -1854,11 +1859,40 @@ class ExtensionBlock(NonConsolidatableMixIn, Block): ExtensionArrays are limited to 1-D. """ + is_extension = True + + def __init__(self, values, placement, ndim=None): + values = self._maybe_coerce_values(values) + super(ExtensionBlock, self).__init__(values, placement, ndim) + + def _maybe_coerce_values(self, values): + """Unbox to an extension array. + + This will unbox an ExtensionArray stored in an Index or Series. + ExtensionArrays pass through. No dtype coercion is done. + + Parameters + ---------- + values : Index, Series, ExtensionArray + + Returns + ------- + ExtensionArray + """ + if isinstance(values, (ABCIndexClass, ABCSeries)): + values = values._values + return values + @property def _holder(self): # For extension blocks, the holder is values-dependent. return type(self.values) + @property + def _can_hold_na(self): + # The default ExtensionArray._can_hold_na is True + return self._holder._can_hold_na + @property def is_view(self): """Extension arrays are never treated as views.""" @@ -3451,6 +3485,8 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, else: align_keys = [] + # TODO(EA): may interfere with ExtensionBlock.setitem for blocks + # with a .values attribute. 
aligned_args = dict((k, kwargs[k]) for k in align_keys if hasattr(kwargs[k], 'values')) @@ -3696,6 +3732,11 @@ def is_datelike_mixed_type(self): self._consolidate_inplace() return any(block.is_datelike for block in self.blocks) + @property + def any_extension_types(self): + """Whether any of the blocks in this manager are extension blocks""" + return any(block.is_extension for block in self.blocks) + @property def is_view(self): """ return a boolean if we are a single block and are a view """ @@ -4101,7 +4142,10 @@ def set(self, item, value, check=False): # FIXME: refactor, clearly separate broadcasting & zip-like assignment # can prob also fix the various if tests for sparse/categorical - value_is_extension_type = is_extension_type(value) + # TODO(EA): Remove an is_extension_ when all extension types satisfy + # the interface + value_is_extension_type = (is_extension_type(value) or + is_extension_array_dtype(value)) # categorical/spares/datetimetz if value_is_extension_type: @@ -4833,15 +4877,11 @@ def form_blocks(arrays, names, axes): if len(items_dict['ExtensionBlock']): - external_blocks = [] - for i, _, array in items_dict['ExtensionBlock']: - if isinstance(array, ABCSeries): - array = array.values - # Allow our internal arrays to chose their block type. - block_type = getattr(array, '_block_type', ExtensionBlock) - external_blocks.append( - make_block(array, klass=block_type, - fastpath=True, placement=[i])) + external_blocks = [ + make_block(array, klass=ExtensionBlock, placement=[i]) + for i, _, array in items_dict['ExtensionBlock'] + ] + blocks.extend(external_blocks) if len(extra_locs): @@ -5162,7 +5202,7 @@ def _safe_reshape(arr, new_shape): """ if isinstance(arr, ABCSeries): arr = arr._values - if not isinstance(arr, Categorical): + if not isinstance(arr, ABCExtensionArray): arr = arr.reshape(new_shape) return arr @@ -5673,6 +5713,8 @@ def is_na(self): if not values._null_fill_value and values.sp_index.ngaps > 0: return False values_flat = values.ravel(order='K') + elif isinstance(self.block, ExtensionBlock): + values_flat = values else: values_flat = values.ravel(order='K') total_len = values_flat.shape[0] diff --git a/pandas/core/series.py b/pandas/core/series.py index 6fcd54ecc6118..b42e02bc99237 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -14,12 +14,14 @@ import numpy.ma as ma from pandas.core.accessor import CachedAccessor +from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.common import ( is_categorical_dtype, is_bool, is_integer, is_integer_dtype, is_float_dtype, is_extension_type, + is_extension_array_dtype, is_datetime64tz_dtype, is_timedelta64_dtype, is_list_like, @@ -173,12 +175,17 @@ def __init__(self, data=None, index=None, dtype=None, name=None, raise NotImplementedError("initializing a Series from a " "MultiIndex is not supported") elif isinstance(data, Index): - # need to copy to avoid aliasing issues if name is None: name = data.name - data = data._to_embed(keep_tz=True, dtype=dtype) + if dtype is not None: + # astype copies + data = data.astype(dtype) + else: + # need to copy to avoid aliasing issues + data = data._values.copy() copy = False + elif isinstance(data, np.ndarray): pass elif isinstance(data, Series): @@ -203,13 +210,15 @@ def __init__(self, data=None, index=None, dtype=None, name=None, '`data` argument and a different ' '`index` argument. 
`copy` must ' 'be False.') - elif isinstance(data, Categorical): + + elif is_extension_array_dtype(data) and dtype is not None: # GH12574: Allow dtype=category only, otherwise error - if ((dtype is not None) and - not is_categorical_dtype(dtype)): - raise ValueError("cannot specify a dtype with a " - "Categorical unless " - "dtype='category'") + if not data.dtype.is_dtype(dtype): + raise ValueError("Cannot specify a dtype '{}' with an " + "extension array of a different " + "dtype ('{}').".format(dtype, + data.dtype)) + elif (isinstance(data, types.GeneratorType) or (compat.PY3 and isinstance(data, map))): data = list(data) @@ -2556,8 +2565,7 @@ def _reindex_indexer(self, new_index, indexer, copy): return self.copy() return self - # be subclass-friendly - new_values = algorithms.take_1d(self.get_values(), indexer) + new_values = algorithms.take_1d(self._values, indexer) return self._constructor(new_values, index=new_index) def _needs_reindex_multi(self, axes, method, level): @@ -3113,10 +3121,11 @@ def _sanitize_index(data, index, copy=False): if isinstance(data, ABCIndexClass) and not copy: pass - elif isinstance(data, PeriodIndex): - data = data.astype(object).values - elif isinstance(data, DatetimeIndex): - data = data._to_embed(keep_tz=True) + elif isinstance(data, (PeriodIndex, DatetimeIndex)): + data = data._values + if copy: + data = data.copy() + elif isinstance(data, np.ndarray): # coerce datetimelike types @@ -3156,8 +3165,17 @@ def _try_cast(arr, take_fast_path): subarr = np.array(subarr, dtype=dtype, copy=copy) except (ValueError, TypeError): if is_categorical_dtype(dtype): + # We *do* allow casting to categorical, since we know + # that Categorical is the only array type for 'category'. subarr = Categorical(arr, dtype.categories, ordered=dtype.ordered) + elif is_extension_array_dtype(dtype): + # We don't allow casting to third party dtypes, since we don't + # know what array belongs to which type. + msg = ("Cannot cast data to extension dtype '{}'. " + "Pass the extension array directly.".format(dtype)) + raise ValueError(msg) + elif dtype is not None and raise_cast_failure: raise else: @@ -3189,9 +3207,15 @@ def _try_cast(arr, take_fast_path): # we will try to copy be-definition here subarr = _try_cast(data, True) - elif isinstance(data, Categorical): + elif isinstance(data, ExtensionArray): subarr = data + if dtype is not None and not data.dtype.is_dtype(dtype): + msg = ("Cannot coerce extension array to dtype '{typ}'. " + "Do the coercion before passing to the constructor " + "instead.".format(typ=dtype)) + raise ValueError(msg) + if copy: subarr = data.copy() return subarr diff --git a/pandas/tests/categorical/test_missing.py b/pandas/tests/categorical/test_missing.py index fca5573547071..5133c97d8b590 100644 --- a/pandas/tests/categorical/test_missing.py +++ b/pandas/tests/categorical/test_missing.py @@ -1,10 +1,9 @@ # -*- coding: utf-8 -*- - import numpy as np import pytest import pandas.util.testing as tm -from pandas import (Categorical, Index, isna) +from pandas import Categorical, Index, isna from pandas.compat import lrange from pandas.core.dtypes.dtypes import CategoricalDtype diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py new file mode 100644 index 0000000000000..2273ef1f3e110 --- /dev/null +++ b/pandas/tests/extension/base/__init__.py @@ -0,0 +1,42 @@ +"""Base test suite for extension arrays. 
+
+These tests are intended for third-party libraries to subclass to validate
+that their extension arrays and dtypes satisfy the interface. Moving or
+renaming the tests should not be done lightly.
+
+Libraries are expected to implement a few pytest fixtures to provide data
+for the tests. The fixtures may be located in either
+
+* The same module as your test class.
+* A ``conftest.py`` in the same directory as your test class.
+
+The full list of fixtures may be found in the ``conftest.py`` next to this
+file.
+
+.. code-block:: python
+
+   import pytest
+   from pandas.tests.extension.base import BaseDtypeTests
+
+
+   @pytest.fixture
+   def dtype():
+       return MyDtype()
+
+
+   class TestMyDtype(BaseDtypeTests):
+       pass
+
+
+Your class ``TestMyDtype`` will inherit all the tests defined on
+``BaseDtypeTests``. pytest's fixture discovery will supply your ``dtype``
+wherever the test requires it. You're free to implement additional tests.
+"""
+from .casting import BaseCastingTests  # noqa
+from .constructors import BaseConstructorsTests  # noqa
+from .dtype import BaseDtypeTests  # noqa
+from .getitem import BaseGetitemTests  # noqa
+from .interface import BaseInterfaceTests  # noqa
+from .methods import BaseMethodsTests  # noqa
+from .missing import BaseMissingTests  # noqa
+from .reshaping import BaseReshapingTests  # noqa
diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py
new file mode 100644
index 0000000000000..bcfbf0a247269
--- /dev/null
+++ b/pandas/tests/extension/base/casting.py
@@ -0,0 +1,11 @@
+import pandas as pd
+from pandas.core.internals import ObjectBlock
+
+
+class BaseCastingTests(object):
+    """Casting to and from ExtensionDtypes"""
+
+    def test_astype_object_series(self, all_data):
+        ser = pd.Series({"A": all_data})
+        result = ser.astype(object)
+        assert isinstance(result._data.blocks[0], ObjectBlock)
diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py
new file mode 100644
index 0000000000000..7ad100e6289e9
--- /dev/null
+++ b/pandas/tests/extension/base/constructors.py
@@ -0,0 +1,43 @@
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+from pandas.core.internals import ExtensionBlock
+
+
+class BaseConstructorsTests(object):
+
+    def test_series_constructor(self, data):
+        result = pd.Series(data)
+        assert result.dtype == data.dtype
+        assert len(result) == len(data)
+        assert isinstance(result._data.blocks[0], ExtensionBlock)
+        assert result._data.blocks[0].values is data
+
+        # Series[EA] is unboxed / boxed correctly
+        result2 = pd.Series(result)
+        assert result2.dtype == data.dtype
+        assert isinstance(result2._data.blocks[0], ExtensionBlock)
+
+    @pytest.mark.parametrize("from_series", [True, False])
+    def test_dataframe_constructor_from_dict(self, data, from_series):
+        if from_series:
+            data = pd.Series(data)
+        result = pd.DataFrame({"A": data})
+        assert result.dtypes['A'] == data.dtype
+        assert result.shape == (len(data), 1)
+        assert isinstance(result._data.blocks[0], ExtensionBlock)
+
+    def test_dataframe_from_series(self, data):
+        result = pd.DataFrame(pd.Series(data))
+        assert result.dtypes[0] == data.dtype
+        assert result.shape == (len(data), 1)
+        assert isinstance(result._data.blocks[0], ExtensionBlock)
+
+    @pytest.mark.xfail(reason="GH-19342")
+    def test_series_given_mismatched_index_raises(self, data):
+        msg = 'Wrong number of items passed 3, placement implies 4'
+        with tm.assert_raises_regex(ValueError, None) as m:
+            pd.Series(data[:3], index=[0, 1, 2, 3, 4])
+
+        
assert m.match(msg) diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py new file mode 100644 index 0000000000000..f5015bd469f13 --- /dev/null +++ b/pandas/tests/extension/base/dtype.py @@ -0,0 +1,46 @@ +import numpy as np +import pandas as pd + + +class BaseDtypeTests(object): + """Base class for ExtensionDtype classes""" + + def test_name(self, dtype): + assert isinstance(dtype.name, str) + + def test_kind(self, dtype): + valid = set('biufcmMOSUV') + if dtype.kind is not None: + assert dtype.kind in valid + + def test_construct_from_string_own_name(self, dtype): + result = dtype.construct_from_string(dtype.name) + assert type(result) is type(dtype) + + # check OK as classmethod + result = type(dtype).construct_from_string(dtype.name) + assert type(result) is type(dtype) + + def test_is_dtype_from_name(self, dtype): + result = type(dtype).is_dtype(dtype.name) + assert result is True + + def test_is_dtype_unboxes_dtype(self, data, dtype): + assert dtype.is_dtype(data) is True + + def test_is_dtype_from_self(self, dtype): + result = type(dtype).is_dtype(dtype) + assert result is True + + def test_is_not_string_type(self, dtype): + return not pd.api.types.is_string_dtype(dtype) + + def test_is_not_object_type(self, dtype): + return not pd.api.types.is_object_dtype(dtype) + + def test_eq_with_str(self, dtype): + assert dtype == dtype.name + assert dtype != dtype.name + '-suffix' + + def test_eq_with_numpy_object(self, dtype): + assert dtype != np.dtype('object') diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py new file mode 100644 index 0000000000000..f43971e928cac --- /dev/null +++ b/pandas/tests/extension/base/getitem.py @@ -0,0 +1,119 @@ +import numpy as np + +import pandas as pd +import pandas.util.testing as tm + + +class BaseGetitemTests(object): + """Tests for ExtensionArray.__getitem__.""" + + def test_iloc_series(self, data): + ser = pd.Series(data) + result = ser.iloc[:4] + expected = pd.Series(data[:4]) + tm.assert_series_equal(result, expected) + + result = ser.iloc[[0, 1, 2, 3]] + tm.assert_series_equal(result, expected) + + def test_iloc_frame(self, data): + df = pd.DataFrame({"A": data, 'B': np.arange(len(data))}) + expected = pd.DataFrame({"A": data[:4]}) + + # slice -> frame + result = df.iloc[:4, [0]] + tm.assert_frame_equal(result, expected) + + # sequence -> frame + result = df.iloc[[0, 1, 2, 3], [0]] + tm.assert_frame_equal(result, expected) + + expected = pd.Series(data[:4], name='A') + + # slice -> series + result = df.iloc[:4, 0] + tm.assert_series_equal(result, expected) + + # sequence -> series + result = df.iloc[:4, 0] + tm.assert_series_equal(result, expected) + + def test_loc_series(self, data): + ser = pd.Series(data) + result = ser.loc[:3] + expected = pd.Series(data[:4]) + tm.assert_series_equal(result, expected) + + result = ser.loc[[0, 1, 2, 3]] + tm.assert_series_equal(result, expected) + + def test_loc_frame(self, data): + df = pd.DataFrame({"A": data, 'B': np.arange(len(data))}) + expected = pd.DataFrame({"A": data[:4]}) + + # slice -> frame + result = df.loc[:3, ['A']] + tm.assert_frame_equal(result, expected) + + # sequence -> frame + result = df.loc[[0, 1, 2, 3], ['A']] + tm.assert_frame_equal(result, expected) + + expected = pd.Series(data[:4], name='A') + + # slice -> series + result = df.loc[:3, 'A'] + tm.assert_series_equal(result, expected) + + # sequence -> series + result = df.loc[:3, 'A'] + tm.assert_series_equal(result, expected) + + def test_getitem_scalar(self, 
data): + result = data[0] + assert isinstance(result, data.dtype.type) + + result = pd.Series(data)[0] + assert isinstance(result, data.dtype.type) + + def test_getitem_scalar_na(self, data_missing, na_cmp, na_value): + result = data_missing[0] + assert na_cmp(result, na_value) + + def test_getitem_mask(self, data): + # Empty mask, raw array + mask = np.zeros(len(data), dtype=bool) + result = data[mask] + assert len(result) == 0 + assert isinstance(result, type(data)) + + # Empty mask, in series + mask = np.zeros(len(data), dtype=bool) + result = pd.Series(data)[mask] + assert len(result) == 0 + assert result.dtype == data.dtype + + # non-empty mask, raw array + mask[0] = True + result = data[mask] + assert len(result) == 1 + assert isinstance(result, type(data)) + + # non-empty mask, in series + result = pd.Series(data)[mask] + assert len(result) == 1 + assert result.dtype == data.dtype + + def test_getitem_slice(self, data): + # getitem[slice] should return an array + result = data[slice(0)] # empty + assert isinstance(result, type(data)) + + result = data[slice(1)] # scalar + assert isinstance(result, type(data)) + + def test_take_sequence(self, data): + result = pd.Series(data)[[0, 1, 3]] + assert result.iloc[0] == data[0] + assert result.iloc[1] == data[1] + assert result.iloc[2] == data[3] diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py new file mode 100644 index 0000000000000..8f17131a9482b --- /dev/null +++ b/pandas/tests/extension/base/interface.py @@ -0,0 +1,53 @@ +import numpy as np + +import pandas as pd +from pandas.compat import StringIO +from pandas.core.dtypes.common import is_extension_array_dtype +from pandas.core.dtypes.dtypes import ExtensionDtype + + +class BaseInterfaceTests(object): + """Tests that the basic interface is satisfied.""" + # ------------------------------------------------------------------------ + # Interface + # ------------------------------------------------------------------------ + + def test_len(self, data): + assert len(data) == 100 + + def test_ndim(self, data): + assert data.ndim == 1 + + def test_can_hold_na_valid(self, data): + assert data._can_hold_na in {True, False} + + def test_memory_usage(self, data): + s = pd.Series(data) + result = s.memory_usage(index=False) + assert result == s.nbytes + + def test_array_interface(self, data): + result = np.array(data) + assert result[0] == data[0] + + def test_as_ndarray_with_dtype_kind(self, data): + np.array(data, dtype=data.dtype.kind) + + def test_repr(self, data): + ser = pd.Series(data) + assert data.dtype.name in repr(ser) + + df = pd.DataFrame({"A": data}) + repr(df) + + def test_dtype_name_in_info(self, data): + buf = StringIO() + pd.DataFrame({"A": data}).info(buf=buf) + result = buf.getvalue() + assert data.dtype.name in result + + def test_is_extension_array_dtype(self, data): + assert is_extension_array_dtype(data) + assert is_extension_array_dtype(data.dtype) + assert is_extension_array_dtype(pd.Series(data)) + assert isinstance(data.dtype, ExtensionDtype) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py new file mode 100644 index 0000000000000..c77811ca63926 --- /dev/null +++ b/pandas/tests/extension/base/methods.py @@ -0,0 +1,32 @@ +import pytest +import numpy as np + +import pandas as pd +import pandas.util.testing as tm + + +class BaseMethodsTests(object): + """Various Series and DataFrame methods.""" + + @pytest.mark.parametrize('dropna', [True, False]) + def test_value_counts(self, 
all_data, dropna): + all_data = all_data[:10] + if dropna: + other = np.array(all_data[~all_data.isna()]) + else: + other = all_data + + result = pd.Series(all_data).value_counts(dropna=dropna).sort_index() + expected = pd.Series(other).value_counts(dropna=dropna).sort_index() + + tm.assert_series_equal(result, expected) + + def test_count(self, data_missing): + df = pd.DataFrame({"A": data_missing}) + result = df.count(axis='columns') + expected = pd.Series([0, 1]) + tm.assert_series_equal(result, expected) + + def test_apply_simple_series(self, data): + result = pd.Series(data).apply(id) + assert isinstance(result, pd.Series) diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py new file mode 100644 index 0000000000000..1d6f2eea1f1f9 --- /dev/null +++ b/pandas/tests/extension/base/missing.py @@ -0,0 +1,45 @@ +import numpy as np + +import pandas as pd +import pandas.util.testing as tm + + +class BaseMissingTests(object): + def test_isna(self, data_missing): + if data_missing._can_hold_na: + expected = np.array([True, False]) + else: + expected = np.array([False, False]) + + result = pd.isna(data_missing) + tm.assert_numpy_array_equal(result, expected) + + result = pd.Series(data_missing).isna() + expected = pd.Series(expected) + tm.assert_series_equal(result, expected) + + def test_dropna_series(self, data_missing): + ser = pd.Series(data_missing) + result = ser.dropna() + expected = ser.iloc[[1]] + tm.assert_series_equal(result, expected) + + def test_dropna_frame(self, data_missing): + df = pd.DataFrame({"A": data_missing}) + + # defaults + result = df.dropna() + expected = df.iloc[[1]] + tm.assert_frame_equal(result, expected) + + # axis = 1 + result = df.dropna(axis='columns') + expected = pd.DataFrame(index=[0, 1]) + tm.assert_frame_equal(result, expected) + + # multiple + df = pd.DataFrame({"A": data_missing, + "B": [1, np.nan]}) + result = df.dropna() + expected = df.iloc[:0] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py new file mode 100644 index 0000000000000..d8f577c6fa50d --- /dev/null +++ b/pandas/tests/extension/base/reshaping.py @@ -0,0 +1,61 @@ +import pytest + +import pandas as pd +import pandas.util.testing as tm +from pandas.core.internals import ExtensionBlock + + +class BaseReshapingTests(object): + """Tests for reshaping and concatenation.""" + @pytest.mark.parametrize('in_frame', [True, False]) + def test_concat(self, data, in_frame): + wrapped = pd.Series(data) + if in_frame: + wrapped = pd.DataFrame(wrapped) + result = pd.concat([wrapped, wrapped], ignore_index=True) + + assert len(result) == len(data) * 2 + + if in_frame: + dtype = result.dtypes[0] + else: + dtype = result.dtype + + assert dtype == data.dtype + assert isinstance(result._data.blocks[0], ExtensionBlock) + + def test_align(self, data, na_value): + a = data[:3] + b = data[2:5] + r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) + + # Assumes that the ctor can take a list of scalars of the type + e1 = pd.Series(type(data)(list(a) + [na_value])) + e2 = pd.Series(type(data)([na_value] + list(b))) + tm.assert_series_equal(r1, e1) + tm.assert_series_equal(r2, e2) + + def test_align_frame(self, data, na_value): + a = data[:3] + b = data[2:5] + r1, r2 = pd.DataFrame({'A': a}).align( + pd.DataFrame({'A': b}, index=[1, 2, 3]) + ) + + # Assumes that the ctor can take a list of scalars of the type + e1 = pd.DataFrame({'A': type(data)(list(a) + [na_value])}) + e2 = 
pd.DataFrame({'A': type(data)([na_value] + list(b))})
+        tm.assert_frame_equal(r1, e1)
+        tm.assert_frame_equal(r2, e2)
+
+    def test_set_frame_expand_regular_with_extension(self, data):
+        df = pd.DataFrame({"A": [1] * len(data)})
+        df['B'] = data
+        expected = pd.DataFrame({"A": [1] * len(data), "B": data})
+        tm.assert_frame_equal(df, expected)
+
+    def test_set_frame_expand_extension_with_regular(self, data):
+        df = pd.DataFrame({'A': data})
+        df['B'] = [1] * len(data)
+        expected = pd.DataFrame({"A": data, "B": [1] * len(data)})
+        tm.assert_frame_equal(df, expected)
diff --git a/pandas/tests/extension/category/__init__.py b/pandas/tests/extension/category/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py
new file mode 100644
index 0000000000000..ec548fca6d901
--- /dev/null
+++ b/pandas/tests/extension/category/test_categorical.py
@@ -0,0 +1,84 @@
+import string
+
+import pytest
+import numpy as np
+
+from pandas.api.types import CategoricalDtype
+from pandas import Categorical
+from pandas.tests.extension import base
+
+
+def make_data():
+    return np.random.choice(list(string.ascii_letters), size=100)
+
+
+@pytest.fixture
+def dtype():
+    return CategoricalDtype()
+
+
+@pytest.fixture
+def data():
+    """Length-100 Categorical for semantics test."""
+    return Categorical(make_data())
+
+
+@pytest.fixture
+def data_missing():
+    """Length 2 array with [NA, Valid]"""
+    return Categorical([np.nan, 'A'])
+
+
+@pytest.fixture
+def na_value():
+    return np.nan
+
+
+class TestDtype(base.BaseDtypeTests):
+    pass
+
+
+class TestInterface(base.BaseInterfaceTests):
+    @pytest.mark.skip(reason="Memory usage doesn't match")
+    def test_memory_usage(self):
+        # Is this deliberate?
+        pass
+
+
+class TestConstructors(base.BaseConstructorsTests):
+    pass
+
+
+class TestReshaping(base.BaseReshapingTests):
+    @pytest.mark.skip(reason="Unobserved categories preserved in concat.")
+    def test_align(self, data, na_value):
+        pass
+
+    @pytest.mark.skip(reason="Unobserved categories preserved in concat.")
+    def test_align_frame(self, data, na_value):
+        pass
+
+
+class TestGetitem(base.BaseGetitemTests):
+    @pytest.mark.skip(reason="Backwards compatibility")
+    def test_getitem_scalar(self):
+        # CategoricalDtype.type isn't "correct" since it should
+        # be a parent of the elements (object). But don't want
+        # to break things by changing.
+        pass
+
+
+class TestMissing(base.BaseMissingTests):
+    pass
+
+
+class TestMethods(base.BaseMethodsTests):
+    pass
+
+    @pytest.mark.skip(reason="Unobserved categories included")
+    def test_value_counts(self, all_data, dropna):
+        pass
+
+
+class TestCasting(base.BaseCastingTests):
+    pass
diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py
new file mode 100644
index 0000000000000..f86849b9cbd61
--- /dev/null
+++ b/pandas/tests/extension/conftest.py
@@ -0,0 +1,48 @@
+import operator
+
+import pytest
+
+
+@pytest.fixture
+def dtype():
+    """A fixture providing the ExtensionDtype to validate."""
+    raise NotImplementedError
+
+
+@pytest.fixture
+def data():
+    """Length-100 array for this type."""
+    raise NotImplementedError
+
+
+@pytest.fixture
+def data_missing():
+    """Length-2 array with [NA, Valid]"""
+    raise NotImplementedError
+
+
+@pytest.fixture(params=['data', 'data_missing'])
+def all_data(request, data, data_missing):
+    """Parametrized fixture giving 'data' and 'data_missing'"""
+    if request.param == 'data':
+        return data
+    elif request.param == 'data_missing':
+        return data_missing
+
+
+@pytest.fixture
+def na_cmp():
+    """Binary operator for comparing NA values.
+
+    Should return a function of two arguments that returns
+    True if both arguments are (scalar) NA for your type.
+
+    By default, uses ``operator.is_``
+    """
+    return operator.is_
+
+
+@pytest.fixture
+def na_value():
+    """The scalar missing value for this type. Default 'None'"""
+    return None
diff --git a/pandas/tests/extension/decimal/__init__.py b/pandas/tests/extension/decimal/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py
new file mode 100644
index 0000000000000..f526ac5996a10
--- /dev/null
+++ b/pandas/tests/extension/decimal/array.py
@@ -0,0 +1,86 @@
+import decimal
+import numbers
+import random
+import sys
+
+import numpy as np
+
+import pandas as pd
+from pandas.core.arrays import ExtensionArray
+from pandas.core.dtypes.base import ExtensionDtype
+
+
+class DecimalDtype(ExtensionDtype):
+    type = decimal.Decimal
+    name = 'decimal'
+
+    @classmethod
+    def construct_from_string(cls, string):
+        if string == cls.name:
+            return cls()
+        else:
+            raise TypeError("Cannot construct a '{}' from "
+                            "'{}'".format(cls, string))
+
+
+class DecimalArray(ExtensionArray):
+    dtype = DecimalDtype()
+
+    def __init__(self, values):
+        values = np.asarray(values, dtype=object)
+
+        self.values = values
+
+    def __getitem__(self, item):
+        if isinstance(item, numbers.Integral):
+            return self.values[item]
+        else:
+            return type(self)(self.values[item])
+
+    def copy(self, deep=False):
+        if deep:
+            return type(self)(self.values.copy())
+        return type(self)(self)
+
+    def __setitem__(self, key, value):
+        if pd.api.types.is_list_like(value):
+            value = [decimal.Decimal(v) for v in value]
+        else:
+            value = decimal.Decimal(value)
+        self.values[key] = value
+
+    def __len__(self):
+        return len(self.values)
+
+    def __repr__(self):
+        return repr(self.values)
+
+    @property
+    def nbytes(self):
+        n = len(self)
+        if n:
+            return n * sys.getsizeof(self[0])
+        return 0
+
+    def isna(self):
+        return np.array([x.is_nan() for x in self.values])
+
+    def take(self, indexer, allow_fill=True, fill_value=None):
+        mask = indexer == -1
+
+        out = self.values.take(indexer)
+        out[mask] = self._na_value
+
+        return type(self)(out)
+
+    @property
+    def _na_value(self):
+        return decimal.Decimal('NaN')
+
+    @classmethod
+    def 
_concat_same_type(cls, to_concat): + return cls(np.concatenate([x.values for x in to_concat])) + + +def make_data(): + return [decimal.Decimal(random.random()) for _ in range(100)] diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py new file mode 100644 index 0000000000000..7b4d079ecad87 --- /dev/null +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -0,0 +1,154 @@ +import decimal + +import numpy as np +import pandas as pd +import pandas.util.testing as tm +import pytest + +from pandas.tests.extension import base + +from .array import DecimalDtype, DecimalArray, make_data + + +@pytest.fixture +def dtype(): + return DecimalDtype() + + +@pytest.fixture +def data(): + return DecimalArray(make_data()) + + +@pytest.fixture +def data_missing(): + return DecimalArray([decimal.Decimal('NaN'), decimal.Decimal(1)]) + + +@pytest.fixture +def na_cmp(): + return lambda x, y: x.is_nan() and y.is_nan() + + +@pytest.fixture +def na_value(): + return decimal.Decimal("NaN") + + +class TestDtype(base.BaseDtypeTests): + pass + + +class TestInterface(base.BaseInterfaceTests): + pass + + +class TestConstructors(base.BaseConstructorsTests): + pass + + +class TestReshaping(base.BaseReshapingTests): + + def test_align(self, data, na_value): + # Have to override since assert_series_equal doesn't + # compare Decimal(NaN) properly. + a = data[:3] + b = data[2:5] + r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) + + # NaN handling + e1 = pd.Series(type(data)(list(a) + [na_value])) + e2 = pd.Series(type(data)([na_value] + list(b))) + tm.assert_series_equal(r1.iloc[:3], e1.iloc[:3]) + assert r1[3].is_nan() + assert e1[3].is_nan() + + tm.assert_series_equal(r2.iloc[1:], e2.iloc[1:]) + assert r2[0].is_nan() + assert e2[0].is_nan() + + def test_align_frame(self, data, na_value): + # Override for Decimal(NaN) comparison + a = data[:3] + b = data[2:5] + r1, r2 = pd.DataFrame({'A': a}).align( + pd.DataFrame({'A': b}, index=[1, 2, 3]) + ) + + # Assumes that the ctor can take a list of scalars of the type + e1 = pd.DataFrame({'A': type(data)(list(a) + [na_value])}) + e2 = pd.DataFrame({'A': type(data)([na_value] + list(b))}) + + tm.assert_frame_equal(r1.iloc[:3], e1.iloc[:3]) + assert r1.loc[3, 'A'].is_nan() + assert e1.loc[3, 'A'].is_nan() + + tm.assert_frame_equal(r2.iloc[1:], e2.iloc[1:]) + assert r2.loc[0, 'A'].is_nan() + assert e2.loc[0, 'A'].is_nan() + + +class TestGetitem(base.BaseGetitemTests): + pass + + +class TestMissing(base.BaseMissingTests): + pass + + +class TestMethods(base.BaseMethodsTests): + @pytest.mark.parametrize('dropna', [True, False]) + @pytest.mark.xfail(reason="value_counts not implemented yet.") + def test_value_counts(self, all_data, dropna): + all_data = all_data[:10] + if dropna: + other = np.array(all_data[~all_data.isna()]) + else: + other = all_data + + result = pd.Series(all_data).value_counts(dropna=dropna).sort_index() + expected = pd.Series(other).value_counts(dropna=dropna).sort_index() + + tm.assert_series_equal(result, expected) + + +class TestCasting(base.BaseCastingTests): + pass + + +def test_series_constructor_coerce_data_to_extension_dtype_raises(): + xpr = ("Cannot cast data to extension dtype 'decimal'. 
Pass the " + "extension array directly.") + with tm.assert_raises_regex(ValueError, xpr): + pd.Series([0, 1, 2], dtype=DecimalDtype()) + + +def test_series_constructor_with_same_dtype_ok(): + arr = DecimalArray([decimal.Decimal('10.0')]) + result = pd.Series(arr, dtype=DecimalDtype()) + expected = pd.Series(arr) + tm.assert_series_equal(result, expected) + + +def test_series_constructor_coerce_extension_array_to_dtype_raises(): + arr = DecimalArray([decimal.Decimal('10.0')]) + xpr = "Cannot specify a dtype 'int64' .* \('decimal'\)." + + with tm.assert_raises_regex(ValueError, xpr): + pd.Series(arr, dtype='int64') + + +def test_dataframe_constructor_with_same_dtype_ok(): + arr = DecimalArray([decimal.Decimal('10.0')]) + + result = pd.DataFrame({"A": arr}, dtype=DecimalDtype()) + expected = pd.DataFrame({"A": arr}) + tm.assert_frame_equal(result, expected) + + +def test_dataframe_constructor_with_different_dtype_raises(): + arr = DecimalArray([decimal.Decimal('10.0')]) + + xpr = "Cannot coerce extension array to dtype 'int64'. " + with tm.assert_raises_regex(ValueError, xpr): + pd.DataFrame({"A": arr}, dtype='int64') diff --git a/pandas/tests/extension/json/__init__.py b/pandas/tests/extension/json/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py new file mode 100644 index 0000000000000..90aac93c68f64 --- /dev/null +++ b/pandas/tests/extension/json/array.py @@ -0,0 +1,99 @@ +import collections +import itertools +import numbers +import random +import string +import sys + +import numpy as np + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.arrays import ExtensionArray + + +class JSONDtype(ExtensionDtype): + type = collections.Mapping + name = 'json' + + @classmethod + def construct_from_string(cls, string): + if string == cls.name: + return cls() + else: + raise TypeError("Cannot construct a '{}' from " + "'{}'".format(cls, string)) + + +class JSONArray(ExtensionArray): + dtype = JSONDtype() + + def __init__(self, values): + for val in values: + if not isinstance(val, self.dtype.type): + raise TypeError + self.data = values + + def __getitem__(self, item): + if isinstance(item, numbers.Integral): + return self.data[item] + elif isinstance(item, np.ndarray) and item.dtype == 'bool': + return type(self)([x for x, m in zip(self, item) if m]) + else: + return type(self)(self.data[item]) + + def __setitem__(self, key, value): + if isinstance(key, numbers.Integral): + self.data[key] = value + else: + if not isinstance(value, (type(self), + collections.Sequence)): + # broadcast value + value = itertools.cycle([value]) + + if isinstance(key, np.ndarray) and key.dtype == 'bool': + # masking + for i, (k, v) in enumerate(zip(key, value)): + if k: + assert isinstance(v, self.dtype.type) + self.data[i] = v + else: + for k, v in zip(key, value): + assert isinstance(v, self.dtype.type) + self.data[k] = v + + def __len__(self): + return len(self.data) + + def __repr__(self): + return 'JSONArary({!r})'.format(self.data) + + @property + def nbytes(self): + return sys.getsizeof(self.data) + + def isna(self): + return np.array([x == self._na_value for x in self.data]) + + def take(self, indexer, allow_fill=True, fill_value=None): + output = [self.data[loc] if loc != -1 else self._na_value + for loc in indexer] + return type(self)(output) + + def copy(self, deep=False): + return type(self)(self.data[:]) + + @property + def _na_value(self): + return {} + + @classmethod + def 
_concat_same_type(cls, to_concat): + data = list(itertools.chain.from_iterable([x.data for x in to_concat])) + return cls(data) + + +def make_data(): + # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer + return [collections.UserDict([ + (random.choice(string.ascii_letters), random.randint(0, 100)) + for _ in range(random.randint(0, 10))]) for _ in range(100)] diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py new file mode 100644 index 0000000000000..e0721bb1d8d1a --- /dev/null +++ b/pandas/tests/extension/json/test_json.py @@ -0,0 +1,73 @@ +import operator +import sys + +import pytest + + +from pandas.tests.extension import base + +from .array import JSONArray, JSONDtype, make_data + +pytestmark = pytest.mark.skipif(sys.version_info[0] == 2, + reason="Py2 doesn't have a UserDict") + + +@pytest.fixture +def dtype(): + return JSONDtype() + + +@pytest.fixture +def data(): + """Length-100 PeriodArray for semantics test.""" + return JSONArray(make_data()) + + +@pytest.fixture +def data_missing(): + """Length 2 array with [NA, Valid]""" + return JSONArray([{}, {'a': 10}]) + + +@pytest.fixture +def na_value(): + return {} + + +@pytest.fixture +def na_cmp(): + return operator.eq + + +class TestDtype(base.BaseDtypeTests): + pass + + +class TestInterface(base.BaseInterfaceTests): + pass + + +class TestConstructors(base.BaseConstructorsTests): + pass + + +class TestReshaping(base.BaseReshapingTests): + pass + + +class TestGetitem(base.BaseGetitemTests): + pass + + +class TestMissing(base.BaseMissingTests): + pass + + +class TestMethods(base.BaseMethodsTests): + @pytest.mark.skip(reason="Unhashable") + def test_value_counts(self, all_data, dropna): + pass + + +class TestCasting(base.BaseCastingTests): + pass diff --git a/pandas/tests/internals/test_external_block.py b/pandas/tests/extension/test_external_block.py similarity index 94% rename from pandas/tests/internals/test_external_block.py rename to pandas/tests/extension/test_external_block.py index 2487363df8f99..991da41168aa0 100644 --- a/pandas/tests/internals/test_external_block.py +++ b/pandas/tests/extension/test_external_block.py @@ -5,12 +5,12 @@ import pandas as pd from pandas.core.internals import ( - BlockManager, SingleBlockManager, ExtensionBlock) + BlockManager, SingleBlockManager, NonConsolidatableMixIn, Block) import pytest -class CustomBlock(ExtensionBlock): +class CustomBlock(NonConsolidatableMixIn, Block): _holder = np.ndarray From ce77b79b97d7e827f50ebd0b7ca292fc2d966cc7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 23 Feb 2018 04:35:07 -0800 Subject: [PATCH 182/214] Separate TimedeltaIndex mul/div tests (#19848) --- .../tests/indexes/datetimes/test_datetime.py | 108 +--- .../tests/indexes/datetimes/test_indexing.py | 570 +++++++++++------- pandas/tests/indexes/period/test_indexing.py | 84 ++- pandas/tests/indexes/period/test_period.py | 37 +- .../indexes/timedeltas/test_arithmetic.py | 296 ++++----- .../tests/indexes/timedeltas/test_indexing.py | 229 +++---- 6 files changed, 677 insertions(+), 647 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 2cf33644377ab..b685584a29fb9 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -2,7 +2,7 @@ import pytest import numpy as np -from datetime import date, timedelta, time +from datetime import date import dateutil import pandas as pd @@ -18,112 +18,6 @@ class 
TestDatetimeIndex(object): - def test_get_loc(self): - idx = pd.date_range('2000-01-01', periods=3) - - for method in [None, 'pad', 'backfill', 'nearest']: - assert idx.get_loc(idx[1], method) == 1 - assert idx.get_loc(idx[1].to_pydatetime(), method) == 1 - assert idx.get_loc(str(idx[1]), method) == 1 - - if method is not None: - assert idx.get_loc(idx[1], method, - tolerance=pd.Timedelta('0 days')) == 1 - - assert idx.get_loc('2000-01-01', method='nearest') == 0 - assert idx.get_loc('2000-01-01T12', method='nearest') == 1 - - assert idx.get_loc('2000-01-01T12', method='nearest', - tolerance='1 day') == 1 - assert idx.get_loc('2000-01-01T12', method='nearest', - tolerance=pd.Timedelta('1D')) == 1 - assert idx.get_loc('2000-01-01T12', method='nearest', - tolerance=np.timedelta64(1, 'D')) == 1 - assert idx.get_loc('2000-01-01T12', method='nearest', - tolerance=timedelta(1)) == 1 - with tm.assert_raises_regex(ValueError, - 'unit abbreviation w/o a number'): - idx.get_loc('2000-01-01T12', method='nearest', tolerance='foo') - with pytest.raises(KeyError): - idx.get_loc('2000-01-01T03', method='nearest', tolerance='2 hours') - with pytest.raises( - ValueError, - match='tolerance size must match target index size'): - idx.get_loc('2000-01-01', method='nearest', - tolerance=[pd.Timedelta('1day').to_timedelta64(), - pd.Timedelta('1day').to_timedelta64()]) - - assert idx.get_loc('2000', method='nearest') == slice(0, 3) - assert idx.get_loc('2000-01', method='nearest') == slice(0, 3) - - assert idx.get_loc('1999', method='nearest') == 0 - assert idx.get_loc('2001', method='nearest') == 2 - - with pytest.raises(KeyError): - idx.get_loc('1999', method='pad') - with pytest.raises(KeyError): - idx.get_loc('2001', method='backfill') - - with pytest.raises(KeyError): - idx.get_loc('foobar') - with pytest.raises(TypeError): - idx.get_loc(slice(2)) - - idx = pd.to_datetime(['2000-01-01', '2000-01-04']) - assert idx.get_loc('2000-01-02', method='nearest') == 0 - assert idx.get_loc('2000-01-03', method='nearest') == 1 - assert idx.get_loc('2000-01', method='nearest') == slice(0, 2) - - # time indexing - idx = pd.date_range('2000-01-01', periods=24, freq='H') - tm.assert_numpy_array_equal(idx.get_loc(time(12)), - np.array([12]), check_dtype=False) - tm.assert_numpy_array_equal(idx.get_loc(time(12, 30)), - np.array([]), check_dtype=False) - with pytest.raises(NotImplementedError): - idx.get_loc(time(12, 30), method='pad') - - def test_get_indexer(self): - idx = pd.date_range('2000-01-01', periods=3) - exp = np.array([0, 1, 2], dtype=np.intp) - tm.assert_numpy_array_equal(idx.get_indexer(idx), exp) - - target = idx[0] + pd.to_timedelta(['-1 hour', '12 hours', - '1 day 1 hour']) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), - np.array([-1, 0, 1], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), - np.array([0, 1, 2], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), - np.array([0, 1, 1], dtype=np.intp)) - tm.assert_numpy_array_equal( - idx.get_indexer(target, 'nearest', - tolerance=pd.Timedelta('1 hour')), - np.array([0, -1, 1], dtype=np.intp)) - tol_raw = [pd.Timedelta('1 hour'), - pd.Timedelta('1 hour'), - pd.Timedelta('1 hour').to_timedelta64(), ] - tm.assert_numpy_array_equal( - idx.get_indexer(target, 'nearest', - tolerance=[np.timedelta64(x) for x in tol_raw]), - np.array([0, -1, 1], dtype=np.intp)) - tol_bad = [pd.Timedelta('2 hour').to_timedelta64(), - pd.Timedelta('1 hour').to_timedelta64(), - 'foo', ] - with pytest.raises( - 
ValueError, match='abbreviation w/o a number'): - idx.get_indexer(target, 'nearest', tolerance=tol_bad) - with pytest.raises(ValueError): - idx.get_indexer(idx[[0]], method='nearest', tolerance='foo') - - def test_reasonable_keyerror(self): - # GH #1062 - index = DatetimeIndex(['1/3/2000']) - try: - index.get_loc('1/1/2000') - except KeyError as e: - assert '2000' in str(e) - def test_roundtrip_pickle_with_tz(self): # GH 8367 diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index a9f1a5e608ac7..af65a8618d30f 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import datetime, timedelta, time import pytest import pytz @@ -12,10 +12,93 @@ START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) -class TestDatetimeIndex(object): +class TestGetItem(object): + def test_getitem(self): + idx1 = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') + idx2 = pd.date_range('2011-01-01', '2011-01-31', freq='D', + tz='Asia/Tokyo', name='idx') - def test_where_other(self): + for idx in [idx1, idx2]: + result = idx[0] + assert result == Timestamp('2011-01-01', tz=idx.tz) + + result = idx[0:5] + expected = pd.date_range('2011-01-01', '2011-01-05', freq='D', + tz=idx.tz, name='idx') + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx[0:10:2] + expected = pd.date_range('2011-01-01', '2011-01-09', freq='2D', + tz=idx.tz, name='idx') + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx[-20:-5:3] + expected = pd.date_range('2011-01-12', '2011-01-24', freq='3D', + tz=idx.tz, name='idx') + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx[4::-1] + expected = DatetimeIndex(['2011-01-05', '2011-01-04', '2011-01-03', + '2011-01-02', '2011-01-01'], + freq='-1D', tz=idx.tz, name='idx') + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + def test_dti_business_getitem(self): + rng = pd.bdate_range(START, END) + smaller = rng[:5] + exp = DatetimeIndex(rng.view(np.ndarray)[:5]) + tm.assert_index_equal(smaller, exp) + + assert smaller.offset == rng.offset + + sliced = rng[::5] + assert sliced.offset == BDay() * 5 + fancy_indexed = rng[[4, 3, 2, 1, 0]] + assert len(fancy_indexed) == 5 + assert isinstance(fancy_indexed, DatetimeIndex) + assert fancy_indexed.freq is None + + # 32-bit vs. 64-bit platforms + assert rng[4] == rng[np.int_(4)] + + def test_dti_business_getitem_matplotlib_hackaround(self): + rng = pd.bdate_range(START, END) + values = rng[:, None] + expected = rng.values[:, None] + tm.assert_numpy_array_equal(values, expected) + + def test_dti_custom_getitem(self): + rng = pd.bdate_range(START, END, freq='C') + smaller = rng[:5] + exp = DatetimeIndex(rng.view(np.ndarray)[:5]) + tm.assert_index_equal(smaller, exp) + assert smaller.offset == rng.offset + + sliced = rng[::5] + assert sliced.offset == CDay() * 5 + + fancy_indexed = rng[[4, 3, 2, 1, 0]] + assert len(fancy_indexed) == 5 + assert isinstance(fancy_indexed, DatetimeIndex) + assert fancy_indexed.freq is None + + # 32-bit vs. 
64-bit platforms + assert rng[4] == rng[np.int_(4)] + + def test_dti_custom_getitem_matplotlib_hackaround(self): + rng = pd.bdate_range(START, END, freq='C') + values = rng[:, None] + expected = rng.values[:, None] + tm.assert_numpy_array_equal(values, expected) + + +class TestWhere(object): + def test_where_other(self): # other is ndarray or Index i = pd.date_range('20130101', periods=3, tz='US/Eastern') @@ -46,6 +129,152 @@ def test_where_tz(self): expected = i2 tm.assert_index_equal(result, expected) + +class TestTake(object): + def test_take(self): + # GH#10295 + idx1 = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') + idx2 = pd.date_range('2011-01-01', '2011-01-31', freq='D', + tz='Asia/Tokyo', name='idx') + + for idx in [idx1, idx2]: + result = idx.take([0]) + assert result == Timestamp('2011-01-01', tz=idx.tz) + + result = idx.take([0, 1, 2]) + expected = pd.date_range('2011-01-01', '2011-01-03', freq='D', + tz=idx.tz, name='idx') + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx.take([0, 2, 4]) + expected = pd.date_range('2011-01-01', '2011-01-05', freq='2D', + tz=idx.tz, name='idx') + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx.take([7, 4, 1]) + expected = pd.date_range('2011-01-08', '2011-01-02', freq='-3D', + tz=idx.tz, name='idx') + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx.take([3, 2, 5]) + expected = DatetimeIndex(['2011-01-04', '2011-01-03', + '2011-01-06'], + freq=None, tz=idx.tz, name='idx') + tm.assert_index_equal(result, expected) + assert result.freq is None + + result = idx.take([-3, 2, 5]) + expected = DatetimeIndex(['2011-01-29', '2011-01-03', + '2011-01-06'], + freq=None, tz=idx.tz, name='idx') + tm.assert_index_equal(result, expected) + assert result.freq is None + + def test_take_invalid_kwargs(self): + idx = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') + indices = [1, 6, 5, 9, 10, 13, 15, 3] + + msg = r"take\(\) got an unexpected keyword argument 'foo'" + tm.assert_raises_regex(TypeError, msg, idx.take, + indices, foo=2) + + msg = "the 'out' parameter is not supported" + tm.assert_raises_regex(ValueError, msg, idx.take, + indices, out=indices) + + msg = "the 'mode' parameter is not supported" + tm.assert_raises_regex(ValueError, msg, idx.take, + indices, mode='clip') + + # TODO: This method came from test_datetime; de-dup with version above + @pytest.mark.parametrize('tz', [None, 'US/Eastern', 'Asia/Tokyo']) + def test_take2(self, tz): + dates = [datetime(2010, 1, 1, 14), datetime(2010, 1, 1, 15), + datetime(2010, 1, 1, 17), datetime(2010, 1, 1, 21)] + + idx = DatetimeIndex(start='2010-01-01 09:00', + end='2010-02-01 09:00', freq='H', tz=tz, + name='idx') + expected = DatetimeIndex(dates, freq=None, name='idx', tz=tz) + + taken1 = idx.take([5, 6, 8, 12]) + taken2 = idx[[5, 6, 8, 12]] + + for taken in [taken1, taken2]: + tm.assert_index_equal(taken, expected) + assert isinstance(taken, DatetimeIndex) + assert taken.freq is None + assert taken.tz == expected.tz + assert taken.name == expected.name + + def test_take_fill_value(self): + # GH#12631 + idx = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'], + name='xxx') + result = idx.take(np.array([1, 0, -1])) + expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'], + name='xxx') + tm.assert_index_equal(result, expected) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + expected = 
pd.DatetimeIndex(['2011-02-01', '2011-01-01', 'NaT'], + name='xxx') + tm.assert_index_equal(result, expected) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, + fill_value=True) + expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'], + name='xxx') + tm.assert_index_equal(result, expected) + + msg = ('When allow_fill=True and fill_value is not None, ' + 'all indices must be >= -1') + with tm.assert_raises_regex(ValueError, msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with tm.assert_raises_regex(ValueError, msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + with pytest.raises(IndexError): + idx.take(np.array([1, -5])) + + def test_take_fill_value_with_timezone(self): + idx = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'], + name='xxx', tz='US/Eastern') + result = idx.take(np.array([1, 0, -1])) + expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'], + name='xxx', tz='US/Eastern') + tm.assert_index_equal(result, expected) + + # fill_value + result = idx.take(np.array([1, 0, -1]), fill_value=True) + expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', 'NaT'], + name='xxx', tz='US/Eastern') + tm.assert_index_equal(result, expected) + + # allow_fill=False + result = idx.take(np.array([1, 0, -1]), allow_fill=False, + fill_value=True) + expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'], + name='xxx', tz='US/Eastern') + tm.assert_index_equal(result, expected) + + msg = ('When allow_fill=True and fill_value is not None, ' + 'all indices must be >= -1') + with tm.assert_raises_regex(ValueError, msg): + idx.take(np.array([1, 0, -2]), fill_value=True) + with tm.assert_raises_regex(ValueError, msg): + idx.take(np.array([1, 0, -5]), fill_value=True) + + with pytest.raises(IndexError): + idx.take(np.array([1, -5])) + + +class TestDatetimeIndex(object): @pytest.mark.parametrize('null', [None, np.nan, pd.NaT]) @pytest.mark.parametrize('tz', [None, 'UTC', 'US/Eastern']) def test_insert_nat(self, tz, null): @@ -253,233 +482,108 @@ def test_delete_slice(self): assert result.freq == expected.freq assert result.tz == expected.tz - def test_getitem(self): - idx1 = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') - idx2 = pd.date_range('2011-01-01', '2011-01-31', freq='D', - tz='Asia/Tokyo', name='idx') - - for idx in [idx1, idx2]: - result = idx[0] - assert result == Timestamp('2011-01-01', tz=idx.tz) - - result = idx[0:5] - expected = pd.date_range('2011-01-01', '2011-01-05', freq='D', - tz=idx.tz, name='idx') - tm.assert_index_equal(result, expected) - assert result.freq == expected.freq - - result = idx[0:10:2] - expected = pd.date_range('2011-01-01', '2011-01-09', freq='2D', - tz=idx.tz, name='idx') - tm.assert_index_equal(result, expected) - assert result.freq == expected.freq - - result = idx[-20:-5:3] - expected = pd.date_range('2011-01-12', '2011-01-24', freq='3D', - tz=idx.tz, name='idx') - tm.assert_index_equal(result, expected) - assert result.freq == expected.freq - - result = idx[4::-1] - expected = DatetimeIndex(['2011-01-05', '2011-01-04', '2011-01-03', - '2011-01-02', '2011-01-01'], - freq='-1D', tz=idx.tz, name='idx') - tm.assert_index_equal(result, expected) - assert result.freq == expected.freq - - def test_take(self): - # GH 10295 - idx1 = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') - idx2 = pd.date_range('2011-01-01', '2011-01-31', freq='D', - tz='Asia/Tokyo', name='idx') - - for idx in [idx1, idx2]: - result = idx.take([0]) - 
assert result == Timestamp('2011-01-01', tz=idx.tz) - - result = idx.take([0, 1, 2]) - expected = pd.date_range('2011-01-01', '2011-01-03', freq='D', - tz=idx.tz, name='idx') - tm.assert_index_equal(result, expected) - assert result.freq == expected.freq - - result = idx.take([0, 2, 4]) - expected = pd.date_range('2011-01-01', '2011-01-05', freq='2D', - tz=idx.tz, name='idx') - tm.assert_index_equal(result, expected) - assert result.freq == expected.freq - - result = idx.take([7, 4, 1]) - expected = pd.date_range('2011-01-08', '2011-01-02', freq='-3D', - tz=idx.tz, name='idx') - tm.assert_index_equal(result, expected) - assert result.freq == expected.freq - - result = idx.take([3, 2, 5]) - expected = DatetimeIndex(['2011-01-04', '2011-01-03', - '2011-01-06'], - freq=None, tz=idx.tz, name='idx') - tm.assert_index_equal(result, expected) - assert result.freq is None - - result = idx.take([-3, 2, 5]) - expected = DatetimeIndex(['2011-01-29', '2011-01-03', - '2011-01-06'], - freq=None, tz=idx.tz, name='idx') - tm.assert_index_equal(result, expected) - assert result.freq is None - - def test_take_invalid_kwargs(self): - idx = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') - indices = [1, 6, 5, 9, 10, 13, 15, 3] - - msg = r"take\(\) got an unexpected keyword argument 'foo'" - tm.assert_raises_regex(TypeError, msg, idx.take, - indices, foo=2) - - msg = "the 'out' parameter is not supported" - tm.assert_raises_regex(ValueError, msg, idx.take, - indices, out=indices) - - msg = "the 'mode' parameter is not supported" - tm.assert_raises_regex(ValueError, msg, idx.take, - indices, mode='clip') - - # TODO: This method came from test_datetime; de-dup with version above - @pytest.mark.parametrize('tz', [None, 'US/Eastern', 'Asia/Tokyo']) - def test_take2(self, tz): - dates = [datetime(2010, 1, 1, 14), datetime(2010, 1, 1, 15), - datetime(2010, 1, 1, 17), datetime(2010, 1, 1, 21)] - - idx = DatetimeIndex(start='2010-01-01 09:00', - end='2010-02-01 09:00', freq='H', tz=tz, - name='idx') - expected = DatetimeIndex(dates, freq=None, name='idx', tz=tz) - - taken1 = idx.take([5, 6, 8, 12]) - taken2 = idx[[5, 6, 8, 12]] - - for taken in [taken1, taken2]: - tm.assert_index_equal(taken, expected) - assert isinstance(taken, DatetimeIndex) - assert taken.freq is None - assert taken.tz == expected.tz - assert taken.name == expected.name - - def test_take_fill_value(self): - # GH 12631 - idx = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'], - name='xxx') - result = idx.take(np.array([1, 0, -1])) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'], - name='xxx') - tm.assert_index_equal(result, expected) - - # fill_value - result = idx.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', 'NaT'], - name='xxx') - tm.assert_index_equal(result, expected) - - # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'], - name='xxx') - tm.assert_index_equal(result, expected) - - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') - with tm.assert_raises_regex(ValueError, msg): - idx.take(np.array([1, 0, -2]), fill_value=True) - with tm.assert_raises_regex(ValueError, msg): - idx.take(np.array([1, 0, -5]), fill_value=True) - - with pytest.raises(IndexError): - idx.take(np.array([1, -5])) - - def test_take_fill_value_with_timezone(self): - idx = pd.DatetimeIndex(['2011-01-01', 
'2011-02-01', '2011-03-01'], - name='xxx', tz='US/Eastern') - result = idx.take(np.array([1, 0, -1])) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'], - name='xxx', tz='US/Eastern') - tm.assert_index_equal(result, expected) - - # fill_value - result = idx.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', 'NaT'], - name='xxx', tz='US/Eastern') - tm.assert_index_equal(result, expected) - - # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'], - name='xxx', tz='US/Eastern') - tm.assert_index_equal(result, expected) - - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') - with tm.assert_raises_regex(ValueError, msg): - idx.take(np.array([1, 0, -2]), fill_value=True) - with tm.assert_raises_regex(ValueError, msg): - idx.take(np.array([1, 0, -5]), fill_value=True) - - with pytest.raises(IndexError): - idx.take(np.array([1, -5])) - - -class TestBusinessDatetimeIndexIndexing(object): - def setup_method(self, method): - self.rng = pd.bdate_range(START, END) - - def test_getitem(self): - smaller = self.rng[:5] - exp = DatetimeIndex(self.rng.view(np.ndarray)[:5]) - tm.assert_index_equal(smaller, exp) - - assert smaller.offset == self.rng.offset - - sliced = self.rng[::5] - assert sliced.offset == BDay() * 5 - - fancy_indexed = self.rng[[4, 3, 2, 1, 0]] - assert len(fancy_indexed) == 5 - assert isinstance(fancy_indexed, DatetimeIndex) - assert fancy_indexed.freq is None - - # 32-bit vs. 64-bit platforms - assert self.rng[4] == self.rng[np.int_(4)] - - def test_getitem_matplotlib_hackaround(self): - values = self.rng[:, None] - expected = self.rng.values[:, None] - tm.assert_numpy_array_equal(values, expected) - - -class TestCustomDatetimeIndexIndexing(object): - def setup_method(self, method): - self.rng = pd.bdate_range(START, END, freq='C') - - def test_getitem(self): - smaller = self.rng[:5] - exp = DatetimeIndex(self.rng.view(np.ndarray)[:5]) - tm.assert_index_equal(smaller, exp) - assert smaller.offset == self.rng.offset - - sliced = self.rng[::5] - assert sliced.offset == CDay() * 5 - - fancy_indexed = self.rng[[4, 3, 2, 1, 0]] - assert len(fancy_indexed) == 5 - assert isinstance(fancy_indexed, DatetimeIndex) - assert fancy_indexed.freq is None - - # 32-bit vs. 
64-bit platforms - assert self.rng[4] == self.rng[np.int_(4)] - - def test_getitem_matplotlib_hackaround(self): - values = self.rng[:, None] - expected = self.rng.values[:, None] - tm.assert_numpy_array_equal(values, expected) + def test_get_loc(self): + idx = pd.date_range('2000-01-01', periods=3) + + for method in [None, 'pad', 'backfill', 'nearest']: + assert idx.get_loc(idx[1], method) == 1 + assert idx.get_loc(idx[1].to_pydatetime(), method) == 1 + assert idx.get_loc(str(idx[1]), method) == 1 + + if method is not None: + assert idx.get_loc(idx[1], method, + tolerance=pd.Timedelta('0 days')) == 1 + + assert idx.get_loc('2000-01-01', method='nearest') == 0 + assert idx.get_loc('2000-01-01T12', method='nearest') == 1 + + assert idx.get_loc('2000-01-01T12', method='nearest', + tolerance='1 day') == 1 + assert idx.get_loc('2000-01-01T12', method='nearest', + tolerance=pd.Timedelta('1D')) == 1 + assert idx.get_loc('2000-01-01T12', method='nearest', + tolerance=np.timedelta64(1, 'D')) == 1 + assert idx.get_loc('2000-01-01T12', method='nearest', + tolerance=timedelta(1)) == 1 + with tm.assert_raises_regex(ValueError, + 'unit abbreviation w/o a number'): + idx.get_loc('2000-01-01T12', method='nearest', tolerance='foo') + with pytest.raises(KeyError): + idx.get_loc('2000-01-01T03', method='nearest', tolerance='2 hours') + with pytest.raises( + ValueError, + match='tolerance size must match target index size'): + idx.get_loc('2000-01-01', method='nearest', + tolerance=[pd.Timedelta('1day').to_timedelta64(), + pd.Timedelta('1day').to_timedelta64()]) + + assert idx.get_loc('2000', method='nearest') == slice(0, 3) + assert idx.get_loc('2000-01', method='nearest') == slice(0, 3) + + assert idx.get_loc('1999', method='nearest') == 0 + assert idx.get_loc('2001', method='nearest') == 2 + + with pytest.raises(KeyError): + idx.get_loc('1999', method='pad') + with pytest.raises(KeyError): + idx.get_loc('2001', method='backfill') + + with pytest.raises(KeyError): + idx.get_loc('foobar') + with pytest.raises(TypeError): + idx.get_loc(slice(2)) + + idx = pd.to_datetime(['2000-01-01', '2000-01-04']) + assert idx.get_loc('2000-01-02', method='nearest') == 0 + assert idx.get_loc('2000-01-03', method='nearest') == 1 + assert idx.get_loc('2000-01', method='nearest') == slice(0, 2) + + # time indexing + idx = pd.date_range('2000-01-01', periods=24, freq='H') + tm.assert_numpy_array_equal(idx.get_loc(time(12)), + np.array([12]), check_dtype=False) + tm.assert_numpy_array_equal(idx.get_loc(time(12, 30)), + np.array([]), check_dtype=False) + with pytest.raises(NotImplementedError): + idx.get_loc(time(12, 30), method='pad') + + def test_get_indexer(self): + idx = pd.date_range('2000-01-01', periods=3) + exp = np.array([0, 1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(idx.get_indexer(idx), exp) + + target = idx[0] + pd.to_timedelta(['-1 hour', '12 hours', + '1 day 1 hour']) + tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), + np.array([-1, 0, 1], dtype=np.intp)) + tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), + np.array([0, 1, 2], dtype=np.intp)) + tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), + np.array([0, 1, 1], dtype=np.intp)) + tm.assert_numpy_array_equal( + idx.get_indexer(target, 'nearest', + tolerance=pd.Timedelta('1 hour')), + np.array([0, -1, 1], dtype=np.intp)) + tol_raw = [pd.Timedelta('1 hour'), + pd.Timedelta('1 hour'), + pd.Timedelta('1 hour').to_timedelta64(), ] + tm.assert_numpy_array_equal( + idx.get_indexer(target, 'nearest', + 
tolerance=[np.timedelta64(x) for x in tol_raw]), + np.array([0, -1, 1], dtype=np.intp)) + tol_bad = [pd.Timedelta('2 hour').to_timedelta64(), + pd.Timedelta('1 hour').to_timedelta64(), + 'foo', ] + with pytest.raises( + ValueError, match='abbreviation w/o a number'): + idx.get_indexer(target, 'nearest', tolerance=tol_bad) + with pytest.raises(ValueError): + idx.get_indexer(idx[[0]], method='nearest', tolerance='foo') + + def test_reasonable_keyerror(self): + # GH#1062 + index = DatetimeIndex(['1/3/2000']) + try: + index.get_loc('1/1/2000') + except KeyError as e: + assert '2000' in str(e) diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index b913934195260..6b8e2203e83fd 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -6,9 +6,9 @@ import pandas as pd from pandas.util import testing as tm from pandas.compat import lrange -from pandas._libs import tslib, tslibs +from pandas._libs import tslibs from pandas import (PeriodIndex, Series, DatetimeIndex, - period_range, Period) + period_range, Period, notna) from pandas._libs.tslibs import period as libperiod @@ -119,7 +119,7 @@ def test_getitem_datetime(self): def test_getitem_nat(self): idx = pd.PeriodIndex(['2011-01', 'NaT', '2011-02'], freq='M') assert idx[0] == pd.Period('2011-01', freq='M') - assert idx[1] is tslib.NaT + assert idx[1] is pd.NaT s = pd.Series([0, 1, 2], index=idx) assert s[pd.NaT] == 1 @@ -127,7 +127,7 @@ def test_getitem_nat(self): s = pd.Series(idx, index=idx) assert (s[pd.Period('2011-01', freq='M')] == pd.Period('2011-01', freq='M')) - assert s[pd.NaT] is tslib.NaT + assert s[pd.NaT] is pd.NaT def test_getitem_list_periods(self): # GH 7710 @@ -190,31 +190,43 @@ def test_getitem_day(self): s[v] -class TestIndexing(object): +class TestWhere(object): + @pytest.mark.parametrize('klass', [list, tuple, np.array, Series]) + def test_where(self, klass): + i = period_range('20130101', periods=5, freq='D') + cond = [True] * len(i) + expected = i + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) - def test_get_loc_msg(self): - idx = period_range('2000-1-1', freq='A', periods=10) - bad_period = Period('2012', 'A') - pytest.raises(KeyError, idx.get_loc, bad_period) + cond = [False] + [True] * (len(i) - 1) + expected = PeriodIndex([pd.NaT] + i[1:].tolist(), freq='D') + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) - try: - idx.get_loc(bad_period) - except KeyError as inst: - assert inst.args[0] == bad_period + def test_where_other(self): + i = period_range('20130101', periods=5, freq='D') + for arr in [np.nan, pd.NaT]: + result = i.where(notna(i), other=np.nan) + expected = i + tm.assert_index_equal(result, expected) - def test_get_loc_nat(self): - didx = DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03']) - pidx = PeriodIndex(['2011-01-01', 'NaT', '2011-01-03'], freq='M') + i2 = i.copy() + i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), + freq='D') + result = i.where(notna(i2), i2) + tm.assert_index_equal(result, i2) - # check DatetimeIndex compat - for idx in [didx, pidx]: - assert idx.get_loc(pd.NaT) == 1 - assert idx.get_loc(None) == 1 - assert idx.get_loc(float('nan')) == 1 - assert idx.get_loc(np.nan) == 1 + i2 = i.copy() + i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), + freq='D') + result = i.where(notna(i2), i2.values) + tm.assert_index_equal(result, i2) + +class TestTake(object): def test_take(self): - # GH 10295 + # GH#10295 idx1 = 
pd.period_range('2011-01-01', '2011-01-31', freq='D', name='idx') @@ -278,7 +290,7 @@ def test_take_misc(self): assert taken.name == expected.name def test_take_fill_value(self): - # GH 12631 + # GH#12631 idx = pd.PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01'], name='xxx', freq='D') result = idx.take(np.array([1, 0, -1])) @@ -309,6 +321,30 @@ def test_take_fill_value(self): with pytest.raises(IndexError): idx.take(np.array([1, -5])) + +class TestIndexing(object): + + def test_get_loc_msg(self): + idx = period_range('2000-1-1', freq='A', periods=10) + bad_period = Period('2012', 'A') + pytest.raises(KeyError, idx.get_loc, bad_period) + + try: + idx.get_loc(bad_period) + except KeyError as inst: + assert inst.args[0] == bad_period + + def test_get_loc_nat(self): + didx = DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03']) + pidx = PeriodIndex(['2011-01-01', 'NaT', '2011-01-03'], freq='M') + + # check DatetimeIndex compat + for idx in [didx, pidx]: + assert idx.get_loc(pd.NaT) == 1 + assert idx.get_loc(None) == 1 + assert idx.get_loc(float('nan')) == 1 + assert idx.get_loc(np.nan) == 1 + def test_get_loc(self): # GH 17717 p0 = pd.Period('2017-09-01') diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index dd437363cfc1d..4548d7fa1a468 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -5,7 +5,7 @@ import pandas as pd import pandas.util._test_decorators as td from pandas.util import testing as tm -from pandas import (PeriodIndex, period_range, notna, DatetimeIndex, NaT, +from pandas import (PeriodIndex, period_range, DatetimeIndex, NaT, Index, Period, Series, DataFrame, date_range, offsets) @@ -33,38 +33,9 @@ def test_pickle_round_trip(self, freq): result = tm.round_trip_pickle(idx) tm.assert_index_equal(result, idx) - @pytest.mark.parametrize('klass', [list, tuple, np.array, Series]) - def test_where(self, klass): - i = self.create_index() - cond = [True] * len(i) - expected = i - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) - - cond = [False] + [True] * (len(i) - 1) - expected = PeriodIndex([NaT] + i[1:].tolist(), freq='D') - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) - - def test_where_other(self): - - i = self.create_index() - for arr in [np.nan, pd.NaT]: - result = i.where(notna(i), other=np.nan) - expected = i - tm.assert_index_equal(result, expected) - - i2 = i.copy() - i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), - freq='D') - result = i.where(notna(i2), i2) - tm.assert_index_equal(result, i2) - - i2 = i.copy() - i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), - freq='D') - result = i.where(notna(i2), i2.values) - tm.assert_index_equal(result, i2) + def test_where(self): + # This is handled in test_indexing + pass def test_repeat(self): # GH10183 diff --git a/pandas/tests/indexes/timedeltas/test_arithmetic.py b/pandas/tests/indexes/timedeltas/test_arithmetic.py index 24341b3419859..282501860f7e5 100644 --- a/pandas/tests/indexes/timedeltas/test_arithmetic.py +++ b/pandas/tests/indexes/timedeltas/test_arithmetic.py @@ -123,8 +123,149 @@ def test_comparisons_nat(self): tm.assert_numpy_array_equal(result, expected) +class TestTimedeltaIndexMultiplicationDivision(object): + # __mul__, __rmul__, + # __div__, __rdiv__, __floordiv__, __rfloordiv__, + # __mod__, __rmod__, __divmod__, __rdivmod__ + + # ------------------------------------------------------------- + # Multiplication + # organized with scalar 
others first, then array-like + + def test_tdi_mul_int(self): + idx = TimedeltaIndex(np.arange(5, dtype='int64')) + result = idx * 1 + tm.assert_index_equal(result, idx) + + def test_tdi_rmul_int(self): + idx = TimedeltaIndex(np.arange(5, dtype='int64')) + result = 1 * idx + tm.assert_index_equal(result, idx) + + def test_tdi_mul_tdlike_scalar_raises(self, delta): + rng = timedelta_range('1 days', '10 days', name='foo') + with pytest.raises(TypeError): + rng * delta + + def test_tdi_mul_int_array_zerodim(self): + rng5 = np.arange(5, dtype='int64') + idx = TimedeltaIndex(rng5) + expected = TimedeltaIndex(rng5 * 5) + result = idx * np.array(5, dtype='int64') + tm.assert_index_equal(result, expected) + + def test_tdi_mul_int_array(self): + rng5 = np.arange(5, dtype='int64') + idx = TimedeltaIndex(rng5) + didx = TimedeltaIndex(rng5 ** 2) + + result = idx * rng5 + tm.assert_index_equal(result, didx) + + def test_tdi_mul_dti_raises(self): + idx = TimedeltaIndex(np.arange(5, dtype='int64')) + with pytest.raises(TypeError): + idx * idx + + def test_tdi_mul_too_short_raises(self): + idx = TimedeltaIndex(np.arange(5, dtype='int64')) + with pytest.raises(TypeError): + idx * TimedeltaIndex(np.arange(3)) + with pytest.raises(ValueError): + idx * np.array([1, 2]) + + def test_tdi_mul_int_series(self): + idx = TimedeltaIndex(np.arange(5, dtype='int64')) + didx = TimedeltaIndex(np.arange(5, dtype='int64') ** 2) + + result = idx * Series(np.arange(5, dtype='int64')) + + tm.assert_series_equal(result, Series(didx)) + + def test_tdi_mul_float_series(self): + idx = TimedeltaIndex(np.arange(5, dtype='int64')) + + rng5f = np.arange(5, dtype='float64') + result = idx * Series(rng5f + 0.1) + expected = Series(TimedeltaIndex(rng5f * (rng5f + 0.1))) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('other', [np.arange(1, 11), + pd.Int64Index(range(1, 11)), + pd.UInt64Index(range(1, 11)), + pd.Float64Index(range(1, 11)), + pd.RangeIndex(1, 11)]) + def test_tdi_rmul_arraylike(self, other): + tdi = TimedeltaIndex(['1 Day'] * 10) + expected = timedelta_range('1 days', '10 days') + + result = other * tdi + tm.assert_index_equal(result, expected) + commute = tdi * other + tm.assert_index_equal(commute, expected) + + # ------------------------------------------------------------- + # TimedeltaIndex.__div__ + + def test_tdi_div_int(self): + idx = TimedeltaIndex(np.arange(5, dtype='int64')) + result = idx / 1 + tm.assert_index_equal(result, idx) + + def test_tdi_div_tdlike_scalar(self, delta): + rng = timedelta_range('1 days', '10 days', name='foo') + expected = Int64Index((np.arange(10) + 1) * 12, name='foo') + + result = rng / delta + tm.assert_index_equal(result, expected, exact=False) + + def test_tdi_div_tdlike_scalar_with_nat(self, delta): + rng = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') + expected = Float64Index([12, np.nan, 24], name='foo') + result = rng / delta + tm.assert_index_equal(result, expected) + + def test_tdi_div_nat_raises(self): + # don't allow division by NaT (make could in the future) + rng = timedelta_range('1 days', '10 days', name='foo') + with pytest.raises(TypeError): + rng / pd.NaT + + # ------------------------------------------------------------- + # TimedeltaIndex.__floordiv__ + + def test_tdi_floordiv_int(self): + idx = TimedeltaIndex(np.arange(5, dtype='int64')) + result = idx // 1 + tm.assert_index_equal(result, idx) + + def test_tdi_floordiv_tdlike_scalar(self, delta): + tdi = timedelta_range('1 days', '10 days', name='foo') + expected = 
Int64Index((np.arange(10) + 1) * 12, name='foo') + + result = tdi // delta + tm.assert_index_equal(result, expected, exact=False) + + @pytest.mark.parametrize('scalar_td', [ + timedelta(minutes=10, seconds=7), + Timedelta('10m7s'), + Timedelta('10m7s').to_timedelta64()]) + def test_tdi_floordiv_timedelta_scalar(self, scalar_td): + # GH#19125 + tdi = TimedeltaIndex(['00:05:03', '00:05:03', pd.NaT], freq=None) + expected = pd.Index([2.0, 2.0, np.nan]) + + res = tdi.__rfloordiv__(scalar_td) + tm.assert_index_equal(res, expected) + + expected = pd.Index([0.0, 0.0, np.nan]) + + res = tdi // (scalar_td) + tm.assert_index_equal(res, expected) + + class TestTimedeltaIndexArithmetic(object): - _holder = TimedeltaIndex + # Addition and Subtraction Operations # ------------------------------------------------------------- # Invalid Operations @@ -138,6 +279,20 @@ def test_tdi_add_str_invalid(self): with pytest.raises(TypeError): 'a' + tdi + @pytest.mark.parametrize('freq', [None, 'H']) + def test_tdi_sub_period(self, freq): + # GH#13078 + # not supported, check TypeError + p = pd.Period('2011-01-01', freq='D') + + idx = pd.TimedeltaIndex(['1 hours', '2 hours'], freq=freq) + + with pytest.raises(TypeError): + idx - p + + with pytest.raises(TypeError): + p - idx + # ------------------------------------------------------------- # TimedeltaIndex.shift is used by __add__/__sub__ @@ -310,69 +465,6 @@ def test_tdi_add_sub_anchored_offset_arraylike(self, box): with tm.assert_produces_warning(PerformanceWarning): anchored - tdi - def test_mul_int(self): - idx = self._holder(np.arange(5, dtype='int64')) - result = idx * 1 - tm.assert_index_equal(result, idx) - - def test_rmul_int(self): - idx = self._holder(np.arange(5, dtype='int64')) - result = 1 * idx - tm.assert_index_equal(result, idx) - - def test_div_int(self): - idx = self._holder(np.arange(5, dtype='int64')) - result = idx / 1 - tm.assert_index_equal(result, idx) - - def test_floordiv_int(self): - idx = self._holder(np.arange(5, dtype='int64')) - result = idx // 1 - tm.assert_index_equal(result, idx) - - def test_mul_int_array_zerodim(self): - rng5 = np.arange(5, dtype='int64') - idx = self._holder(rng5) - expected = self._holder(rng5 * 5) - result = idx * np.array(5, dtype='int64') - tm.assert_index_equal(result, expected) - - def test_mul_int_array(self): - rng5 = np.arange(5, dtype='int64') - idx = self._holder(rng5) - didx = self._holder(rng5 ** 2) - - result = idx * rng5 - tm.assert_index_equal(result, didx) - - def test_mul_int_series(self): - idx = self._holder(np.arange(5, dtype='int64')) - didx = self._holder(np.arange(5, dtype='int64') ** 2) - - result = idx * Series(np.arange(5, dtype='int64')) - - tm.assert_series_equal(result, Series(didx)) - - def test_mul_float_series(self): - idx = self._holder(np.arange(5, dtype='int64')) - - rng5f = np.arange(5, dtype='float64') - result = idx * Series(rng5f + 0.1) - expected = Series(self._holder(rng5f * (rng5f + 0.1))) - tm.assert_series_equal(result, expected) - - def test_dti_mul_dti_raises(self): - idx = self._holder(np.arange(5, dtype='int64')) - with pytest.raises(TypeError): - idx * idx - - def test_dti_mul_too_short_raises(self): - idx = self._holder(np.arange(5, dtype='int64')) - with pytest.raises(TypeError): - idx * self._holder(np.arange(3)) - with pytest.raises(ValueError): - idx * np.array([1, 2]) - def test_ufunc_coercions(self): # normal ops are also tested in tseries/test_timedeltas.py idx = TimedeltaIndex(['2H', '4H', '6H', '8H', '10H'], @@ -496,68 +588,6 @@ def 
test_tdi_radd_timestamp(self): # ------------------------------------------------------------- - @pytest.mark.parametrize('scalar_td', [ - timedelta(minutes=10, seconds=7), - Timedelta('10m7s'), - Timedelta('10m7s').to_timedelta64()]) - def test_tdi_floordiv_timedelta_scalar(self, scalar_td): - # GH#19125 - tdi = TimedeltaIndex(['00:05:03', '00:05:03', pd.NaT], freq=None) - expected = pd.Index([2.0, 2.0, np.nan]) - - res = tdi.__rfloordiv__(scalar_td) - tm.assert_index_equal(res, expected) - - expected = pd.Index([0.0, 0.0, np.nan]) - - res = tdi // (scalar_td) - tm.assert_index_equal(res, expected) - - def test_tdi_floordiv_tdlike_scalar(self, delta): - tdi = timedelta_range('1 days', '10 days', name='foo') - expected = Int64Index((np.arange(10) + 1) * 12, name='foo') - - result = tdi // delta - tm.assert_index_equal(result, expected, exact=False) - - def test_tdi_mul_tdlike_scalar_raises(self, delta): - rng = timedelta_range('1 days', '10 days', name='foo') - with pytest.raises(TypeError): - rng * delta - - def test_tdi_div_nat_raises(self): - # don't allow division by NaT (make could in the future) - rng = timedelta_range('1 days', '10 days', name='foo') - with pytest.raises(TypeError): - rng / pd.NaT - - def test_tdi_div_tdlike_scalar(self, delta): - rng = timedelta_range('1 days', '10 days', name='foo') - expected = Int64Index((np.arange(10) + 1) * 12, name='foo') - - result = rng / delta - tm.assert_index_equal(result, expected, exact=False) - - def test_tdi_div_tdlike_scalar_with_nat(self, delta): - rng = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') - expected = Float64Index([12, np.nan, 24], name='foo') - result = rng / delta - tm.assert_index_equal(result, expected) - - @pytest.mark.parametrize('other', [np.arange(1, 11), - pd.Int64Index(range(1, 11)), - pd.UInt64Index(range(1, 11)), - pd.Float64Index(range(1, 11)), - pd.RangeIndex(1, 11)]) - def test_tdi_rmul_arraylike(self, other): - tdi = TimedeltaIndex(['1 Day'] * 10) - expected = timedelta_range('1 days', '10 days') - - result = other * tdi - tm.assert_index_equal(result, expected) - commute = tdi * other - tm.assert_index_equal(commute, expected) - def test_subtraction_ops(self): # with datetimes/timedelta and tdi/dti tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') @@ -685,20 +715,6 @@ def test_dti_tdi_numeric_ops(self): expected = DatetimeIndex(['20121231', pd.NaT, '20130101']) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('freq', [None, 'H']) - def test_sub_period(self, freq): - # GH 13078 - # not supported, check TypeError - p = pd.Period('2011-01-01', freq='D') - - idx = pd.TimedeltaIndex(['1 hours', '2 hours'], freq=freq) - - with pytest.raises(TypeError): - idx - p - - with pytest.raises(TypeError): - p - idx - def test_addition_ops(self): # with datetimes/timedelta and tdi/dti tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index 59e38c2e738b0..08992188265bd 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -8,116 +8,7 @@ from pandas import TimedeltaIndex, timedelta_range, compat, Index, Timedelta -class TestTimedeltaIndex(object): - - def test_insert(self): - - idx = TimedeltaIndex(['4day', '1day', '2day'], name='idx') - - result = idx.insert(2, timedelta(days=5)) - exp = TimedeltaIndex(['4day', '1day', '5day', '2day'], name='idx') - tm.assert_index_equal(result, exp) - - # 
insertion of non-datetime should coerce to object index - result = idx.insert(1, 'inserted') - expected = Index([Timedelta('4day'), 'inserted', Timedelta('1day'), - Timedelta('2day')], name='idx') - assert not isinstance(result, TimedeltaIndex) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - - idx = timedelta_range('1day 00:00:01', periods=3, freq='s', name='idx') - - # preserve freq - expected_0 = TimedeltaIndex(['1day', '1day 00:00:01', '1day 00:00:02', - '1day 00:00:03'], - name='idx', freq='s') - expected_3 = TimedeltaIndex(['1day 00:00:01', '1day 00:00:02', - '1day 00:00:03', '1day 00:00:04'], - name='idx', freq='s') - - # reset freq to None - expected_1_nofreq = TimedeltaIndex(['1day 00:00:01', '1day 00:00:01', - '1day 00:00:02', '1day 00:00:03'], - name='idx', freq=None) - expected_3_nofreq = TimedeltaIndex(['1day 00:00:01', '1day 00:00:02', - '1day 00:00:03', '1day 00:00:05'], - name='idx', freq=None) - - cases = [(0, Timedelta('1day'), expected_0), - (-3, Timedelta('1day'), expected_0), - (3, Timedelta('1day 00:00:04'), expected_3), - (1, Timedelta('1day 00:00:01'), expected_1_nofreq), - (3, Timedelta('1day 00:00:05'), expected_3_nofreq)] - - for n, d, expected in cases: - result = idx.insert(n, d) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - - # GH 18295 (test missing) - expected = TimedeltaIndex(['1day', pd.NaT, '2day', '3day']) - for na in (np.nan, pd.NaT, None): - result = timedelta_range('1day', '3day').insert(1, na) - tm.assert_index_equal(result, expected) - - def test_delete(self): - idx = timedelta_range(start='1 Days', periods=5, freq='D', name='idx') - - # prserve freq - expected_0 = timedelta_range(start='2 Days', periods=4, freq='D', - name='idx') - expected_4 = timedelta_range(start='1 Days', periods=4, freq='D', - name='idx') - - # reset freq to None - expected_1 = TimedeltaIndex( - ['1 day', '3 day', '4 day', '5 day'], freq=None, name='idx') - - cases = {0: expected_0, - -5: expected_0, - -1: expected_4, - 4: expected_4, - 1: expected_1} - for n, expected in compat.iteritems(cases): - result = idx.delete(n) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - - with pytest.raises((IndexError, ValueError)): - # either depeidnig on numpy version - result = idx.delete(5) - - def test_delete_slice(self): - idx = timedelta_range(start='1 days', periods=10, freq='D', name='idx') - - # prserve freq - expected_0_2 = timedelta_range(start='4 days', periods=7, freq='D', - name='idx') - expected_7_9 = timedelta_range(start='1 days', periods=7, freq='D', - name='idx') - - # reset freq to None - expected_3_5 = TimedeltaIndex(['1 d', '2 d', '3 d', - '7 d', '8 d', '9 d', '10d'], - freq=None, name='idx') - - cases = {(0, 1, 2): expected_0_2, - (7, 8, 9): expected_7_9, - (3, 4, 5): expected_3_5} - for n, expected in compat.iteritems(cases): - result = idx.delete(n) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - - result = idx.delete(slice(n[0], n[-1] + 1)) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - +class TestGetItem(object): def test_getitem(self): idx1 = timedelta_range('1 day', '31 day', freq='D', name='idx') @@ -150,6 +41,13 @@ def test_getitem(self): tm.assert_index_equal(result, expected) assert result.freq == expected.freq + +class TestWhere(object): + # 
placeholder for symmetry with DatetimeIndex and PeriodIndex tests + pass + + +class TestTake(object): def test_take(self): # GH 10295 idx1 = timedelta_range('1 day', '31 day', freq='D', name='idx') @@ -252,6 +150,117 @@ def test_take_fill_value(self): with pytest.raises(IndexError): idx.take(np.array([1, -5])) + +class TestTimedeltaIndex(object): + + def test_insert(self): + + idx = TimedeltaIndex(['4day', '1day', '2day'], name='idx') + + result = idx.insert(2, timedelta(days=5)) + exp = TimedeltaIndex(['4day', '1day', '5day', '2day'], name='idx') + tm.assert_index_equal(result, exp) + + # insertion of non-datetime should coerce to object index + result = idx.insert(1, 'inserted') + expected = Index([Timedelta('4day'), 'inserted', Timedelta('1day'), + Timedelta('2day')], name='idx') + assert not isinstance(result, TimedeltaIndex) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + + idx = timedelta_range('1day 00:00:01', periods=3, freq='s', name='idx') + + # preserve freq + expected_0 = TimedeltaIndex(['1day', '1day 00:00:01', '1day 00:00:02', + '1day 00:00:03'], + name='idx', freq='s') + expected_3 = TimedeltaIndex(['1day 00:00:01', '1day 00:00:02', + '1day 00:00:03', '1day 00:00:04'], + name='idx', freq='s') + + # reset freq to None + expected_1_nofreq = TimedeltaIndex(['1day 00:00:01', '1day 00:00:01', + '1day 00:00:02', '1day 00:00:03'], + name='idx', freq=None) + expected_3_nofreq = TimedeltaIndex(['1day 00:00:01', '1day 00:00:02', + '1day 00:00:03', '1day 00:00:05'], + name='idx', freq=None) + + cases = [(0, Timedelta('1day'), expected_0), + (-3, Timedelta('1day'), expected_0), + (3, Timedelta('1day 00:00:04'), expected_3), + (1, Timedelta('1day 00:00:01'), expected_1_nofreq), + (3, Timedelta('1day 00:00:05'), expected_3_nofreq)] + + for n, d, expected in cases: + result = idx.insert(n, d) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + # GH 18295 (test missing) + expected = TimedeltaIndex(['1day', pd.NaT, '2day', '3day']) + for na in (np.nan, pd.NaT, None): + result = timedelta_range('1day', '3day').insert(1, na) + tm.assert_index_equal(result, expected) + + def test_delete(self): + idx = timedelta_range(start='1 Days', periods=5, freq='D', name='idx') + + # prserve freq + expected_0 = timedelta_range(start='2 Days', periods=4, freq='D', + name='idx') + expected_4 = timedelta_range(start='1 Days', periods=4, freq='D', + name='idx') + + # reset freq to None + expected_1 = TimedeltaIndex( + ['1 day', '3 day', '4 day', '5 day'], freq=None, name='idx') + + cases = {0: expected_0, + -5: expected_0, + -1: expected_4, + 4: expected_4, + 1: expected_1} + for n, expected in compat.iteritems(cases): + result = idx.delete(n) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + with pytest.raises((IndexError, ValueError)): + # either depeidnig on numpy version + result = idx.delete(5) + + def test_delete_slice(self): + idx = timedelta_range(start='1 days', periods=10, freq='D', name='idx') + + # prserve freq + expected_0_2 = timedelta_range(start='4 days', periods=7, freq='D', + name='idx') + expected_7_9 = timedelta_range(start='1 days', periods=7, freq='D', + name='idx') + + # reset freq to None + expected_3_5 = TimedeltaIndex(['1 d', '2 d', '3 d', + '7 d', '8 d', '9 d', '10d'], + freq=None, name='idx') + + cases = {(0, 1, 2): expected_0_2, + (7, 8, 9): expected_7_9, + (3, 4, 5): expected_3_5} + for n, expected in 
compat.iteritems(cases): + result = idx.delete(n) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + result = idx.delete(slice(n[0], n[-1] + 1)) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + def test_get_loc(self): idx = pd.to_timedelta(['0 days', '1 days', '2 days']) From 2c1a3985e60b77a55d3464646ab9341a9a0bd4f3 Mon Sep 17 00:00:00 2001 From: luzpaz Date: Sat, 24 Feb 2018 08:26:05 -0500 Subject: [PATCH 183/214] DOC: misc. typos (#19876) Found via `codespell -q 3 -I ../pandas-whitelist.txt` Where whitelists consists of: ``` ans behaviour doubleclick indicies initialise initialised initialising nd resetted splitted thru valu ``` --- doc/source/basics.rst | 2 +- doc/source/dsintro.rst | 2 +- doc/source/whatsnew/v0.14.1.txt | 2 +- pandas/_libs/groupby_helper.pxi.in | 2 +- pandas/core/arrays/categorical.py | 2 +- pandas/core/internals.py | 8 ++++---- pandas/plotting/_converter.py | 2 +- pandas/tests/extension/category/test_categorical.py | 2 +- pandas/tests/extension/conftest.py | 2 +- pandas/tests/frame/test_mutate_columns.py | 2 +- pandas/tests/frame/test_repr_info.py | 2 +- pandas/tests/io/test_excel.py | 2 +- pandas/tests/io/test_stata.py | 2 +- pandas/tests/reshape/test_concat.py | 4 ++-- pandas/tests/sparse/frame/test_frame.py | 6 +++--- 15 files changed, 21 insertions(+), 21 deletions(-) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 749d4be11ad45..e1b36a6acad70 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -2312,4 +2312,4 @@ All NumPy dtypes are subclasses of ``numpy.generic``: .. note:: Pandas also defines the types ``category``, and ``datetime64[ns, tz]``, which are not integrated into the normal - NumPy hierarchy and wont show up with the above function. + NumPy hierarchy and won't show up with the above function. diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index 582750b16f40d..e8f73a9ec2e8a 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -539,7 +539,7 @@ To write code compatible with all versions of Python, split the assignment in tw you'll need to take care when passing ``assign`` expressions that * Updating an existing column - * Refering to the newly updated column in the same ``assign`` + * Referring to the newly updated column in the same ``assign`` For example, we'll update column "A" and then refer to it when creating "B". 
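To make the ``assign`` behaviour described above concrete, a minimal sketch (the frame and the added columns here are invented for illustration and are not taken from the patch itself):

    import pandas as pd

    df = pd.DataFrame({'A': [1, 2, 3]})

    # Compatible with all Python versions: split the assignment in two,
    # so the lambda for "B" sees the already-updated "A"
    tmp = df.assign(A=df['A'] + 1)
    result = tmp.assign(B=lambda x: x['A'] + 10)

    # On Python >= 3.6 (pandas >= 0.23), keyword arguments are evaluated in
    # order, so "B" may refer to the freshly updated "A" within a single call
    result = df.assign(A=df['A'] + 1, B=lambda x: x['A'] + 10)

    # Both approaches yield A = [2, 3, 4] and B = [12, 13, 14]
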
diff --git a/doc/source/whatsnew/v0.14.1.txt b/doc/source/whatsnew/v0.14.1.txt index d8a6dc1793612..4674cbc846722 100644 --- a/doc/source/whatsnew/v0.14.1.txt +++ b/doc/source/whatsnew/v0.14.1.txt @@ -145,7 +145,7 @@ Performance ~~~~~~~~~~~ - Improvements in dtype inference for numeric operations involving yielding performance gains for dtypes: ``int64``, ``timedelta64``, ``datetime64`` (:issue:`7223`) - Improvements in Series.transform for significant performance gains (:issue:`6496`) -- Improvements in DataFrame.transform with ufuncs and built-in grouper functions for signifcant performance gains (:issue:`7383`) +- Improvements in DataFrame.transform with ufuncs and built-in grouper functions for significant performance gains (:issue:`7383`) - Regression in groupby aggregation of datetime64 dtypes (:issue:`7555`) - Improvements in `MultiIndex.from_product` for large iterables (:issue:`7627`) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 93fbb4477e2d0..e03e3af65755b 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -426,7 +426,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, labels : array containing unique label for each group, with its ordering matching up to the corresponding record in `values` is_datetimelike : bool - unused in this method but provided for call compatability with other + unused in this method but provided for call compatibility with other Cython transformations ties_method : {'keep', 'top', 'bottom'} * keep: leave NA values where they are diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 493b2e5bd899b..c6eeabf0148d0 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -521,7 +521,7 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes, cats = to_timedelta(inferred_categories, errors='coerce') if known_categories: - # recode from observation oder to dtype.categories order + # recode from observation order to dtype.categories order categories = dtype.categories codes = _recode_for_categories(inferred_codes, cats, categories) elif not cats.is_monotonic_increasing: diff --git a/pandas/core/internals.py b/pandas/core/internals.py index bad0626206e80..d385185fbb558 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2600,12 +2600,12 @@ def __init__(self, values, placement, ndim=None): def _maybe_coerce_values(self, values): """Input validation for values passed to __init__. Ensure that - we have datetime64ns, coercing if nescessary. + we have datetime64ns, coercing if necessary. Parametetrs ----------- values : array-like - Must be convertable to datetime64 + Must be convertible to datetime64 Returns ------- @@ -2760,12 +2760,12 @@ def __init__(self, values, placement, ndim=2, dtype=None): def _maybe_coerce_values(self, values, dtype=None): """Input validation for values passed to __init__. Ensure that - we have datetime64TZ, coercing if nescessary. + we have datetime64TZ, coercing if necessary. 
Parametetrs ----------- values : array-like - Must be convertable to datetime64 + Must be convertible to datetime64 dtype : string or DatetimeTZDtype, optional Does a shallow copy to this tz diff --git a/pandas/plotting/_converter.py b/pandas/plotting/_converter.py index 9ca06475290e4..f413e4177b386 100644 --- a/pandas/plotting/_converter.py +++ b/pandas/plotting/_converter.py @@ -197,7 +197,7 @@ def __call__(self, x, pos=0): ---------- x : float The time of day specified as seconds since 00:00 (midnight), - with upto microsecond precision. + with up to microsecond precision. pos Unused diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py index ec548fca6d901..8f413b4a19730 100644 --- a/pandas/tests/extension/category/test_categorical.py +++ b/pandas/tests/extension/category/test_categorical.py @@ -60,7 +60,7 @@ def test_align_frame(self, data, na_value): class TestGetitem(base.BaseGetitemTests): - @pytest.mark.skip(reason="Backwards compatability") + @pytest.mark.skip(reason="Backwards compatibility") def test_getitem_scalar(self): # CategoricalDtype.type isn't "correct" since it should # be a parent of the elements (object). But don't want diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index f86849b9cbd61..21ed8894e8ebb 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -37,7 +37,7 @@ def na_cmp(): Should return a function of two arguments that returns True if both arguments are (scalar) NA for your type. - By defult, uses ``operator.or`` + By default, uses ``operator.or`` """ return operator.is_ diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 4c560129bfa45..51ffe2966b4e5 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -95,7 +95,7 @@ def test_assign_bad(self): def test_assign_dependent_old_python(self): df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) - # Key C does not exist at defition time of df + # Key C does not exist at definition time of df with pytest.raises(KeyError): df.assign(C=lambda df: df.A, D=lambda df: df['A'] + df['C']) diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 8c46dc30a0f5f..3e5aae10618e9 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -307,7 +307,7 @@ def test_info_memory_usage(self): res = buf.getvalue().splitlines() assert "memory usage: " in res[-1] - # do not display memory usage cas + # do not display memory usage case df.info(buf=buf, memory_usage=False) res = buf.getvalue().splitlines() assert "memory usage: " not in res[-1] diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 4c790a0f0f64a..86cee54665781 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -1786,7 +1786,7 @@ def roundtrip(df, header=True, parser_hdr=0, index=True): nrows = 5 ncols = 3 for use_headers in (True, False): - for i in range(1, 4): # row multindex upto nlevel=3 + for i in range(1, 4): # row multindex up to nlevel=3 for j in range(1, 4): # col "" df = mkdf(nrows, ncols, r_idx_nlevels=i, c_idx_nlevels=j) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 4e259d0994bdb..49ad07b79d111 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -336,7 +336,7 @@ def test_read_write_dta10(self): with tm.ensure_clean() as path: 
original.to_stata(path, {'datetime': 'tc'}) written_and_read_again = self.read_dta(path) - # original.index is np.int32, readed index is np.int64 + # original.index is np.int32, read index is np.int64 tm.assert_frame_equal(written_and_read_again.set_index('index'), original, check_index_type=False) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 7e126dd56775b..cc4eb6b475ae5 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -473,7 +473,7 @@ def test_concat_categorical(self): tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - # completelly different categories (same dtype) => not-category + # completely different categories (same dtype) => not-category s1 = pd.Series([10, 11, np.nan], dtype='category') s2 = pd.Series([np.nan, 1, 3, 2], dtype='category') @@ -518,7 +518,7 @@ def test_concat_categorical_coercion(self): tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) - # completelly different categories => not-category + # completely different categories => not-category s1 = pd.Series([10, 11, np.nan], dtype='category') s2 = pd.Series([1, 3, 2]) diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 0e8b2161cafc4..ee0d63aff7367 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -247,10 +247,10 @@ def test_constructor_preserve_attr(self): def test_constructor_nan_dataframe(self): # GH 10079 trains = np.arange(100) - tresholds = [10, 20, 30, 40, 50, 60] - tuples = [(i, j) for i in trains for j in tresholds] + thresholds = [10, 20, 30, 40, 50, 60] + tuples = [(i, j) for i in trains for j in thresholds] index = pd.MultiIndex.from_tuples(tuples, - names=['trains', 'tresholds']) + names=['trains', 'thresholds']) matrix = np.empty((len(index), len(trains))) matrix.fill(np.nan) df = pd.DataFrame(matrix, index=index, columns=trains, dtype=float) From bd76ce9401b5f3c9506191b2802e96730f42d7cb Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 24 Feb 2018 15:39:07 +0100 Subject: [PATCH 184/214] DOC: remove deprecated from_items from dsintro docs (#19837) --- doc/source/dsintro.rst | 35 +++++++++++++---------------------- pandas/core/frame.py | 16 +++++++++------- 2 files changed, 22 insertions(+), 29 deletions(-) diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index e8f73a9ec2e8a..1ba00b8fb6f23 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -364,6 +364,19 @@ and returns a DataFrame. It operates like the ``DataFrame`` constructor except for the ``orient`` parameter which is ``'columns'`` by default, but which can be set to ``'index'`` in order to use the dict keys as row labels. + +.. ipython:: python + + pd.DataFrame.from_dict(dict([('A', [1, 2, 3]), ('B', [4, 5, 6])])) + +If you pass ``orient='index'``, the keys will be the row labels. In this +case, you can also pass the desired column names: + +.. ipython:: python + + pd.DataFrame.from_dict(dict([('A', [1, 2, 3]), ('B', [4, 5, 6])]), + orient='index', columns=['one', 'two', 'three']) + .. _basics.dataframe.from_records: **DataFrame.from_records** @@ -378,28 +391,6 @@ dtype. For example: data pd.DataFrame.from_records(data, index='C') -.. 
_basics.dataframe.from_items: - -**DataFrame.from_items** - -``DataFrame.from_items`` works analogously to the form of the ``dict`` -constructor that takes a sequence of ``(key, value)`` pairs, where the keys are -column (or row, in the case of ``orient='index'``) names, and the value are the -column values (or row values). This can be useful for constructing a DataFrame -with the columns in a particular order without having to pass an explicit list -of columns: - -.. ipython:: python - - pd.DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])]) - -If you pass ``orient='index'``, the keys will be the row labels. But in this -case you must also pass the desired column names: - -.. ipython:: python - - pd.DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])], - orient='index', columns=['one', 'two', 'three']) Column selection, addition, deletion ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1c5cf87d6b39b..061b69f25e7ac 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1255,12 +1255,14 @@ def to_records(self, index=True, convert_datetime64=True): @classmethod def from_items(cls, items, columns=None, orient='columns'): - """ + """Construct a dataframe from a list of tuples + .. deprecated:: 0.23.0 - from_items is deprecated and will be removed in a - future version. Use :meth:`DataFrame.from_dict(dict())` - instead. :meth:`DataFrame.from_dict(OrderedDict(...))` may be used - to preserve the key order. + `from_items` is deprecated and will be removed in a future version. + Use :meth:`DataFrame.from_dict(dict(items)) ` + instead. + :meth:`DataFrame.from_dict(OrderedDict(items)) ` + may be used to preserve the key order. Convert (key, value) pairs to DataFrame. The keys will be the axis index (usually the columns, but depends on the specified @@ -1284,8 +1286,8 @@ def from_items(cls, items, columns=None, orient='columns'): """ warnings.warn("from_items is deprecated. Please use " - "DataFrame.from_dict(dict()) instead. " - "DataFrame.from_dict(OrderedDict()) may be used to " + "DataFrame.from_dict(dict(items), ...) instead. " + "DataFrame.from_dict(OrderedDict(items)) may be used to " "preserve the key order.", FutureWarning, stacklevel=2) From e8b80b1d468650377b86c92b9dee7e6a77cfc45a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 24 Feb 2018 06:43:34 -0800 Subject: [PATCH 185/214] De-duplicate add_offset_array methods (#19835) --- pandas/core/indexes/datetimelike.py | 97 +++++++++++++++++++---------- pandas/core/indexes/datetimes.py | 23 ------- pandas/core/indexes/period.py | 23 ------- pandas/core/indexes/timedeltas.py | 38 ++--------- 4 files changed, 70 insertions(+), 111 deletions(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index ac75e5ae5e2a0..a68d883f04380 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -2,7 +2,7 @@ Base and utility classes for tseries type pandas objects. 
""" import warnings - +import operator from datetime import datetime, timedelta from pandas import compat @@ -10,6 +10,12 @@ from pandas.core.tools.timedeltas import to_timedelta import numpy as np + +from pandas._libs import lib, iNaT, NaT +from pandas._libs.tslibs.period import Period +from pandas._libs.tslibs.timedeltas import delta_to_nanoseconds +from pandas._libs.tslibs.timestamps import round_ns + from pandas.core.dtypes.common import ( _ensure_int64, is_dtype_equal, @@ -25,18 +31,15 @@ is_integer_dtype, is_object_dtype, is_string_dtype, + is_period_dtype, is_timedelta64_dtype) from pandas.core.dtypes.generic import ( ABCIndex, ABCSeries, ABCPeriodIndex, ABCIndexClass) from pandas.core.dtypes.missing import isna from pandas.core import common as com, algorithms, ops from pandas.core.algorithms import checked_add_with_arr -from pandas.errors import NullFrequencyError +from pandas.errors import NullFrequencyError, PerformanceWarning import pandas.io.formats.printing as printing -from pandas._libs import lib, iNaT, NaT -from pandas._libs.tslibs.period import Period -from pandas._libs.tslibs.timedeltas import delta_to_nanoseconds -from pandas._libs.tslibs.timestamps import round_ns from pandas.core.indexes.base import Index, _index_shared_docs from pandas.util._decorators import Appender, cache_readonly @@ -637,13 +640,33 @@ def _sub_datelike(self, other): def _sub_period(self, other): return NotImplemented - def _add_offset_array(self, other): - # Array/Index of DateOffset objects - return NotImplemented + def _addsub_offset_array(self, other, op): + """ + Add or subtract array-like of DateOffset objects - def _sub_offset_array(self, other): - # Array/Index of DateOffset objects - return NotImplemented + Parameters + ---------- + other : Index, np.ndarray + object-dtype containing pd.DateOffset objects + op : {operator.add, operator.sub} + + Returns + ------- + result : same class as self + """ + assert op in [operator.add, operator.sub] + if len(other) == 1: + return op(self, other[0]) + + warnings.warn("Adding/subtracting array of DateOffsets to " + "{cls} not vectorized" + .format(cls=type(self).__name__), PerformanceWarning) + + res_values = op(self.astype('O').values, np.array(other)) + kwargs = {} + if not is_period_dtype(self): + kwargs['freq'] = 'infer' + return self._constructor(res_values, **kwargs) @classmethod def _add_datetimelike_methods(cls): @@ -660,13 +683,24 @@ def __add__(self, other): other = lib.item_from_zerodim(other) if isinstance(other, ABCSeries): return NotImplemented - elif is_timedelta64_dtype(other): + + # scalar others + elif isinstance(other, (DateOffset, timedelta, np.timedelta64)): result = self._add_delta(other) - elif isinstance(other, (DateOffset, timedelta)): + elif isinstance(other, (datetime, np.datetime64)): + result = self._add_datelike(other) + elif is_integer(other): + # This check must come after the check for np.timedelta64 + # as is_integer returns True for these + result = self.shift(other) + + # array-like others + elif is_timedelta64_dtype(other): + # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(other) elif is_offsetlike(other): # Array/Index of DateOffset objects - result = self._add_offset_array(other) + result = self._addsub_offset_array(other, operator.add) elif isinstance(self, TimedeltaIndex) and isinstance(other, Index): if hasattr(other, '_add_delta'): # i.e. 
DatetimeIndex, TimedeltaIndex, or PeriodIndex @@ -674,12 +708,6 @@ def __add__(self, other): else: raise TypeError("cannot add TimedeltaIndex and {typ}" .format(typ=type(other))) - elif is_integer(other): - # This check must come after the check for timedelta64_dtype - # or else it will incorrectly catch np.timedelta64 objects - result = self.shift(other) - elif isinstance(other, (datetime, np.datetime64)): - result = self._add_datelike(other) elif isinstance(other, Index): result = self._add_datelike(other) elif is_integer_dtype(other) and self.freq is None: @@ -709,13 +737,26 @@ def __sub__(self, other): other = lib.item_from_zerodim(other) if isinstance(other, ABCSeries): return NotImplemented - elif is_timedelta64_dtype(other): + + # scalar others + elif isinstance(other, (DateOffset, timedelta, np.timedelta64)): result = self._add_delta(-other) - elif isinstance(other, (DateOffset, timedelta)): + elif isinstance(other, (datetime, np.datetime64)): + result = self._sub_datelike(other) + elif is_integer(other): + # This check must come after the check for np.timedelta64 + # as is_integer returns True for these + result = self.shift(-other) + elif isinstance(other, Period): + result = self._sub_period(other) + + # array-like others + elif is_timedelta64_dtype(other): + # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(-other) elif is_offsetlike(other): # Array/Index of DateOffset objects - result = self._sub_offset_array(other) + result = self._addsub_offset_array(other, operator.sub) elif isinstance(self, TimedeltaIndex) and isinstance(other, Index): # We checked above for timedelta64_dtype(other) so this # must be invalid. @@ -723,14 +764,6 @@ def __sub__(self, other): .format(typ=type(other).__name__)) elif isinstance(other, DatetimeIndex): result = self._sub_datelike(other) - elif is_integer(other): - # This check must come after the check for timedelta64_dtype - # or else it will incorrectly catch np.timedelta64 objects - result = self.shift(-other) - elif isinstance(other, (datetime, np.datetime64)): - result = self._sub_datelike(other) - elif isinstance(other, Period): - result = self._sub_period(other) elif isinstance(other, Index): raise TypeError("cannot subtract {typ1} and {typ2}" .format(typ1=type(self).__name__, diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 17f92339e4205..36ea2bffb9531 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -964,29 +964,6 @@ def _add_offset(self, offset): "or DatetimeIndex", PerformanceWarning) return self.astype('O') + offset - def _add_offset_array(self, other): - # Array/Index of DateOffset objects - if len(other) == 1: - return self + other[0] - else: - warnings.warn("Adding/subtracting array of DateOffsets to " - "{} not vectorized".format(type(self)), - PerformanceWarning) - return self.astype('O') + np.array(other) - # TODO: pass freq='infer' like we do in _sub_offset_array? 
- # TODO: This works for __add__ but loses dtype in __sub__ - - def _sub_offset_array(self, other): - # Array/Index of DateOffset objects - if len(other) == 1: - return self - other[0] - else: - warnings.warn("Adding/subtracting array of DateOffsets to " - "{} not vectorized".format(type(self)), - PerformanceWarning) - res_values = self.astype('O').values - np.array(other) - return self.__class__(res_values, freq='infer') - def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): from pandas.io.formats.format import _get_format_datetime64_from_values format = _get_format_datetime64_from_values(self, date_format) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 4c14cbffcd813..f0567c9c963af 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -44,7 +44,6 @@ from pandas.util._decorators import (Appender, Substitution, cache_readonly, deprecate_kwarg) from pandas.compat import zip, u -from pandas.errors import PerformanceWarning import pandas.core.indexes.base as ibase _index_doc_kwargs = dict(ibase._index_doc_kwargs) @@ -745,28 +744,6 @@ def _sub_period(self, other): # result must be Int64Index or Float64Index return Index(new_data) - def _add_offset_array(self, other): - # Array/Index of DateOffset objects - if len(other) == 1: - return self + other[0] - else: - warnings.warn("Adding/subtracting array of DateOffsets to " - "{cls} not vectorized" - .format(cls=type(self).__name__), PerformanceWarning) - res_values = self.astype('O').values + np.array(other) - return self.__class__(res_values) - - def _sub_offset_array(self, other): - # Array/Index of DateOffset objects - if len(other) == 1: - return self - other[0] - else: - warnings.warn("Adding/subtracting array of DateOffsets to " - "{cls} not vectorized" - .format(cls=type(self).__name__), PerformanceWarning) - res_values = self.astype('O').values - np.array(other) - return self.__class__(res_values) - def shift(self, n): """ Specialized shift which produces an PeriodIndex diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 3542a24290f89..219adfdb66c82 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -1,7 +1,6 @@ """ implement the TimedeltaIndex """ from datetime import timedelta -import warnings import numpy as np from pandas.core.dtypes.common import ( @@ -433,43 +432,16 @@ def _sub_datelike(self, other): else: raise TypeError("cannot subtract a datelike from a TimedeltaIndex") - def _add_offset_array(self, other): - # Array/Index of DateOffset objects + def _addsub_offset_array(self, other, op): + # Add or subtract Array-like of DateOffset objects try: # TimedeltaIndex can only operate with a subset of DateOffset # subclasses. Incompatible classes will raise AttributeError, # which we re-raise as TypeError - if len(other) == 1: - return self + other[0] - else: - from pandas.errors import PerformanceWarning - warnings.warn("Adding/subtracting array of DateOffsets to " - "{} not vectorized".format(type(self)), - PerformanceWarning) - return self.astype('O') + np.array(other) - # TODO: pass freq='infer' like we do in _sub_offset_array? - # TODO: This works for __add__ but loses dtype in __sub__ - except AttributeError: - raise TypeError("Cannot add non-tick DateOffset to TimedeltaIndex") - - def _sub_offset_array(self, other): - # Array/Index of DateOffset objects - try: - # TimedeltaIndex can only operate with a subset of DateOffset - # subclasses. 
Incompatible classes will raise AttributeError, - # which we re-raise as TypeError - if len(other) == 1: - return self - other[0] - else: - from pandas.errors import PerformanceWarning - warnings.warn("Adding/subtracting array of DateOffsets to " - "{} not vectorized".format(type(self)), - PerformanceWarning) - res_values = self.astype('O').values - np.array(other) - return self.__class__(res_values, freq='infer') + return DatetimeIndexOpsMixin._addsub_offset_array(self, other, op) except AttributeError: - raise TypeError("Cannot subtrack non-tick DateOffset from" - " TimedeltaIndex") + raise TypeError("Cannot add/subtract non-tick DateOffset to {cls}" + .format(cls=type(self).__name__)) def _format_native_types(self, na_rep=u('NaT'), date_format=None, **kwargs): From fdc0f257b74e9f2334f017265e213d5a81d01e95 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Sat, 24 Feb 2018 14:55:35 +0000 Subject: [PATCH 186/214] Let initialisation from dicts use insertion order for python >= 3.6 (part II) (#19859) --- pandas/tests/groupby/test_groupby.py | 16 +++---- pandas/tests/groupby/test_transform.py | 8 +++- pandas/tests/indexing/test_ix.py | 16 ++++--- pandas/tests/io/formats/test_format.py | 12 +++--- pandas/tests/io/formats/test_to_latex.py | 21 ++++----- pandas/tests/io/json/test_pandas.py | 2 +- pandas/tests/reshape/merge/test_merge.py | 43 ++++++++++--------- .../tests/reshape/merge/test_merge_ordered.py | 5 ++- pandas/tests/reshape/test_concat.py | 8 ++-- pandas/tests/reshape/test_melt.py | 8 ++-- pandas/tests/reshape/test_reshape.py | 24 +++++------ 11 files changed, 86 insertions(+), 77 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4cf7c8013aa2b..129ac6b06205c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -99,9 +99,9 @@ def max_value(group): applied = df.groupby('A').apply(max_value) result = applied.get_dtype_counts().sort_values() - expected = Series({'object': 2, - 'float64': 2, - 'int64': 1}).sort_values() + expected = Series({'float64': 2, + 'int64': 1, + 'object': 2}).sort_values() assert_series_equal(result, expected) def test_groupby_return_type(self): @@ -244,7 +244,7 @@ def func_with_no_date(batch): return pd.Series({'c': 2}) def func_with_date(batch): - return pd.Series({'c': 2, 'b': datetime(2015, 1, 1)}) + return pd.Series({'b': datetime(2015, 1, 1), 'c': 2}) dfg_no_conversion = df.groupby(by=['a']).apply(func_with_no_date) dfg_no_conversion_expected = pd.DataFrame({'c': 2}, index=[1]) @@ -1628,8 +1628,8 @@ def f(g): def test_apply_with_mixed_dtype(self): # GH3480, apply with mixed dtype on axis=1 breaks in 0.11 - df = DataFrame({'foo1': ['one', 'two', 'two', 'three', 'one', 'two'], - 'foo2': np.random.randn(6)}) + df = DataFrame({'foo1': np.random.randn(6), + 'foo2': ['one', 'two', 'two', 'three', 'one', 'two']}) result = df.apply(lambda x: x, axis=1) assert_series_equal(df.get_dtype_counts(), result.get_dtype_counts()) @@ -2113,10 +2113,10 @@ def test_multifunc_sum_bug(self): def test_handle_dict_return_value(self): def f(group): - return {'min': group.min(), 'max': group.max()} + return {'max': group.max(), 'min': group.min()} def g(group): - return Series({'min': group.min(), 'max': group.max()}) + return Series({'max': group.max(), 'min': group.min()}) result = self.df.groupby('A')['C'].apply(f) expected = self.df.groupby('A')['C'].apply(g) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 
4159d0f709a13..1be7dfdcc64e6 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -519,7 +519,9 @@ def test_cython_transform_frame(self, op, args, targop): 'timedelta': pd.timedelta_range(1, freq='s', periods=1000), 'string': strings * 50, - 'string_missing': strings_missing * 50}) + 'string_missing': strings_missing * 50}, + columns=['float', 'float_missing', 'int', 'datetime', + 'timedelta', 'string', 'string_missing']) df['cat'] = df['string'].astype('category') df2 = df.copy() @@ -552,7 +554,9 @@ def test_cython_transform_frame(self, op, args, targop): tm.assert_frame_equal(expected, gb.transform(op, *args).sort_index( axis=1)) - tm.assert_frame_equal(expected, getattr(gb, op)(*args)) + tm.assert_frame_equal( + expected, + getattr(gb, op)(*args).sort_index(axis=1)) # individual columns for c in df: if c not in ['float', 'int', 'float_missing' diff --git a/pandas/tests/indexing/test_ix.py b/pandas/tests/indexing/test_ix.py index 3f71e673a4ffe..c84576c984525 100644 --- a/pandas/tests/indexing/test_ix.py +++ b/pandas/tests/indexing/test_ix.py @@ -53,13 +53,15 @@ def test_ix_loc_setitem_consistency(self): # GH 8607 # ix setitem consistency - df = DataFrame({'timestamp': [1413840976, 1413842580, 1413760580], - 'delta': [1174, 904, 161], - 'elapsed': [7673, 9277, 1470]}) - expected = DataFrame({'timestamp': pd.to_datetime( - [1413840976, 1413842580, 1413760580], unit='s'), - 'delta': [1174, 904, 161], - 'elapsed': [7673, 9277, 1470]}) + df = DataFrame({'delta': [1174, 904, 161], + 'elapsed': [7673, 9277, 1470], + 'timestamp': [1413840976, 1413842580, 1413760580]}) + expected = DataFrame({'delta': [1174, 904, 161], + 'elapsed': [7673, 9277, 1470], + 'timestamp': pd.to_datetime( + [1413840976, 1413842580, 1413760580], + unit='s') + }) df2 = df.copy() df2['timestamp'] = pd.to_datetime(df['timestamp'], unit='s') diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index dddba5b425c3b..03c071dbe4bc5 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -539,8 +539,8 @@ def test_east_asian_unicode_frame(self): assert _rep(df) == expected # column name - df = DataFrame({u'あああああ': [1, 222, 33333, 4], - 'b': [u'あ', u'いいい', u'う', u'ええええええ']}, + df = DataFrame({'b': [u'あ', u'いいい', u'う', u'ええええええ'], + u'あああああ': [1, 222, 33333, 4]}, index=['a', 'bb', 'c', 'ddd']) expected = (u" b あああああ\na あ 1\n" u"bb いいい 222\nc う 33333\n" @@ -647,8 +647,8 @@ def test_east_asian_unicode_frame(self): assert _rep(df) == expected # column name - df = DataFrame({u'あああああ': [1, 222, 33333, 4], - 'b': [u'あ', u'いいい', u'う', u'ええええええ']}, + df = DataFrame({'b': [u'あ', u'いいい', u'う', u'ええええええ'], + u'あああああ': [1, 222, 33333, 4]}, index=['a', 'bb', 'c', 'ddd']) expected = (u" b あああああ\n" u"a あ 1\n" @@ -733,8 +733,8 @@ def test_east_asian_unicode_frame(self): assert _rep(df) == expected # ambiguous unicode - df = DataFrame({u'あああああ': [1, 222, 33333, 4], - 'b': [u'あ', u'いいい', u'¡¡', u'ええええええ']}, + df = DataFrame({'b': [u'あ', u'いいい', u'¡¡', u'ええええええ'], + u'あああああ': [1, 222, 33333, 4]}, index=['a', 'bb', 'c', '¡¡¡']) expected = (u" b あああああ\n" u"a あ 1\n" diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index f266a8b3a3268..5ebf196be094e 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -115,17 +115,18 @@ def test_to_latex_empty(self): assert result == expected def test_to_latex_with_formatters(self): - df = 
DataFrame({'int': [1, 2, 3], + df = DataFrame({'datetime64': [datetime(2016, 1, 1), + datetime(2016, 2, 5), + datetime(2016, 3, 3)], 'float': [1.0, 2.0, 3.0], + 'int': [1, 2, 3], 'object': [(1, 2), True, False], - 'datetime64': [datetime(2016, 1, 1), - datetime(2016, 2, 5), - datetime(2016, 3, 3)]}) + }) - formatters = {'int': lambda x: '0x{x:x}'.format(x=x), + formatters = {'datetime64': lambda x: x.strftime('%Y-%m'), 'float': lambda x: '[{x: 4.1f}]'.format(x=x), + 'int': lambda x: '0x{x:x}'.format(x=x), 'object': lambda x: '-{x!s}-'.format(x=x), - 'datetime64': lambda x: x.strftime('%Y-%m'), '__index__': lambda x: 'index: {x}'.format(x=x)} result = df.to_latex(formatters=dict(formatters)) @@ -347,10 +348,10 @@ def test_to_latex_escape(self): a = 'a' b = 'b' - test_dict = {u('co^l1'): {a: "a", - b: "b"}, - u('co$e^x$'): {a: "a", - b: "b"}} + test_dict = {u('co$e^x$'): {a: "a", + b: "b"}, + u('co^l1'): {a: "a", + b: "b"}} unescaped_result = DataFrame(test_dict).to_latex(escape=False) escaped_result = DataFrame(test_dict).to_latex( diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index a72744e08fa7c..7e497c395266f 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -553,7 +553,7 @@ def __str__(self): def test_label_overflow(self): # GH14256: buffer length not checked when writing label - df = pd.DataFrame({'foo': [1337], 'bar' * 100000: [1]}) + df = pd.DataFrame({'bar' * 100000: [1], 'foo': [1337]}) assert df.to_json() == \ '{{"{bar}":{{"0":1}},"foo":{{"0":1337}}}}'.format( bar=('bar' * 100000)) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 101d34ebdb89f..5dca45c8dd8bb 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -588,18 +588,18 @@ def test_merge_on_datetime64tz(self): result = pd.merge(left, right, on='key', how='outer') assert_frame_equal(result, expected) - left = pd.DataFrame({'value': pd.date_range('20151010', periods=2, - tz='US/Eastern'), - 'key': [1, 2]}) - right = pd.DataFrame({'value': pd.date_range('20151011', periods=2, - tz='US/Eastern'), - 'key': [2, 3]}) + left = pd.DataFrame({'key': [1, 2], + 'value': pd.date_range('20151010', periods=2, + tz='US/Eastern')}) + right = pd.DataFrame({'key': [2, 3], + 'value': pd.date_range('20151011', periods=2, + tz='US/Eastern')}) expected = DataFrame({ + 'key': [1, 2, 3], 'value_x': list(pd.date_range('20151010', periods=2, tz='US/Eastern')) + [pd.NaT], 'value_y': [pd.NaT] + list(pd.date_range('20151011', periods=2, - tz='US/Eastern')), - 'key': [1, 2, 3]}) + tz='US/Eastern'))}) result = pd.merge(left, right, on='key', how='outer') assert_frame_equal(result, expected) assert result['value_x'].dtype == 'datetime64[ns, US/Eastern]' @@ -632,18 +632,18 @@ def test_merge_on_periods(self): result = pd.merge(left, right, on='key', how='outer') assert_frame_equal(result, expected) - left = pd.DataFrame({'value': pd.period_range('20151010', periods=2, - freq='D'), - 'key': [1, 2]}) - right = pd.DataFrame({'value': pd.period_range('20151011', periods=2, - freq='D'), - 'key': [2, 3]}) + left = pd.DataFrame({'key': [1, 2], + 'value': pd.period_range('20151010', periods=2, + freq='D')}) + right = pd.DataFrame({'key': [2, 3], + 'value': pd.period_range('20151011', periods=2, + freq='D')}) exp_x = pd.period_range('20151010', periods=2, freq='D') exp_y = pd.period_range('20151011', periods=2, freq='D') - expected = DataFrame({'value_x': list(exp_x) + [pd.NaT], - 
'value_y': [pd.NaT] + list(exp_y), - 'key': [1, 2, 3]}) + expected = DataFrame({'key': [1, 2, 3], + 'value_x': list(exp_x) + [pd.NaT], + 'value_y': [pd.NaT] + list(exp_y)}) result = pd.merge(left, right, on='key', how='outer') assert_frame_equal(result, expected) assert result['value_x'].dtype == 'object' @@ -651,12 +651,13 @@ def test_merge_on_periods(self): def test_indicator(self): # PR #10054. xref #7412 and closes #8790. - df1 = DataFrame({'col1': [0, 1], 'col_left': [ - 'a', 'b'], 'col_conflict': [1, 2]}) + df1 = DataFrame({'col1': [0, 1], 'col_conflict': [1, 2], + 'col_left': ['a', 'b']}) df1_copy = df1.copy() - df2 = DataFrame({'col1': [1, 2, 3, 4, 5], 'col_right': [2, 2, 2, 2, 2], - 'col_conflict': [1, 2, 3, 4, 5]}) + df2 = DataFrame({'col1': [1, 2, 3, 4, 5], + 'col_conflict': [1, 2, 3, 4, 5], + 'col_right': [2, 2, 2, 2, 2]}) df2_copy = df2.copy() df_result = DataFrame({ diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index 31c484a483d18..42d8eb7273ee1 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -83,9 +83,10 @@ def test_empty_sequence_concat(self): pd.concat([pd.DataFrame(), None]) def test_doc_example(self): - left = DataFrame({'key': ['a', 'c', 'e', 'a', 'c', 'e'], + left = DataFrame({'group': list('aaabbb'), + 'key': ['a', 'c', 'e', 'a', 'c', 'e'], 'lvalue': [1, 2, 3] * 2, - 'group': list('aaabbb')}) + }) right = DataFrame({'key': ['b', 'c', 'd'], 'rvalue': [1, 2, 3]}) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index cc4eb6b475ae5..437b4179c580a 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1542,10 +1542,10 @@ def test_concat_bug_2972(self): def test_concat_bug_3602(self): # GH 3602, duplicate columns - df1 = DataFrame({'firmNo': [0, 0, 0, 0], 'stringvar': [ - 'rrr', 'rrr', 'rrr', 'rrr'], 'prc': [6, 6, 6, 6]}) - df2 = DataFrame({'misc': [1, 2, 3, 4], 'prc': [ - 6, 6, 6, 6], 'C': [9, 10, 11, 12]}) + df1 = DataFrame({'firmNo': [0, 0, 0, 0], 'prc': [6, 6, 6, 6], + 'stringvar': ['rrr', 'rrr', 'rrr', 'rrr']}) + df2 = DataFrame({'C': [9, 10, 11, 12], 'misc': [1, 2, 3, 4], + 'prc': [6, 6, 6, 6]}) expected = DataFrame([[0, 6, 'rrr', 9, 1, 6], [0, 6, 'rrr', 10, 2, 6], [0, 6, 'rrr', 11, 3, 6], diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index b7422dfd7e911..000b22d4fdd36 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -589,11 +589,11 @@ def test_nonnumeric_suffix(self): def test_mixed_type_suffix(self): df = pd.DataFrame({ - 'treatment_1': [1.0, 2.0], - 'treatment_foo': [3.0, 4.0], - 'result_foo': [5.0, 6.0], + 'A': ['X1', 'X2'], 'result_1': [0, 9], - 'A': ['X1', 'X2']}) + 'result_foo': [5.0, 6.0], + 'treatment_1': [1.0, 2.0], + 'treatment_foo': [3.0, 4.0]}) expected = pd.DataFrame({ 'A': ['X1', 'X2', 'X1', 'X2'], 'colname': ['1', '1', 'foo', 'foo'], diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index a57c3c41b3637..c4d925b83585b 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -100,8 +100,8 @@ def test_basic_types(self, sparse, dtype): expected_counts = {'int64': 1, 'object': 1} expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0) - expected = Series(expected_counts).sort_values() - tm.assert_series_equal(result.get_dtype_counts().sort_values(), + expected = 
Series(expected_counts).sort_index() + tm.assert_series_equal(result.get_dtype_counts().sort_index(), expected) def test_just_na(self, sparse): @@ -212,10 +212,10 @@ def test_dataframe_dummies_prefix_str(self, df, sparse): def test_dataframe_dummies_subset(self, df, sparse): result = get_dummies(df, prefix=['from_A'], columns=['A'], sparse=sparse) - expected = DataFrame({'from_A_a': [1, 0, 1], - 'from_A_b': [0, 1, 0], - 'B': ['b', 'b', 'c'], - 'C': [1, 2, 3]}, dtype=np.uint8) + expected = DataFrame({'B': ['b', 'b', 'c'], + 'C': [1, 2, 3], + 'from_A_a': [1, 0, 1], + 'from_A_b': [0, 1, 0]}, dtype=np.uint8) expected[['C']] = df[['C']] assert_frame_equal(result, expected) @@ -249,16 +249,16 @@ def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse): def test_dataframe_dummies_prefix_dict(self, sparse): prefixes = {'A': 'from_A', 'B': 'from_B'} - df = DataFrame({'A': ['a', 'b', 'a'], - 'B': ['b', 'b', 'c'], - 'C': [1, 2, 3]}) + df = DataFrame({'C': [1, 2, 3], + 'A': ['a', 'b', 'a'], + 'B': ['b', 'b', 'c']}) result = get_dummies(df, prefix=prefixes, sparse=sparse) - expected = DataFrame({'from_A_a': [1, 0, 1], + expected = DataFrame({'C': [1, 2, 3], + 'from_A_a': [1, 0, 1], 'from_A_b': [0, 1, 0], 'from_B_b': [1, 1, 0], - 'from_B_c': [0, 0, 1], - 'C': [1, 2, 3]}) + 'from_B_c': [0, 0, 1]}) columns = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c'] expected[columns] = expected[columns].astype(np.uint8) From 26dd5b1c895feef63249cd96d483470b8995a8e7 Mon Sep 17 00:00:00 2001 From: cbertinato Date: Sat, 24 Feb 2018 09:58:40 -0500 Subject: [PATCH 187/214] BUG: fix Series constructor for scalar and Categorical dtype (#19717) --- doc/source/whatsnew/v0.23.0.txt | 3 ++- pandas/core/dtypes/cast.py | 2 +- pandas/tests/dtypes/test_cast.py | 15 ++++++++++++++- pandas/tests/series/test_constructors.py | 7 +++++++ 4 files changed, 24 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index ca5749afd11bc..a188ddd613080 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -730,7 +730,8 @@ Categorical ``self`` but in a different order (:issue:`19551`) - Bug in :meth:`Index.astype` with a categorical dtype where the resultant index is not converted to a :class:`CategoricalIndex` for all types of index (:issue:`18630`) - Bug in :meth:`Series.astype` and ``Categorical.astype()`` where an existing categorical data does not get updated (:issue:`10696`, :issue:`18593`) -- Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (:issue:`19032`) +- Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19032`) +- Bug in :class:`Series` constructor with scalar and ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19565`) Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 55919fb2bea0d..352ce29f5c37b 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1178,7 +1178,7 @@ def construct_1d_arraylike_from_scalar(value, length, dtype): subarr = DatetimeIndex([value] * length, dtype=dtype) elif is_categorical_dtype(dtype): from pandas import Categorical - subarr = Categorical([value] * length) + subarr = Categorical([value] * length, dtype=dtype) else: if not isinstance(dtype, (np.dtype, type(np.dtype))): dtype = dtype.dtype diff --git 
a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index d13d781f03117..31bd962b67afb 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -22,7 +22,8 @@ maybe_convert_string_to_object, maybe_convert_scalar, find_common_type, - construct_1d_object_array_from_listlike) + construct_1d_object_array_from_listlike, + construct_1d_arraylike_from_scalar) from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, @@ -422,3 +423,15 @@ def test_cast_1d_array(self, datum1, datum2): @pytest.mark.parametrize('val', [1, 2., None]) def test_cast_1d_array_invalid_scalar(self, val): pytest.raises(TypeError, construct_1d_object_array_from_listlike, val) + + def test_cast_1d_arraylike_from_scalar_categorical(self): + # GH 19565 - Categorical result from scalar did not maintain categories + # and ordering of the passed dtype + cats = ['a', 'b', 'c'] + cat_type = CategoricalDtype(categories=cats, ordered=False) + expected = pd.Categorical(['a', 'a'], categories=cats) + result = construct_1d_arraylike_from_scalar('a', len(expected), + cat_type) + tm.assert_categorical_equal(result, expected, + check_category_order=True, + check_dtype=True) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 33737387edffa..77f9dfcce686d 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -270,6 +270,13 @@ def test_constructor_categorical_dtype(self): tm.assert_index_equal(result.cat.categories, pd.Index(['b', 'a'])) assert result.cat.ordered is False + # GH 19565 - Check broadcasting of scalar with Categorical dtype + result = Series('a', index=[0, 1], + dtype=CategoricalDtype(['a', 'b'], ordered=True)) + expected = Series(['a', 'a'], index=[0, 1], + dtype=CategoricalDtype(['a', 'b'], ordered=True)) + tm.assert_series_equal(result, expected, check_categorical=True) + def test_categorical_sideeffects_free(self): # Passing a categorical to a Series and then changing values in either # the series or the categorical should not change the values in the From ecef6d0860d5266fb38ca9d0183c5911af79448a Mon Sep 17 00:00:00 2001 From: jayfoad Date: Sat, 24 Feb 2018 15:08:16 +0000 Subject: [PATCH 188/214] Raise OptionError instead of KeyError in __getattr__. Fixes #19789. (#19790) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/config.py | 5 ++++- pandas/tests/test_config.py | 6 ++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index a188ddd613080..6bcc6d1582c34 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -909,3 +909,4 @@ Other ^^^^^ - Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`) +- Bug in accessing a :func:`pandas.get_option`, which raised ``KeyError`` rather than ``OptionError`` when looking up a non-existant option key in some cases (:issue:`19789`) diff --git a/pandas/core/config.py b/pandas/core/config.py index 692aed178719d..369e0568346ef 100644 --- a/pandas/core/config.py +++ b/pandas/core/config.py @@ -196,7 +196,10 @@ def __getattr__(self, key): if prefix: prefix += "." 
prefix += key - v = object.__getattribute__(self, "d")[key] + try: + v = object.__getattribute__(self, "d")[key] + except KeyError: + raise OptionError("No such option") if isinstance(v, dict): return DictWrapper(v, prefix) else: diff --git a/pandas/tests/test_config.py b/pandas/tests/test_config.py index 8d6f36ac6a798..91ce65dcce9b2 100644 --- a/pandas/tests/test_config.py +++ b/pandas/tests/test_config.py @@ -428,3 +428,9 @@ def test_option_context_scope(self): # Ensure the current context is reset assert self.cf.get_option(option_name) == original_value + + def test_dictwrapper_getattr(self): + options = self.cf.options + # GH 19789 + pytest.raises(self.cf.OptionError, getattr, options, 'bananas') + assert not hasattr(options, 'bananas') From 26a2d41ebdb506e2419f2b38a53ecf622d16c319 Mon Sep 17 00:00:00 2001 From: Jaume Bonet Date: Sat, 24 Feb 2018 16:10:34 +0100 Subject: [PATCH 189/214] Keep subclassing in apply (#19823) --- doc/source/whatsnew/v0.23.0.txt | 2 ++ pandas/core/apply.py | 16 ++++----- pandas/tests/frame/test_subclass.py | 56 +++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 6bcc6d1582c34..fd3c3a5a7a301 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -295,8 +295,10 @@ Other Enhancements - ``IntervalIndex.astype`` now supports conversions between subtypes when passed an ``IntervalDtype`` (:issue:`19197`) - :class:`IntervalIndex` and its associated constructor methods (``from_arrays``, ``from_breaks``, ``from_tuples``) have gained a ``dtype`` parameter (:issue:`19262`) - Added :func:`SeriesGroupBy.is_monotonic_increasing` and :func:`SeriesGroupBy.is_monotonic_decreasing` (:issue:`17015`) +- For subclassed ``DataFrames``, :func:`DataFrame.apply` will now preserve the ``Series`` subclass (if defined) when passing the data to the applied function (:issue:`19822`) - :func:`DataFrame.from_dict` now accepts a ``columns`` argument that can be used to specify the column names when ``orient='index'`` is used (:issue:`18529`) + .. 
_whatsnew_0230.api_breaking: Backwards incompatible API changes diff --git a/pandas/core/apply.py b/pandas/core/apply.py index c65943fbbb201..9056f78ee02ed 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -162,7 +162,7 @@ def apply_empty_result(self): pass if reduce: - return Series(np.nan, index=self.agg_axis) + return self.obj._constructor_sliced(np.nan, index=self.agg_axis) else: return self.obj.copy() @@ -175,11 +175,13 @@ def apply_raw(self): result = np.apply_along_axis(self.f, self.axis, self.values) # TODO: mixed type case - from pandas import DataFrame, Series if result.ndim == 2: - return DataFrame(result, index=self.index, columns=self.columns) + return self.obj._constructor(result, + index=self.index, + columns=self.columns) else: - return Series(result, index=self.agg_axis) + return self.obj._constructor_sliced(result, + index=self.agg_axis) def apply_broadcast(self, target): result_values = np.empty_like(target.values) @@ -232,7 +234,7 @@ def apply_standard(self): axis=self.axis, dummy=dummy, labels=labels) - return Series(result, index=labels) + return self.obj._constructor_sliced(result, index=labels) except Exception: pass @@ -291,8 +293,7 @@ def wrap_results(self): return self.wrap_results_for_axis() # dict of scalars - from pandas import Series - result = Series(results) + result = self.obj._constructor_sliced(results) result.index = self.res_index return result @@ -379,7 +380,6 @@ def wrap_results_for_axis(self): # we have a non-series and don't want inference elif not isinstance(results[0], ABCSeries): from pandas import Series - result = Series(results) result.index = self.res_index diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index c52b512c2930a..caaa311e9ee96 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -514,3 +514,59 @@ def test_subclassed_wide_to_long(self): long_frame = pd.wide_to_long(df, ["A", "B"], i="id", j="year") tm.assert_frame_equal(long_frame, expected) + + def test_subclassed_apply(self): + # GH 19822 + + def check_row_subclass(row): + assert isinstance(row, tm.SubclassedSeries) + + def strech(row): + if row["variable"] == "height": + row["value"] += 0.5 + return row + + df = tm.SubclassedDataFrame([ + ['John', 'Doe', 'height', 5.5], + ['Mary', 'Bo', 'height', 6.0], + ['John', 'Doe', 'weight', 130], + ['Mary', 'Bo', 'weight', 150]], + columns=['first', 'last', 'variable', 'value']) + + df.apply(lambda x: check_row_subclass(x)) + df.apply(lambda x: check_row_subclass(x), axis=1) + + expected = tm.SubclassedDataFrame([ + ['John', 'Doe', 'height', 6.0], + ['Mary', 'Bo', 'height', 6.5], + ['John', 'Doe', 'weight', 130], + ['Mary', 'Bo', 'weight', 150]], + columns=['first', 'last', 'variable', 'value']) + + result = df.apply(lambda x: strech(x), axis=1) + assert isinstance(result, tm.SubclassedDataFrame) + tm.assert_frame_equal(result, expected) + + expected = tm.SubclassedDataFrame([ + [1, 2, 3], + [1, 2, 3], + [1, 2, 3], + [1, 2, 3]]) + + result = df.apply(lambda x: tm.SubclassedSeries([1, 2, 3]), axis=1) + assert isinstance(result, tm.SubclassedDataFrame) + tm.assert_frame_equal(result, expected) + + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand") + assert isinstance(result, tm.SubclassedDataFrame) + tm.assert_frame_equal(result, expected) + + expected = tm.SubclassedSeries([ + [1, 2, 3], + [1, 2, 3], + [1, 2, 3], + [1, 2, 3]]) + + result = df.apply(lambda x: [1, 2, 3], axis=1) + assert not isinstance(result, 
tm.SubclassedDataFrame) + tm.assert_series_equal(result, expected) From e362281fc605cd2a8c3cbb64e4edb9388ce0515e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 24 Feb 2018 09:19:19 -0600 Subject: [PATCH 190/214] REF: Base class for all extension tests (#19863) --- ci/lint.sh | 9 +++++++ pandas/tests/extension/base/__init__.py | 8 ++++++ pandas/tests/extension/base/base.py | 6 +++++ pandas/tests/extension/base/casting.py | 4 ++- pandas/tests/extension/base/constructors.py | 4 ++- pandas/tests/extension/base/dtype.py | 4 ++- pandas/tests/extension/base/getitem.py | 29 +++++++++++---------- pandas/tests/extension/base/interface.py | 4 ++- pandas/tests/extension/base/methods.py | 9 ++++--- pandas/tests/extension/base/missing.py | 14 +++++----- pandas/tests/extension/base/reshaping.py | 17 ++++++------ 11 files changed, 72 insertions(+), 36 deletions(-) create mode 100644 pandas/tests/extension/base/base.py diff --git a/ci/lint.sh b/ci/lint.sh index fcd65fc5aba5e..545ac9c90c5c1 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -111,6 +111,15 @@ if [ "$LINT" ]; then RET=1 fi + # Check for the following code in the extension array base tests + # tm.assert_frame_equal + # tm.assert_series_equal + grep -r -E --include '*.py' --exclude base.py 'tm.assert_(series|frame)_equal' pandas/tests/extension/base + + if [ $? = "0" ]; then + RET=1 + fi + echo "Check for invalid testing DONE" # Check for imports from pandas.core.common instead diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index 2273ef1f3e110..27c106efd0524 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -31,6 +31,14 @@ class TestMyDtype(BaseDtypeTests): Your class ``TestDtype`` will inherit all the tests defined on ``BaseDtypeTests``. pytest's fixture discover will supply your ``dtype`` wherever the test requires it. You're free to implement additional tests. + +All the tests in these modules use ``self.assert_frame_equal`` or +``self.assert_series_equal`` for dataframe or series comparisons. By default, +they use the usual ``pandas.testing.assert_frame_equal`` and +``pandas.testing.assert_series_equal``. You can override the checks used +by defining the staticmethods ``assert_frame_equal`` and +``assert_series_equal`` on your base test class. 
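
As a sketch of the override hook described just above (none of the class or fixture names below are part of this patch; they are invented for illustration), a downstream ExtensionArray test suite could relax dtype checking for every inherited test by redefining the two staticmethods on a shared base class:

    import pandas.util.testing as tm
    from pandas.tests.extension import base


    class MyExtensionTestsBase(object):
        # Hypothetical shared mixin for a third-party ExtensionArray's tests.
        # Redefining the staticmethods changes how every inherited base test
        # compares Series/DataFrame results.
        @staticmethod
        def assert_series_equal(left, right, *args, **kwargs):
            kwargs.setdefault('check_dtype', False)
            tm.assert_series_equal(left, right, *args, **kwargs)

        @staticmethod
        def assert_frame_equal(left, right, *args, **kwargs):
            kwargs.setdefault('check_dtype', False)
            tm.assert_frame_equal(left, right, *args, **kwargs)


    class TestMyCasting(MyExtensionTestsBase, base.BaseCastingTests):
        pass

Because ``MyExtensionTestsBase`` comes first in the MRO, its staticmethods shadow the defaults that ``BaseExtensionTests`` provides, so every check in ``BaseCastingTests`` runs with the relaxed comparison.
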
+ """ from .casting import BaseCastingTests # noqa from .constructors import BaseConstructorsTests # noqa diff --git a/pandas/tests/extension/base/base.py b/pandas/tests/extension/base/base.py new file mode 100644 index 0000000000000..d29587e635ebd --- /dev/null +++ b/pandas/tests/extension/base/base.py @@ -0,0 +1,6 @@ +import pandas.util.testing as tm + + +class BaseExtensionTests(object): + assert_series_equal = staticmethod(tm.assert_series_equal) + assert_frame_equal = staticmethod(tm.assert_frame_equal) diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index bcfbf0a247269..adc690939b36c 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -1,8 +1,10 @@ import pandas as pd from pandas.core.internals import ObjectBlock +from .base import BaseExtensionTests -class BaseCastingTests(object): + +class BaseCastingTests(BaseExtensionTests): """Casting to and from ExtensionDtypes""" def test_astype_object_series(self, all_data): diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py index 7ad100e6289e9..2d5d747aec5a7 100644 --- a/pandas/tests/extension/base/constructors.py +++ b/pandas/tests/extension/base/constructors.py @@ -4,8 +4,10 @@ import pandas.util.testing as tm from pandas.core.internals import ExtensionBlock +from .base import BaseExtensionTests -class BaseConstructorsTests(object): + +class BaseConstructorsTests(BaseExtensionTests): def test_series_constructor(self, data): result = pd.Series(data) diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index f5015bd469f13..63d3d807c270c 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -1,8 +1,10 @@ import numpy as np import pandas as pd +from .base import BaseExtensionTests -class BaseDtypeTests(object): + +class BaseDtypeTests(BaseExtensionTests): """Base class for ExtensionDtype classes""" def test_name(self, dtype): diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index f43971e928cac..31ed8b9e01225 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -1,20 +1,21 @@ import numpy as np import pandas as pd -import pandas.util.testing as tm +from .base import BaseExtensionTests -class BaseGetitemTests(object): + +class BaseGetitemTests(BaseExtensionTests): """Tests for ExtensionArray.__getitem__.""" def test_iloc_series(self, data): ser = pd.Series(data) result = ser.iloc[:4] expected = pd.Series(data[:4]) - tm.assert_series_equal(result, expected) + self.assert_series_equal(result, expected) result = ser.iloc[[0, 1, 2, 3]] - tm.assert_series_equal(result, expected) + self.assert_series_equal(result, expected) def test_iloc_frame(self, data): df = pd.DataFrame({"A": data, 'B': np.arange(len(data))}) @@ -22,30 +23,30 @@ def test_iloc_frame(self, data): # slice -> frame result = df.iloc[:4, [0]] - tm.assert_frame_equal(result, expected) + self.assert_frame_equal(result, expected) # sequence -> frame result = df.iloc[[0, 1, 2, 3], [0]] - tm.assert_frame_equal(result, expected) + self.assert_frame_equal(result, expected) expected = pd.Series(data[:4], name='A') # slice -> series result = df.iloc[:4, 0] - tm.assert_series_equal(result, expected) + self.assert_series_equal(result, expected) # sequence -> series result = df.iloc[:4, 0] - tm.assert_series_equal(result, expected) + self.assert_series_equal(result, expected) def 
test_loc_series(self, data): ser = pd.Series(data) result = ser.loc[:3] expected = pd.Series(data[:4]) - tm.assert_series_equal(result, expected) + self.assert_series_equal(result, expected) result = ser.loc[[0, 1, 2, 3]] - tm.assert_series_equal(result, expected) + self.assert_series_equal(result, expected) def test_loc_frame(self, data): df = pd.DataFrame({"A": data, 'B': np.arange(len(data))}) @@ -53,21 +54,21 @@ def test_loc_frame(self, data): # slice -> frame result = df.loc[:3, ['A']] - tm.assert_frame_equal(result, expected) + self.assert_frame_equal(result, expected) # sequence -> frame result = df.loc[[0, 1, 2, 3], ['A']] - tm.assert_frame_equal(result, expected) + self.assert_frame_equal(result, expected) expected = pd.Series(data[:4], name='A') # slice -> series result = df.loc[:3, 'A'] - tm.assert_series_equal(result, expected) + self.assert_series_equal(result, expected) # sequence -> series result = df.loc[:3, 'A'] - tm.assert_series_equal(result, expected) + self.assert_series_equal(result, expected) def test_getitem_scalar(self, data): result = data[0] diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 8f17131a9482b..e1596f0675f32 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -5,8 +5,10 @@ from pandas.core.dtypes.common import is_extension_array_dtype from pandas.core.dtypes.dtypes import ExtensionDtype +from .base import BaseExtensionTests -class BaseInterfaceTests(object): + +class BaseInterfaceTests(BaseExtensionTests): """Tests that the basic interface is satisfied.""" # ------------------------------------------------------------------------ # Interface diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index c77811ca63926..74e5d180b1aa3 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -2,10 +2,11 @@ import numpy as np import pandas as pd -import pandas.util.testing as tm +from .base import BaseExtensionTests -class BaseMethodsTests(object): + +class BaseMethodsTests(BaseExtensionTests): """Various Series and DataFrame methods.""" @pytest.mark.parametrize('dropna', [True, False]) @@ -19,13 +20,13 @@ def test_value_counts(self, all_data, dropna): result = pd.Series(all_data).value_counts(dropna=dropna).sort_index() expected = pd.Series(other).value_counts(dropna=dropna).sort_index() - tm.assert_series_equal(result, expected) + self.assert_series_equal(result, expected) def test_count(self, data_missing): df = pd.DataFrame({"A": data_missing}) result = df.count(axis='columns') expected = pd.Series([0, 1]) - tm.assert_series_equal(result, expected) + self.assert_series_equal(result, expected) def test_apply_simple_series(self, data): result = pd.Series(data).apply(id) diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 1d6f2eea1f1f9..3ae82fa1ca432 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -3,8 +3,10 @@ import pandas as pd import pandas.util.testing as tm +from .base import BaseExtensionTests -class BaseMissingTests(object): + +class BaseMissingTests(BaseExtensionTests): def test_isna(self, data_missing): if data_missing._can_hold_na: expected = np.array([True, False]) @@ -16,13 +18,13 @@ def test_isna(self, data_missing): result = pd.Series(data_missing).isna() expected = pd.Series(expected) - tm.assert_series_equal(result, expected) + self.assert_series_equal(result, 
expected) def test_dropna_series(self, data_missing): ser = pd.Series(data_missing) result = ser.dropna() expected = ser.iloc[[1]] - tm.assert_series_equal(result, expected) + self.assert_series_equal(result, expected) def test_dropna_frame(self, data_missing): df = pd.DataFrame({"A": data_missing}) @@ -30,16 +32,16 @@ def test_dropna_frame(self, data_missing): # defaults result = df.dropna() expected = df.iloc[[1]] - tm.assert_frame_equal(result, expected) + self.assert_frame_equal(result, expected) # axis = 1 result = df.dropna(axis='columns') expected = pd.DataFrame(index=[0, 1]) - tm.assert_frame_equal(result, expected) + self.assert_frame_equal(result, expected) # multiple df = pd.DataFrame({"A": data_missing, "B": [1, np.nan]}) result = df.dropna() expected = df.iloc[:0] - tm.assert_frame_equal(result, expected) + self.assert_frame_equal(result, expected) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index d8f577c6fa50d..cfb70f2291555 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -1,11 +1,12 @@ import pytest import pandas as pd -import pandas.util.testing as tm from pandas.core.internals import ExtensionBlock +from .base import BaseExtensionTests -class BaseReshapingTests(object): + +class BaseReshapingTests(BaseExtensionTests): """Tests for reshaping and concatenation.""" @pytest.mark.parametrize('in_frame', [True, False]) def test_concat(self, data, in_frame): @@ -32,8 +33,8 @@ def test_align(self, data, na_value): # Assumes that the ctor can take a list of scalars of the type e1 = pd.Series(type(data)(list(a) + [na_value])) e2 = pd.Series(type(data)([na_value] + list(b))) - tm.assert_series_equal(r1, e1) - tm.assert_series_equal(r2, e2) + self.assert_series_equal(r1, e1) + self.assert_series_equal(r2, e2) def test_align_frame(self, data, na_value): a = data[:3] @@ -45,17 +46,17 @@ def test_align_frame(self, data, na_value): # Assumes that the ctor can take a list of scalars of the type e1 = pd.DataFrame({'A': type(data)(list(a) + [na_value])}) e2 = pd.DataFrame({'A': type(data)([na_value] + list(b))}) - tm.assert_frame_equal(r1, e1) - tm.assert_frame_equal(r2, e2) + self.assert_frame_equal(r1, e1) + self.assert_frame_equal(r2, e2) def test_set_frame_expand_regular_with_extension(self, data): df = pd.DataFrame({"A": [1] * len(data)}) df['B'] = data expected = pd.DataFrame({"A": [1] * len(data), "B": data}) - tm.assert_frame_equal(df, expected) + self.assert_frame_equal(df, expected) def test_set_frame_expand_extension_with_regular(self, data): df = pd.DataFrame({'A': data}) df['B'] = [1] * len(data) expected = pd.DataFrame({"A": data, "B": [1] * len(data)}) - tm.assert_frame_equal(df, expected) + self.assert_frame_equal(df, expected) From e52a059f306bd0a7604ff9104b13de0df897dbc4 Mon Sep 17 00:00:00 2001 From: Tommy <10076072+tommyod@users.noreply.github.com> Date: Sat, 24 Feb 2018 17:31:12 +0100 Subject: [PATCH 191/214] DOC: Updated links to 2 tutorials in tutorials.rst (#19857) --- doc/source/tutorials.rst | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/doc/source/tutorials.rst b/doc/source/tutorials.rst index db9385519bff2..0398e2892cef5 100644 --- a/doc/source/tutorials.rst +++ b/doc/source/tutorials.rst @@ -26,32 +26,34 @@ repository `_. To run the examples in th clone the GitHub repository and get IPython Notebook running. See `How to use this cookbook `_. 
-- `A quick tour of the IPython Notebook: `_ +- `A quick tour of the IPython Notebook: `_ Shows off IPython's awesome tab completion and magic functions. -- `Chapter 1: `_ +- `Chapter 1: `_ Reading your data into pandas is pretty much the easiest thing. Even when the encoding is wrong! -- `Chapter 2: `_ +- `Chapter 2: `_ It's not totally obvious how to select data from a pandas dataframe. Here we explain the basics (how to take slices and get columns) -- `Chapter 3: `_ +- `Chapter 3: `_ Here we get into serious slicing and dicing and learn how to filter dataframes in complicated ways, really fast. -- `Chapter 4: `_ +- `Chapter 4: `_ Groupby/aggregate is seriously my favorite thing about pandas and I use it all the time. You should probably read this. -- `Chapter 5: `_ +- `Chapter 5: `_ Here you get to find out if it's cold in Montreal in the winter (spoiler: yes). Web scraping with pandas is fun! Here we combine dataframes. -- `Chapter 6: `_ +- `Chapter 6: `_ Strings with pandas are great. It has all these vectorized string operations and they're the best. We will turn a bunch of strings containing "Snow" into vectors of numbers in a trice. -- `Chapter 7: `_ +- `Chapter 7: `_ Cleaning up messy data is never a joy, but with pandas it's easier. -- `Chapter 8: `_ +- `Chapter 8: `_ Parsing Unix timestamps is confusing at first but it turns out to be really easy. +- `Chapter 9: `_ + Reading data from SQL databases. Lessons for new pandas users From feedf66ace23d884973420785e77affe2451b0bc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 24 Feb 2018 08:38:22 -0800 Subject: [PATCH 192/214] templatize timedelta arith ops (#19871) --- pandas/_libs/tslibs/timedeltas.pyx | 47 +++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index c4578a289b020..7aeff9bec75b5 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -471,9 +471,12 @@ def _binary_op_method_timedeltalike(op, name): # define a binary operation that only works if the other argument is # timedelta like or an array of timedeltalike def f(self, other): - if hasattr(other, 'delta') and not PyDelta_Check(other): - # offsets.Tick - return op(self, other.delta) + if hasattr(other, '_typ'): + # Series, DataFrame, ... + if other._typ == 'dateoffset' and hasattr(other, 'delta'): + # Tick offset + return op(self, other.delta) + return NotImplemented elif other is NaT: return NaT @@ -1052,7 +1055,14 @@ class Timedelta(_Timedelta): __rsub__ = _binary_op_method_timedeltalike(lambda x, y: y - x, '__rsub__') def __mul__(self, other): - if hasattr(other, 'dtype'): + if hasattr(other, '_typ'): + # Series, DataFrame, ... + if other._typ == 'dateoffset' and hasattr(other, 'delta'): + # Tick offset; this op will raise TypeError + return other.delta * self + return NotImplemented + + elif hasattr(other, 'dtype'): # ndarray-like return other * self.to_timedelta64() @@ -1068,7 +1078,18 @@ class Timedelta(_Timedelta): __rmul__ = __mul__ def __truediv__(self, other): - if hasattr(other, 'dtype'): + if hasattr(other, '_typ'): + # Series, DataFrame, ... 
+ if other._typ == 'dateoffset' and hasattr(other, 'delta'): + # Tick offset + return self / other.delta + return NotImplemented + + elif is_timedelta64_object(other): + # convert to Timedelta below + pass + + elif hasattr(other, 'dtype'): return self.to_timedelta64() / other elif is_integer_object(other) or is_float_object(other): @@ -1084,7 +1105,18 @@ class Timedelta(_Timedelta): return self.value / float(other.value) def __rtruediv__(self, other): - if hasattr(other, 'dtype'): + if hasattr(other, '_typ'): + # Series, DataFrame, ... + if other._typ == 'dateoffset' and hasattr(other, 'delta'): + # Tick offset + return other.delta / self + return NotImplemented + + elif is_timedelta64_object(other): + # convert to Timedelta below + pass + + elif hasattr(other, 'dtype'): return other / self.to_timedelta64() elif not _validate_ops_compat(other): @@ -1160,9 +1192,10 @@ class Timedelta(_Timedelta): '{op}'.format(dtype=other.dtype, op='__floordiv__')) - if is_float_object(other) and util._checknull(other): + elif is_float_object(other) and util._checknull(other): # i.e. np.nan return NotImplemented + elif not _validate_ops_compat(other): return NotImplemented From e97be6fd1b7f8961b2917a5a210b0744d0a33027 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 24 Feb 2018 12:21:03 -0500 Subject: [PATCH 193/214] COMPAT: fixup decimal extension for indexing compat (#19882) --- pandas/tests/extension/base/getitem.py | 6 ++++-- pandas/tests/extension/decimal/array.py | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 31ed8b9e01225..566ba1721d13c 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -18,7 +18,8 @@ def test_iloc_series(self, data): self.assert_series_equal(result, expected) def test_iloc_frame(self, data): - df = pd.DataFrame({"A": data, 'B': np.arange(len(data))}) + df = pd.DataFrame({"A": data, 'B': + np.arange(len(data), dtype='int64')}) expected = pd.DataFrame({"A": data[:4]}) # slice -> frame @@ -49,7 +50,8 @@ def test_loc_series(self, data): self.assert_series_equal(result, expected) def test_loc_frame(self, data): - df = pd.DataFrame({"A": data, 'B': np.arange(len(data))}) + df = pd.DataFrame({"A": data, + 'B': np.arange(len(data), dtype='int64')}) expected = pd.DataFrame({"A": data[:4]}) # slice -> frame diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index f526ac5996a10..8b2eaadeca99e 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -8,6 +8,7 @@ import pandas as pd from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.common import _ensure_platform_int class DecimalDtype(ExtensionDtype): @@ -68,6 +69,7 @@ def isna(self): def take(self, indexer, allow_fill=True, fill_value=None): mask = indexer == -1 + indexer = _ensure_platform_int(indexer) out = self.values.take(indexer) out[mask] = self._na_value From 10cc8f4c5f3bfc446d195a6b695ab3cd177b50ad Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 24 Feb 2018 20:44:50 -0500 Subject: [PATCH 194/214] CI: pin jemalloc=4.5.0.poast for 2.7 build per (#19888) https://issues.apache.org/jira/browse/ARROW-2208 --- ci/requirements-2.7.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements-2.7.sh b/ci/requirements-2.7.sh index e3bd5e46026c5..95169e5dcce57 100644 --- a/ci/requirements-2.7.sh +++ 
b/ci/requirements-2.7.sh @@ -4,4 +4,4 @@ source activate pandas echo "install 27" -conda install -n pandas -c conda-forge feather-format pyarrow=0.4.1 fastparquet +conda install -n pandas -c conda-forge feather-format pyarrow=0.4.1 jemalloc=4.5.0.post fastparquet From d87ca1c723154b09a005f865a06a38d4bb82917c Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sun, 25 Feb 2018 08:05:26 -0800 Subject: [PATCH 195/214] Cythonized GroupBy Fill (#19673) --- asv_bench/benchmarks/groupby.py | 10 +- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/_libs/groupby.pyx | 216 +++++++++++++++++++++++++++ pandas/_libs/groupby_helper.pxi.in | 163 -------------------- pandas/core/groupby.py | 102 ++++++++++--- pandas/tests/groupby/test_groupby.py | 55 +++++++ 6 files changed, 362 insertions(+), 185 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 61db39528a5fb..c347442784d41 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -370,11 +370,11 @@ class GroupByMethods(object): param_names = ['dtype', 'method'] params = [['int', 'float'], - ['all', 'any', 'count', 'cumcount', 'cummax', 'cummin', - 'cumprod', 'cumsum', 'describe', 'first', 'head', 'last', 'mad', - 'max', 'min', 'median', 'mean', 'nunique', 'pct_change', 'prod', - 'rank', 'sem', 'shift', 'size', 'skew', 'std', 'sum', 'tail', - 'unique', 'value_counts', 'var']] + ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin', + 'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head', + 'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique', + 'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew', + 'std', 'sum', 'tail', 'unique', 'value_counts', 'var']] def setup(self, dtype, method): ngroups = 1000 diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index fd3c3a5a7a301..fcaf46b1c3d71 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -689,6 +689,7 @@ Performance Improvements - Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`) - Improved performance of :func:`DataFrameGroupBy.rank` (:issue:`15779`) - Improved performance of variable ``.rolling()`` on ``.min()`` and ``.max()`` (:issue:`19521`) +- Improved performance of ``GroupBy.ffill`` and ``GroupBy.bfill`` (:issue:`11296`) .. 
_whatsnew_0230.docs: diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 866683ce378ab..e3d208a915225 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -94,5 +94,221 @@ cdef inline float64_t kth_smallest_c(float64_t* a, return a[k] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_median_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, ngroups, size + ndarray[int64_t] _counts + ndarray data + float64_t* ptr + + assert min_count == -1, "'min_count' only used in add and prod" + + ngroups = len(counts) + N, K = ( values).shape + + indexer, _counts = groupsort_indexer(labels, ngroups) + counts[:] = _counts[1:] + + data = np.empty((K, N), dtype=np.float64) + ptr = data.data + + take_2d_axis1_float64_float64(values.T, indexer, out=data) + + with nogil: + + for i in range(K): + # exclude NA group + ptr += _counts[0] + for j in range(ngroups): + size = _counts[j + 1] + out[j, i] = median_linear(ptr, size) + ptr += size + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cumprod_float64(float64_t[:, :] out, + float64_t[:, :] values, + int64_t[:] labels, + bint is_datetimelike): + """ + Only transforms on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, size + float64_t val + float64_t[:, :] accum + int64_t lab + + N, K = ( values).shape + accum = np.ones_like(values) + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + if val == val: + accum[lab, j] *= val + out[i, j] = accum[lab, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cumsum(numeric[:, :] out, + numeric[:, :] values, + int64_t[:] labels, + is_datetimelike): + """ + Only transforms on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, size + numeric val + numeric[:, :] accum + int64_t lab + + N, K = ( values).shape + accum = np.zeros_like(values) + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + + if numeric == float32_t or numeric == float64_t: + if val == val: + accum[lab, j] += val + out[i, j] = accum[lab, j] + else: + accum[lab, j] += val + out[i, j] = accum[lab, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_shift_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, + int ngroups, int periods): + cdef: + Py_ssize_t N, i, j, ii + int offset, sign + int64_t lab, idxer, idxer_slot + int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64) + int64_t[:, :] label_indexer + + N, = ( labels).shape + + if periods < 0: + periods = -periods + offset = N - 1 + sign = -1 + elif periods > 0: + offset = 0 + sign = 1 + + if periods == 0: + with nogil: + for i in range(N): + out[i] = i + else: + # array of each previous indexer seen + label_indexer = np.zeros((ngroups, periods), dtype=np.int64) + with nogil: + for i in range(N): + ## reverse iterator if shifting backwards + ii = offset + sign * i + lab = labels[ii] + + # Skip null keys + if lab == -1: + out[ii] = -1 + continue + + label_seen[lab] += 1 + + idxer_slot = label_seen[lab] % periods + idxer = label_indexer[lab, idxer_slot] + + if label_seen[lab] > periods: + out[ii] = idxer + else: + out[ii] = -1 + + label_indexer[lab, idxer_slot] = ii + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_fillna_indexer(ndarray[int64_t] out, 
ndarray[int64_t] labels, + ndarray[uint8_t] mask, object direction, + int64_t limit): + """Indexes how to fill values forwards or backwards within a group + + Parameters + ---------- + out : array of int64_t values which this method will write its results to + Missing values will be written to with a value of -1 + labels : array containing unique label for each group, with its ordering + matching up to the corresponding record in `values` + mask : array of int64_t values where a 1 indicates a missing value + direction : {'ffill', 'bfill'} + Direction for fill to be applied (forwards or backwards, respectively) + limit : Consecutive values to fill before stopping, or -1 for no limit + + Notes + ----- + This method modifies the `out` parameter rather than returning an object + """ + cdef: + Py_ssize_t i, N + ndarray[int64_t] sorted_labels + int64_t idx, curr_fill_idx=-1, filled_vals=0 + + N = len(out) + + # Make sure all arrays are the same size + assert N == len(labels) == len(mask) + + sorted_labels = np.argsort(labels).astype(np.int64, copy=False) + if direction == 'bfill': + sorted_labels = sorted_labels[::-1] + + with nogil: + for i in range(N): + idx = sorted_labels[i] + if mask[idx] == 1: # is missing + # Stop filling once we've hit the limit + if filled_vals >= limit and limit != -1: + curr_fill_idx = -1 + filled_vals += 1 + else: # reset items when not missing + filled_vals = 0 + curr_fill_idx = idx + + out[idx] = curr_fill_idx + + # If we move to the next group, reset + # the fill_idx and counter + if i == N - 1 or labels[idx] != labels[sorted_labels[i+1]]: + curr_fill_idx = -1 + filled_vals = 0 + + # generated from template include "groupby_helper.pxi" diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index e03e3af65755b..de802f4a72277 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -791,166 +791,3 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, out[i, j] = mval {{endfor}} - -#---------------------------------------------------------------------- -# other grouping functions not needing a template -#---------------------------------------------------------------------- - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_median_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, ngroups, size - ndarray[int64_t] _counts - ndarray data - float64_t* ptr - - assert min_count == -1, "'min_count' only used in add and prod" - - ngroups = len(counts) - N, K = ( values).shape - - indexer, _counts = groupsort_indexer(labels, ngroups) - counts[:] = _counts[1:] - - data = np.empty((K, N), dtype=np.float64) - ptr = data.data - - take_2d_axis1_float64_float64(values.T, indexer, out=data) - - with nogil: - - for i in range(K): - # exclude NA group - ptr += _counts[0] - for j in range(ngroups): - size = _counts[j + 1] - out[j, i] = median_linear(ptr, size) - ptr += size - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cumprod_float64(float64_t[:, :] out, - float64_t[:, :] values, - int64_t[:] labels, - bint is_datetimelike): - """ - Only transforms on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, size - float64_t val - float64_t[:, :] accum - int64_t lab - - N, K = ( values).shape - accum = np.ones_like(values) - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - 
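
For context on what the ``group_fillna_indexer`` routine above enables at the user level: it is the engine behind ``GroupBy.ffill`` / ``GroupBy.bfill``, which this patch reimplements in Cython. Missing values are filled only within each group, and ``limit`` caps the number of consecutive fills. A minimal sketch (the frame below is invented for illustration, not taken from the patch):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'],
                       'val': [1.0, np.nan, np.nan, np.nan, 2.0, np.nan]})

    # Forward fill within each group; the leading NaN of group 'b' is not
    # filled from group 'a'.
    df.groupby('key')['val'].ffill()
    # 0    1.0
    # 1    1.0
    # 2    1.0
    # 3    NaN
    # 4    2.0
    # 5    2.0

    # limit=1 allows at most one consecutive fill per run of missing values.
    df.groupby('key')['val'].ffill(limit=1)
    # 0    1.0
    # 1    1.0
    # 2    NaN
    # 3    NaN
    # 4    2.0
    # 5    2.0
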
continue - for j in range(K): - val = values[i, j] - if val == val: - accum[lab, j] *= val - out[i, j] = accum[lab, j] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cumsum(numeric[:, :] out, - numeric[:, :] values, - int64_t[:] labels, - is_datetimelike): - """ - Only transforms on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, size - numeric val - numeric[:, :] accum - int64_t lab - - N, K = ( values).shape - accum = np.zeros_like(values) - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i, j] - - if numeric == float32_t or numeric == float64_t: - if val == val: - accum[lab, j] += val - out[i, j] = accum[lab, j] - else: - accum[lab, j] += val - out[i, j] = accum[lab, j] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_shift_indexer(int64_t[:] out, int64_t[:] labels, - int ngroups, int periods): - cdef: - Py_ssize_t N, i, j, ii - int offset, sign - int64_t lab, idxer, idxer_slot - int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64) - int64_t[:, :] label_indexer - - N, = ( labels).shape - - if periods < 0: - periods = -periods - offset = N - 1 - sign = -1 - elif periods > 0: - offset = 0 - sign = 1 - - if periods == 0: - with nogil: - for i in range(N): - out[i] = i - else: - # array of each previous indexer seen - label_indexer = np.zeros((ngroups, periods), dtype=np.int64) - with nogil: - for i in range(N): - ## reverse iterator if shifting backwards - ii = offset + sign * i - lab = labels[ii] - - # Skip null keys - if lab == -1: - out[ii] = -1 - continue - - label_seen[lab] += 1 - - idxer_slot = label_seen[lab] % periods - idxer = label_indexer[lab, idxer_slot] - - if label_seen[lab] > periods: - out[ii] = idxer - else: - out[ii] = -1 - - label_indexer[lab, idxer_slot] = ii diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index b1615f720368d..852ad04cd8a2e 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1,5 +1,5 @@ import types -from functools import wraps +from functools import wraps, partial import numpy as np import datetime import collections @@ -38,7 +38,7 @@ _ensure_float) from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.generic import ABCSeries -from pandas.core.dtypes.missing import isna, notna, _maybe_fill +from pandas.core.dtypes.missing import isna, isnull, notna, _maybe_fill from pandas.core.base import (PandasObject, SelectionMixin, GroupByError, DataError, SpecificationError) @@ -1457,6 +1457,36 @@ def expanding(self, *args, **kwargs): from pandas.core.window import ExpandingGroupby return ExpandingGroupby(self, *args, **kwargs) + def _fill(self, direction, limit=None): + """Shared function for `pad` and `backfill` to call Cython method + + Parameters + ---------- + direction : {'ffill', 'bfill'} + Direction passed to underlying Cython function. `bfill` will cause + values to be filled backwards. `ffill` and any other values will + default to a forward fill + limit : int, default None + Maximum number of consecutive values to fill. 
If `None`, this + method will convert to -1 prior to passing to Cython + + Returns + ------- + `Series` or `DataFrame` with filled values + + See Also + -------- + pad + backfill + """ + # Need int value for Cython + if limit is None: + limit = -1 + + return self._get_cythonized_result('group_fillna_indexer', + self.grouper, needs_mask=True, + direction=direction, limit=limit) + @Substitution(name='groupby') def pad(self, limit=None): """ @@ -1474,7 +1504,7 @@ def pad(self, limit=None): Series.fillna DataFrame.fillna """ - return self.apply(lambda x: x.ffill(limit=limit)) + return self._fill('ffill', limit=limit) ffill = pad @Substitution(name='groupby') @@ -1494,7 +1524,7 @@ def backfill(self, limit=None): Series.fillna DataFrame.fillna """ - return self.apply(lambda x: x.bfill(limit=limit)) + return self._fill('bfill', limit=limit) bfill = backfill @Substitution(name='groupby') @@ -1843,6 +1873,45 @@ def cummax(self, axis=0, **kwargs): return self._cython_transform('cummax', numeric_only=False) + def _get_cythonized_result(self, how, grouper, needs_mask=False, + needs_ngroups=False, **kwargs): + """Get result for Cythonized functions + + Parameters + ---------- + how : str, Cythonized function name to be called + grouper : Grouper object containing pertinent group info + needs_mask : bool, default False + Whether boolean mask needs to be part of the Cython call signature + needs_ngroups : bool, default False + Whether number of groups part of the Cython call signature + **kwargs : dict + Extra arguments to be passed back to Cython funcs + + Returns + ------- + `Series` or `DataFrame` with filled values + """ + + labels, _, ngroups = grouper.group_info + output = collections.OrderedDict() + base_func = getattr(libgroupby, how) + + for name, obj in self._iterate_slices(): + indexer = np.zeros_like(labels, dtype=np.int64) + func = partial(base_func, indexer, labels) + if needs_mask: + mask = isnull(obj.values).view(np.uint8) + func = partial(func, mask) + + if needs_ngroups: + func = partial(func, ngroups) + + func(**kwargs) # Call func to modify indexer values in place + output[name] = algorithms.take_nd(obj.values, indexer) + + return self._wrap_transformed_output(output) + @Substitution(name='groupby') @Appender(_doc_template) def shift(self, periods=1, freq=None, axis=0): @@ -1860,17 +1929,9 @@ def shift(self, periods=1, freq=None, axis=0): if freq is not None or axis != 0: return self.apply(lambda x: x.shift(periods, freq, axis)) - labels, _, ngroups = self.grouper.group_info - - # filled in by Cython - indexer = np.zeros_like(labels) - libgroupby.group_shift_indexer(indexer, labels, ngroups, periods) - - output = {} - for name, obj in self._iterate_slices(): - output[name] = algorithms.take_nd(obj.values, indexer) - - return self._wrap_transformed_output(output) + return self._get_cythonized_result('group_shift_indexer', + self.grouper, needs_ngroups=True, + periods=periods) @Substitution(name='groupby') @Appender(_doc_template) @@ -3577,7 +3638,6 @@ def describe(self, **kwargs): def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True): - from functools import partial from pandas.core.reshape.tile import cut from pandas.core.reshape.merge import _get_join_indexers @@ -4585,9 +4645,17 @@ def _apply_to_column_groupbys(self, func): in self._iterate_column_groupbys()), keys=self._selected_obj.columns, axis=1) + def _fill(self, direction, limit=None): + """Overriden method to join grouped columns in output""" + res = super(DataFrameGroupBy, 
self)._fill(direction, limit=limit) + output = collections.OrderedDict( + (grp.name, grp.grouper) for grp in self.grouper.groupings) + + from pandas import concat + return concat((self._wrap_transformed_output(output), res), axis=1) + def count(self): """ Compute count of group, excluding missing values """ - from functools import partial from pandas.core.dtypes.missing import _isna_ndarraylike as isna data, _ = self._get_data_to_aggregate() diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 129ac6b06205c..2429e9975fc8e 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2061,6 +2061,61 @@ def test_rank_object_raises(self, ties_method, ascending, na_option, ascending=ascending, na_option=na_option, pct=pct) + @pytest.mark.parametrize("mix_groupings", [True, False]) + @pytest.mark.parametrize("as_series", [True, False]) + @pytest.mark.parametrize("val1,val2", [ + ('foo', 'bar'), (1, 2), (1., 2.)]) + @pytest.mark.parametrize("fill_method,limit,exp_vals", [ + ("ffill", None, + [np.nan, np.nan, 'val1', 'val1', 'val1', 'val2', 'val2', 'val2']), + ("ffill", 1, + [np.nan, np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan]), + ("bfill", None, + ['val1', 'val1', 'val1', 'val2', 'val2', 'val2', np.nan, np.nan]), + ("bfill", 1, + [np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan, np.nan]) + ]) + def test_group_fill_methods(self, mix_groupings, as_series, val1, val2, + fill_method, limit, exp_vals): + vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan] + _exp_vals = list(exp_vals) + # Overwrite placeholder values + for index, exp_val in enumerate(_exp_vals): + if exp_val == 'val1': + _exp_vals[index] = val1 + elif exp_val == 'val2': + _exp_vals[index] = val2 + + # Need to modify values and expectations depending on the + # Series / DataFrame that we ultimately want to generate + if mix_groupings: # ['a', 'b', 'a, 'b', ...] + keys = ['a', 'b'] * len(vals) + + def interweave(list_obj): + temp = list() + for x in list_obj: + temp.extend([x, x]) + + return temp + + _exp_vals = interweave(_exp_vals) + vals = interweave(vals) + else: # ['a', 'a', 'a', ... 
'b', 'b', 'b'] + keys = ['a'] * len(vals) + ['b'] * len(vals) + _exp_vals = _exp_vals * 2 + vals = vals * 2 + + df = DataFrame({'key': keys, 'val': vals}) + if as_series: + result = getattr( + df.groupby('key')['val'], fill_method)(limit=limit) + exp = Series(_exp_vals, name='val') + assert_series_equal(result, exp) + else: + result = getattr(df.groupby('key'), fill_method)(limit=limit) + exp = DataFrame({'key': keys, 'val': _exp_vals}) + assert_frame_equal(result, exp) + def test_dont_clobber_name_column(self): df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'], 'name': ['foo', 'bar', 'baz'] * 2}) From fb54f40b99ae413c4acee30f1063518672c3e152 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sun, 25 Feb 2018 08:06:59 -0800 Subject: [PATCH 196/214] Fixed pct_change with 'fill_method' returning NaN instead of 0 (#19875) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/generic.py | 2 +- pandas/tests/frame/test_analytics.py | 3 --- pandas/tests/frame/test_timeseries.py | 2 +- pandas/tests/generic/test_generic.py | 20 ++++++++++++++++++++ pandas/tests/generic/test_panel.py | 2 +- pandas/tests/series/test_timeseries.py | 2 +- 7 files changed, 25 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index fcaf46b1c3d71..ba24c93121dcb 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -808,6 +808,7 @@ Numeric - Bug in :class:`Index` constructor with ``dtype='uint64'`` where int-like floats were not coerced to :class:`UInt64Index` (:issue:`18400`) - Bug in :class:`DataFrame` flex arithmetic (e.g. ``df.add(other, fill_value=foo)``) with a ``fill_value`` other than ``None`` failed to raise ``NotImplementedError`` in corner cases where either the frame or ``other`` has length zero (:issue:`19522`) - Multiplication and division of numeric-dtyped :class:`Index` objects with timedelta-like scalars returns ``TimedeltaIndex`` instead of raising ``TypeError`` (:issue:`19333`) +- Bug where ``NaN`` was returned instead of 0 by :func:`Series.pct_change` and :func:`DataFrame.pct_change` when ``fill_method`` is not ``None`` (provided) (:issue:`19873`) Indexing diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 85e2ce475ffa2..e1ed6ae9c8a6c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7488,7 +7488,7 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, **kwargs)) - 1) rs = rs.reindex_like(data) if freq is None: - mask = isna(com._values_from_object(self)) + mask = isna(com._values_from_object(data)) np.putmask(rs.values, mask, np.nan) return rs diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index f2b8387072c8d..de4a132e0d613 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1941,12 +1941,9 @@ def test_pct_change(self): pnl.iat[1, 1] = np.nan pnl.iat[2, 3] = 60 - mask = pnl.isnull() - for axis in range(2): expected = pnl.ffill(axis=axis) / pnl.ffill(axis=axis).shift( axis=axis) - 1 - expected[mask] = np.nan result = pnl.pct_change(axis=axis, fill_method='pad') tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index 25dd285e883a0..9f94439a71a57 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -118,7 +118,7 @@ def test_pct_change_shift_over_nas(self): df = DataFrame({'a': s, 'b': s}) chg = df.pct_change() - expected = Series([np.nan, 0.5, 
np.nan, 2.5 / 1.5 - 1, .2]) + expected = Series([np.nan, 0.5, 0., 2.5 / 1.5 - 1, .2]) edf = DataFrame({'a': expected, 'b': expected}) assert_frame_equal(chg, edf) diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 3868bdf7d4620..311c71f734945 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -592,6 +592,26 @@ def test_copy_and_deepcopy(self): assert obj_copy is not obj self._compare(obj_copy, obj) + @pytest.mark.parametrize("periods,fill_method,limit,exp", [ + (1, "ffill", None, [np.nan, np.nan, np.nan, 1, 1, 1.5, 0, 0]), + (1, "ffill", 1, [np.nan, np.nan, np.nan, 1, 1, 1.5, 0, np.nan]), + (1, "bfill", None, [np.nan, 0, 0, 1, 1, 1.5, np.nan, np.nan]), + (1, "bfill", 1, [np.nan, np.nan, 0, 1, 1, 1.5, np.nan, np.nan]), + (-1, "ffill", None, [np.nan, np.nan, -.5, -.5, -.6, 0, 0, np.nan]), + (-1, "ffill", 1, [np.nan, np.nan, -.5, -.5, -.6, 0, np.nan, np.nan]), + (-1, "bfill", None, [0, 0, -.5, -.5, -.6, np.nan, np.nan, np.nan]), + (-1, "bfill", 1, [np.nan, 0, -.5, -.5, -.6, np.nan, np.nan, np.nan]) + ]) + def test_pct_change(self, periods, fill_method, limit, exp): + vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan] + obj = self._typ(vals) + func = getattr(obj, 'pct_change') + res = func(periods=periods, fill_method=fill_method, limit=limit) + if type(obj) is DataFrame: + tm.assert_frame_equal(res, DataFrame(exp)) + else: + tm.assert_series_equal(res, Series(exp)) + class TestNDFrame(object): # tests that don't fit elsewhere diff --git a/pandas/tests/generic/test_panel.py b/pandas/tests/generic/test_panel.py index 4cbd5cb2aa69f..49cb773a1bd10 100644 --- a/pandas/tests/generic/test_panel.py +++ b/pandas/tests/generic/test_panel.py @@ -45,7 +45,7 @@ def test_to_xarray(self): 'test_stat_non_defaults_args', 'test_truncate_out_of_bounds', 'test_metadata_propagation', 'test_copy_and_deepcopy', - 'test_sample']: + 'test_pct_change', 'test_sample']: def f(): def tester(self): diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 7a1aff1cc223c..63a05ef7de565 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -352,7 +352,7 @@ def test_pct_change_shift_over_nas(self): s = Series([1., 1.5, np.nan, 2.5, 3.]) chg = s.pct_change() - expected = Series([np.nan, 0.5, np.nan, 2.5 / 1.5 - 1, .2]) + expected = Series([np.nan, 0.5, 0., 2.5 / 1.5 - 1, .2]) assert_series_equal(chg, expected) def test_pct_change_periods_freq(self): From c1237f2b72fb29155ac9d7a06f60d746e7b20eff Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 25 Feb 2018 08:09:35 -0800 Subject: [PATCH 197/214] Use pandas_datetimestruct instead of date_info (#19874) --- pandas/_libs/src/period_helper.c | 159 ++++-------- pandas/_libs/src/period_helper.h | 5 +- pandas/_libs/tslibs/period.pyx | 240 ++++++++++-------- .../{test_period_asfreq.py => test_asfreq.py} | 0 4 files changed, 178 insertions(+), 226 deletions(-) rename pandas/tests/scalar/period/{test_period_asfreq.py => test_asfreq.py} (100%) diff --git a/pandas/_libs/src/period_helper.c b/pandas/_libs/src/period_helper.c index e3d250aa44f17..19a7282f38049 100644 --- a/pandas/_libs/src/period_helper.c +++ b/pandas/_libs/src/period_helper.c @@ -89,14 +89,12 @@ static npy_int64 daytime_conversion_factor_matrix[7][7] = { {0, 0, 0, 0, 0, 1, 1000}, {0, 0, 0, 0, 0, 0, 1}}; -PANDAS_INLINE int max_value(int a, int b) { return a > b ? a : b; } +int max_value(int a, int b) { return a > b ? 
a : b; } PANDAS_INLINE int min_value(int a, int b) { return a < b ? a : b; } PANDAS_INLINE int get_freq_group(int freq) { return (freq / 1000) * 1000; } -PANDAS_INLINE int get_freq_group_index(int freq) { return freq / 1000; } - npy_int64 get_daytime_conversion_factor(int from_index, int to_index) { int row = min_value(from_index, to_index); @@ -227,16 +225,6 @@ static npy_int64 asfreq_DTtoB(npy_int64 ordinal, asfreq_info *af_info) { return DtoB(&dinfo, roll_back, ordinal); } -// all intra day calculations are now done within one function -static npy_int64 asfreq_DownsampleWithinDay(npy_int64 ordinal, - asfreq_info *af_info) { - return downsample_daytime(ordinal, af_info); -} - -static npy_int64 asfreq_UpsampleWithinDay(npy_int64 ordinal, - asfreq_info *af_info) { - return upsample_daytime(ordinal, af_info); -} //************ FROM BUSINESS *************** static npy_int64 asfreq_BtoDT(npy_int64 ordinal, asfreq_info *af_info) { @@ -288,26 +276,26 @@ static npy_int64 asfreq_WtoW(npy_int64 ordinal, asfreq_info *af_info) { static npy_int64 asfreq_WtoB(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; npy_int64 unix_date = asfreq_WtoDT(ordinal, af_info); + int roll_back = af_info->is_end; dInfoCalc_SetFromAbsDate(&dinfo, unix_date); - return DtoB(&dinfo, roll_back, unix_date); } //************ FROM MONTHLY *************** -static void MtoD_ym(npy_int64 ordinal, int *y, int *m) { - *y = floordiv(ordinal, 12) + 1970; - *m = mod_compat(ordinal, 12) + 1; +static void MtoD_ym(npy_int64 ordinal, int *year, int *month) { + *year = floordiv(ordinal, 12) + 1970; + *month = mod_compat(ordinal, 12) + 1; } static npy_int64 asfreq_MtoDT(npy_int64 ordinal, asfreq_info *af_info) { npy_int64 unix_date; - int y, m; + int year, month; ordinal += af_info->is_end; - MtoD_ym(ordinal, &y, &m); - unix_date = unix_date_from_ymd(y, m, 1); + MtoD_ym(ordinal, &year, &month); + unix_date = unix_date_from_ymd(year, month, 1); unix_date -= af_info->is_end; return upsample_daytime(unix_date, af_info); } @@ -327,38 +315,37 @@ static npy_int64 asfreq_MtoW(npy_int64 ordinal, asfreq_info *af_info) { static npy_int64 asfreq_MtoB(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; npy_int64 unix_date = asfreq_MtoDT(ordinal, af_info); - int roll_back = af_info->is_end; + int roll_back = af_info->is_end; dInfoCalc_SetFromAbsDate(&dinfo, unix_date); - return DtoB(&dinfo, roll_back, unix_date); } //************ FROM QUARTERLY *************** -static void QtoD_ym(npy_int64 ordinal, int *y, int *m, asfreq_info *af_info) { - *y = floordiv(ordinal, 4) + 1970; - *m = mod_compat(ordinal, 4) * 3 + 1; +static void QtoD_ym(npy_int64 ordinal, int *year, int *month, + asfreq_info *af_info) { + *year = floordiv(ordinal, 4) + 1970; + *month = mod_compat(ordinal, 4) * 3 + 1; if (af_info->from_q_year_end != 12) { - *m += af_info->from_q_year_end; - if (*m > 12) { - *m -= 12; + *month += af_info->from_q_year_end; + if (*month > 12) { + *month -= 12; } else { - *y -= 1; + *year -= 1; } } } static npy_int64 asfreq_QtoDT(npy_int64 ordinal, asfreq_info *af_info) { npy_int64 unix_date; - int y, m; + int year, month; ordinal += af_info->is_end; - QtoD_ym(ordinal, &y, &m, af_info); - - unix_date = unix_date_from_ymd(y, m, 1); + QtoD_ym(ordinal, &year, &month, af_info); + unix_date = unix_date_from_ymd(year, month, 1); unix_date -= af_info->is_end; return upsample_daytime(unix_date, af_info); } @@ -382,29 +369,39 @@ static npy_int64 asfreq_QtoW(npy_int64 ordinal, asfreq_info *af_info) { static npy_int64 asfreq_QtoB(npy_int64 
ordinal, asfreq_info *af_info) { struct date_info dinfo; npy_int64 unix_date = asfreq_QtoDT(ordinal, af_info); - int roll_back = af_info->is_end; + int roll_back = af_info->is_end; dInfoCalc_SetFromAbsDate(&dinfo, unix_date); - return DtoB(&dinfo, roll_back, unix_date); } //************ FROM ANNUAL *************** -static npy_int64 asfreq_AtoDT(npy_int64 ordinal, asfreq_info *af_info) { - npy_int64 unix_date; +static void AtoD_ym(npy_int64 ordinal, int *year, int *month, + asfreq_info *af_info) { + *year = ordinal + 1970; + *month = 1; - // start from 1970 - npy_int64 year = ordinal + 1970; - - int month = (af_info->from_a_year_end % 12) + 1; if (af_info->from_a_year_end != 12) { - year -= 1; + *month += af_info->from_a_year_end; + if (*month > 12) { + // This case is never reached, but is kept for symmetry + // with QtoD_ym + *month -= 12; + } else { + *year -= 1; + } } +} - year += af_info->is_end; - unix_date = unix_date_from_ymd(year, month, 1); +static npy_int64 asfreq_AtoDT(npy_int64 ordinal, asfreq_info *af_info) { + npy_int64 unix_date; + int year, month; + + ordinal += af_info->is_end; + AtoD_ym(ordinal, &year, &month, af_info); + unix_date = unix_date_from_ymd(year, month, 1); unix_date -= af_info->is_end; return upsample_daytime(unix_date, af_info); } @@ -428,9 +425,9 @@ static npy_int64 asfreq_AtoW(npy_int64 ordinal, asfreq_info *af_info) { static npy_int64 asfreq_AtoB(npy_int64 ordinal, asfreq_info *af_info) { struct date_info dinfo; npy_int64 unix_date = asfreq_AtoDT(ordinal, af_info); + int roll_back = af_info->is_end; dInfoCalc_SetFromAbsDate(&dinfo, unix_date); - return DtoB(&dinfo, roll_back, unix_date); } @@ -443,57 +440,6 @@ static npy_int64 no_op(npy_int64 ordinal, asfreq_info *af_info) { // end of frequency specific conversion routines -static int calc_a_year_end(int freq, int group) { - int result = (freq - group) % 12; - if (result == 0) { - return 12; - } else { - return result; - } -} - -static int calc_week_end(int freq, int group) { return freq - group; } - -void get_asfreq_info(int fromFreq, int toFreq, char relation, - asfreq_info *af_info) { - int fromGroup = get_freq_group(fromFreq); - int toGroup = get_freq_group(toFreq); - - if (relation == 'E') { - af_info->is_end = 1; - } else { - af_info->is_end = 0; - } - - af_info->intraday_conversion_factor = get_daytime_conversion_factor( - get_freq_group_index(max_value(fromGroup, FR_DAY)), - get_freq_group_index(max_value(toGroup, FR_DAY))); - - switch (fromGroup) { - case FR_WK: - af_info->from_week_end = calc_week_end(fromFreq, fromGroup); - break; - case FR_ANN: - af_info->from_a_year_end = calc_a_year_end(fromFreq, fromGroup); - break; - case FR_QTR: - af_info->from_q_year_end = calc_a_year_end(fromFreq, fromGroup); - break; - } - - switch (toGroup) { - case FR_WK: - af_info->to_week_end = calc_week_end(toFreq, toGroup); - break; - case FR_ANN: - af_info->to_a_year_end = calc_a_year_end(toFreq, toGroup); - break; - case FR_QTR: - af_info->to_q_year_end = calc_a_year_end(toFreq, toGroup); - break; - } -} - freq_conv_func get_asfreq_func(int fromFreq, int toFreq) { int fromGroup = get_freq_group(fromFreq); int toGroup = get_freq_group(toFreq); @@ -650,9 +596,9 @@ freq_conv_func get_asfreq_func(int fromFreq, int toFreq) { case FR_US: case FR_NS: if (fromGroup > toGroup) { - return &asfreq_DownsampleWithinDay; + return &downsample_daytime; } else { - return &asfreq_UpsampleWithinDay; + return &upsample_daytime; } default: return &nofunc; @@ -662,20 +608,3 @@ freq_conv_func get_asfreq_func(int fromFreq, int toFreq) 
{ return &nofunc; } } - -/* ------------------------------------------------------------------ - * New pandas API-helper code, to expose to cython - * ------------------------------------------------------------------*/ - -npy_int64 asfreq(npy_int64 period_ordinal, int freq1, int freq2, - char relation) { - npy_int64 val; - freq_conv_func func; - asfreq_info finfo; - - func = get_asfreq_func(freq1, freq2); - - get_asfreq_info(freq1, freq2, relation, &finfo); - val = (*func)(period_ordinal, &finfo); - return val; -} diff --git a/pandas/_libs/src/period_helper.h b/pandas/_libs/src/period_helper.h index 7163dc960d152..c6313924adddd 100644 --- a/pandas/_libs/src/period_helper.h +++ b/pandas/_libs/src/period_helper.h @@ -108,12 +108,9 @@ typedef npy_int64 (*freq_conv_func)(npy_int64, asfreq_info *af_info); * new pandas API helper functions here */ -npy_int64 asfreq(npy_int64 period_ordinal, int freq1, int freq2, char relation); - freq_conv_func get_asfreq_func(int fromFreq, int toFreq); -void get_asfreq_info(int fromFreq, int toFreq, char relation, - asfreq_info *af_info); npy_int64 get_daytime_conversion_factor(int from_index, int to_index); +int max_value(int a, int b); #endif // PANDAS__LIBS_SRC_PERIOD_HELPER_H_ diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index f1a193706144f..9cf7e39791f2b 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -75,15 +75,8 @@ cdef extern from "period_helper.h": int FR_BUS int FR_UND - ctypedef struct date_info: - double second - int minute - int hour - int day - int month - int year - ctypedef struct asfreq_info: + int64_t intraday_conversion_factor int is_end int from_week_end @@ -97,24 +90,21 @@ cdef extern from "period_helper.h": ctypedef int64_t (*freq_conv_func)(int64_t, asfreq_info*) nogil - int64_t asfreq(int64_t dtordinal, int freq1, int freq2, - char relation) except INT32_MIN freq_conv_func get_asfreq_func(int fromFreq, int toFreq) nogil - void get_asfreq_info(int fromFreq, int toFreq, char relation, - asfreq_info *af_info) nogil int64_t get_daytime_conversion_factor(int from_index, int to_index) nogil + int max_value(int left, int right) nogil @cython.cdivision -cdef char* c_strftime(date_info *dinfo, char *fmt): +cdef char* c_strftime(pandas_datetimestruct *dts, char *fmt): """ Generate a nice string representation of the period object, originally from DateObject_strftime Parameters ---------- - dinfo : date_info* + dts : pandas_datetimestruct* fmt : char* Returns @@ -126,14 +116,14 @@ cdef char* c_strftime(date_info *dinfo, char *fmt): char *result int result_len = strlen(fmt) + 50 - c_date.tm_sec = dinfo.second - c_date.tm_min = dinfo.minute - c_date.tm_hour = dinfo.hour - c_date.tm_mday = dinfo.day - c_date.tm_mon = dinfo.month - 1 - c_date.tm_year = dinfo.year - 1900 - c_date.tm_wday = (dayofweek(dinfo.year, dinfo.month, dinfo.day) + 1) % 7 - c_date.tm_yday = get_day_of_year(dinfo.year, dinfo.month, dinfo.day) - 1 + c_date.tm_sec = dts.sec + c_date.tm_min = dts.min + c_date.tm_hour = dts.hour + c_date.tm_mday = dts.day + c_date.tm_mon = dts.month - 1 + c_date.tm_year = dts.year - 1900 + c_date.tm_wday = (dayofweek(dts.year, dts.month, dts.day) + 1) % 7 + c_date.tm_yday = get_day_of_year(dts.year, dts.month, dts.day) - 1 c_date.tm_isdst = -1 result = malloc(result_len * sizeof(char)) @@ -150,6 +140,10 @@ cdef inline int get_freq_group(int freq) nogil: return (freq // 1000) * 1000 +cdef inline int get_freq_group_index(int freq) nogil: + return freq // 1000 + + # specifically _dont_ 
use cdvision or else ordinals near -1 are assigned to # incorrect dates GH#19643 @cython.cdivision(False) @@ -261,7 +255,8 @@ cdef int64_t get_period_ordinal(int year, int month, int day, # raise ValueError -cdef void get_date_info(int64_t ordinal, int freq, date_info *dinfo) nogil: +cdef void get_date_info(int64_t ordinal, int freq, + pandas_datetimestruct *dts) nogil: cdef: int64_t unix_date double abstime @@ -277,7 +272,7 @@ cdef void get_date_info(int64_t ordinal, int freq, date_info *dinfo) nogil: abstime -= 86400 unix_date += 1 - date_info_from_days_and_time(dinfo, unix_date, abstime) + date_info_from_days_and_time(dts, unix_date, abstime) cdef int64_t get_unix_date(int64_t period_ordinal, int freq) nogil: @@ -304,12 +299,12 @@ cdef int64_t get_unix_date(int64_t period_ordinal, int freq) nogil: return period_ordinal toDaily = get_asfreq_func(freq, FR_DAY) - get_asfreq_info(freq, FR_DAY, 'E', &af_info) + get_asfreq_info(freq, FR_DAY, True, &af_info) return toDaily(period_ordinal, &af_info) @cython.cdivision -cdef void date_info_from_days_and_time(date_info *dinfo, +cdef void date_info_from_days_and_time(pandas_datetimestruct *dts, int64_t unix_date, double abstime) nogil: """ @@ -317,7 +312,7 @@ cdef void date_info_from_days_and_time(date_info *dinfo, Parameters ---------- - dinfo : date_info* + dts : pandas_datetimestruct* unix_date : int64_t days elapsed since datetime(1970, 1, 1) abstime : double @@ -325,23 +320,19 @@ cdef void date_info_from_days_and_time(date_info *dinfo, Notes ----- - Updates dinfo inplace + Updates dts inplace """ cdef: - pandas_datetimestruct dts int inttime int hour, minute - double second + double second, subsecond_fraction # Bounds check # The calling function is responsible for ensuring that # abstime >= 0.0 and abstime <= 86400 # Calculate the date - pandas_datetime_to_datetimestruct(unix_date, PANDAS_FR_D, &dts) - dinfo.year = dts.year - dinfo.month = dts.month - dinfo.day = dts.day + pandas_datetime_to_datetimestruct(unix_date, PANDAS_FR_D, dts) # Calculate the time inttime = abstime @@ -349,9 +340,13 @@ cdef void date_info_from_days_and_time(date_info *dinfo, minute = (inttime % 3600) / 60 second = abstime - (hour * 3600 + minute * 60) - dinfo.hour = hour - dinfo.minute = minute - dinfo.second = second + dts.hour = hour + dts.min = minute + dts.sec = second + + subsecond_fraction = second - dts.sec + dts.us = int((subsecond_fraction) * 1e6) + dts.ps = int(((subsecond_fraction) * 1e6 - dts.us) * 1e6) @cython.cdivision @@ -439,7 +434,7 @@ cdef int get_yq(int64_t ordinal, int freq, int *quarter, int *year): else: qtr_freq = FR_QTR - get_asfreq_info(FR_DAY, qtr_freq, 'E', &af_info) + get_asfreq_info(FR_DAY, qtr_freq, True, &af_info) quarter[0] = DtoQ_yq(unix_date, &af_info, year) return qtr_freq @@ -447,20 +442,20 @@ cdef int get_yq(int64_t ordinal, int freq, int *quarter, int *year): cdef int DtoQ_yq(int64_t unix_date, asfreq_info *af_info, int *year): cdef: - date_info dinfo + pandas_datetimestruct dts int quarter - date_info_from_days_and_time(&dinfo, unix_date, 0) + date_info_from_days_and_time(&dts, unix_date, 0) if af_info.to_q_year_end != 12: - dinfo.month -= af_info.to_q_year_end - if dinfo.month <= 0: - dinfo.month += 12 + dts.month -= af_info.to_q_year_end + if dts.month <= 0: + dts.month += 12 else: - dinfo.year += 1 + dts.year += 1 - year[0] = dinfo.year - quarter = month_to_quarter(dinfo.month) + year[0] = dts.year + quarter = month_to_quarter(dts.month) return quarter @@ -528,10 +523,6 @@ def periodarr_to_dt64arr(ndarray[int64_t] periodarr, 
int freq): return out -cdef char START = 'S' -cdef char END = 'E' - - cpdef int64_t period_asfreq(int64_t ordinal, int freq1, int freq2, bint end): """ Convert period ordinal from one frequency to another, and if upsampling, @@ -539,14 +530,15 @@ cpdef int64_t period_asfreq(int64_t ordinal, int freq1, int freq2, bint end): """ cdef: int64_t retval + freq_conv_func func + asfreq_info af_info if ordinal == iNaT: return iNaT - if end: - retval = asfreq(ordinal, freq1, freq2, END) - else: - retval = asfreq(ordinal, freq1, freq2, START) + func = get_asfreq_func(freq1, freq2) + get_asfreq_info(freq1, freq2, end, &af_info) + retval = func(ordinal, &af_info) if retval == INT32_MIN: raise ValueError('Frequency conversion failed') @@ -554,6 +546,58 @@ cpdef int64_t period_asfreq(int64_t ordinal, int freq1, int freq2, bint end): return retval +cdef void get_asfreq_info(int from_freq, int to_freq, + bint is_end, asfreq_info *af_info) nogil: + """ + Construct the `asfreq_info` object used to convert an ordinal from + `from_freq` to `to_freq`. + + Parameters + ---------- + from_freq : int + to_freq int + is_end : bool + af_info : *asfreq_info + """ + cdef: + int from_group = get_freq_group(from_freq) + int to_group = get_freq_group(to_freq) + + af_info.is_end = is_end + + af_info.intraday_conversion_factor = get_daytime_conversion_factor( + get_freq_group_index(max_value(from_group, FR_DAY)), + get_freq_group_index(max_value(to_group, FR_DAY))) + + if from_group == FR_WK: + af_info.from_week_end = calc_week_end(from_freq, from_group) + elif from_group == FR_ANN: + af_info.from_a_year_end = calc_a_year_end(from_freq, from_group) + elif from_group == FR_QTR: + af_info.from_q_year_end = calc_a_year_end(from_freq, from_group) + + if to_group == FR_WK: + af_info.to_week_end = calc_week_end(to_freq, to_group) + elif to_group == FR_ANN: + af_info.to_a_year_end = calc_a_year_end(to_freq, to_group) + elif to_group == FR_QTR: + af_info.to_q_year_end = calc_a_year_end(to_freq, to_group) + + +@cython.cdivision +cdef int calc_a_year_end(int freq, int group) nogil: + cdef: + int result = (freq - group) % 12 + if result == 0: + return 12 + else: + return result + + +cdef inline int calc_week_end(int freq, int group) nogil: + return freq - group + + def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end): """ Convert int64-array of period ordinals from one frequency to another, and @@ -565,18 +609,12 @@ def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end): freq_conv_func func asfreq_info af_info int64_t val - char relation n = len(arr) result = np.empty(n, dtype=np.int64) - if end: - relation = END - else: - relation = START - func = get_asfreq_func(freq1, freq2) - get_asfreq_info(freq1, freq2, relation, &af_info) + get_asfreq_info(freq1, freq2, end, &af_info) mask = arr == iNaT if mask.any(): # NaT process @@ -605,24 +643,12 @@ def period_ordinal(int y, int m, int d, int h, int min, cpdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) nogil: cdef: pandas_datetimestruct dts - date_info dinfo float subsecond_fraction if ordinal == NPY_NAT: return NPY_NAT - get_date_info(ordinal, freq, &dinfo) - - dts.year = dinfo.year - dts.month = dinfo.month - dts.day = dinfo.day - dts.hour = dinfo.hour - dts.min = dinfo.minute - dts.sec = int(dinfo.second) - subsecond_fraction = dinfo.second - dts.sec - dts.us = int((subsecond_fraction) * 1e6) - dts.ps = int(((subsecond_fraction) * 1e6 - dts.us) * 1e6) - + get_date_info(ordinal, freq, &dts) return dtstruct_to_dt64(&dts) @@ -680,7 
+706,7 @@ cdef list str_extra_fmts = ["^`AB`^", "^`CD`^", "^`EF`^", cdef object _period_strftime(int64_t value, int freq, object fmt): cdef: Py_ssize_t i - date_info dinfo + pandas_datetimestruct dts char *formatted object pat, repl, result list found_pat = [False] * len(extra_fmts) @@ -689,7 +715,7 @@ cdef object _period_strftime(int64_t value, int freq, object fmt): if PyUnicode_Check(fmt): fmt = fmt.encode('utf-8') - get_date_info(value, freq, &dinfo) + get_date_info(value, freq, &dts) for i in range(len(extra_fmts)): pat = extra_fmts[i][0] repl = extra_fmts[i][1] @@ -697,7 +723,7 @@ cdef object _period_strftime(int64_t value, int freq, object fmt): fmt = fmt.replace(pat, repl) found_pat[i] = True - formatted = c_strftime(&dinfo, fmt) + formatted = c_strftime(&dts, fmt) result = util.char_to_string(formatted) free(formatted) @@ -736,9 +762,9 @@ ctypedef int (*accessor)(int64_t ordinal, int freq) except INT32_MIN cdef int pyear(int64_t ordinal, int freq): cdef: - date_info dinfo - get_date_info(ordinal, freq, &dinfo) - return dinfo.year + pandas_datetimestruct dts + get_date_info(ordinal, freq, &dts) + return dts.year @cython.cdivision @@ -762,65 +788,65 @@ cdef int pquarter(int64_t ordinal, int freq): cdef int pmonth(int64_t ordinal, int freq): cdef: - date_info dinfo - get_date_info(ordinal, freq, &dinfo) - return dinfo.month + pandas_datetimestruct dts + get_date_info(ordinal, freq, &dts) + return dts.month cdef int pday(int64_t ordinal, int freq): cdef: - date_info dinfo - get_date_info(ordinal, freq, &dinfo) - return dinfo.day + pandas_datetimestruct dts + get_date_info(ordinal, freq, &dts) + return dts.day cdef int pweekday(int64_t ordinal, int freq): cdef: - date_info dinfo - get_date_info(ordinal, freq, &dinfo) - return dayofweek(dinfo.year, dinfo.month, dinfo.day) + pandas_datetimestruct dts + get_date_info(ordinal, freq, &dts) + return dayofweek(dts.year, dts.month, dts.day) cdef int pday_of_year(int64_t ordinal, int freq): cdef: - date_info dinfo - get_date_info(ordinal, freq, &dinfo) - return get_day_of_year(dinfo.year, dinfo.month, dinfo.day) + pandas_datetimestruct dts + get_date_info(ordinal, freq, &dts) + return get_day_of_year(dts.year, dts.month, dts.day) cdef int pweek(int64_t ordinal, int freq): cdef: - date_info dinfo - get_date_info(ordinal, freq, &dinfo) - return ccalendar.get_week_of_year(dinfo.year, dinfo.month, dinfo.day) + pandas_datetimestruct dts + get_date_info(ordinal, freq, &dts) + return ccalendar.get_week_of_year(dts.year, dts.month, dts.day) cdef int phour(int64_t ordinal, int freq): cdef: - date_info dinfo - get_date_info(ordinal, freq, &dinfo) - return dinfo.hour + pandas_datetimestruct dts + get_date_info(ordinal, freq, &dts) + return dts.hour cdef int pminute(int64_t ordinal, int freq): cdef: - date_info dinfo - get_date_info(ordinal, freq, &dinfo) - return dinfo.minute + pandas_datetimestruct dts + get_date_info(ordinal, freq, &dts) + return dts.min cdef int psecond(int64_t ordinal, int freq): cdef: - date_info dinfo - get_date_info(ordinal, freq, &dinfo) - return dinfo.second + pandas_datetimestruct dts + get_date_info(ordinal, freq, &dts) + return dts.sec cdef int pdays_in_month(int64_t ordinal, int freq): cdef: - date_info dinfo - get_date_info(ordinal, freq, &dinfo) - return ccalendar.get_days_in_month(dinfo.year, dinfo.month) + pandas_datetimestruct dts + get_date_info(ordinal, freq, &dts) + return ccalendar.get_days_in_month(dts.year, dts.month) def get_period_field_arr(int code, ndarray[int64_t] arr, int freq): diff --git 
a/pandas/tests/scalar/period/test_period_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py similarity index 100% rename from pandas/tests/scalar/period/test_period_asfreq.py rename to pandas/tests/scalar/period/test_asfreq.py From 5a87765f82f97177fa0ac65e7faa1fe7c45d69c4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 25 Feb 2018 08:11:41 -0800 Subject: [PATCH 198/214] Fix+test DTI/TDI/PI add/sub with ndarray[datetime64/timedelta64] (#19847) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/indexes/datetimelike.py | 21 ++++--- pandas/core/indexes/timedeltas.py | 4 ++ .../indexes/datetimes/test_arithmetic.py | 56 ++++++++++++++++++ .../tests/indexes/period/test_arithmetic.py | 58 +++++++++++++++++++ .../indexes/timedeltas/test_arithmetic.py | 49 ++++++++++++++++ 6 files changed, 182 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index ba24c93121dcb..b7dfdf9cfea1e 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -756,6 +756,7 @@ Datetimelike - Bug in :func:`Timestamp.floor` :func:`DatetimeIndex.floor` where time stamps far in the future and past were not rounded correctly (:issue:`19206`) - Bug in :func:`to_datetime` where passing an out-of-bounds datetime with ``errors='coerce'`` and ``utc=True`` would raise ``OutOfBoundsDatetime`` instead of parsing to ``NaT`` (:issue:`19612`) - Bug in :class:`DatetimeIndex` and :class:`TimedeltaIndex` addition and subtraction where name of the returned object was not always set consistently. (:issue:`19744`) +- Bug in :class:`DatetimeIndex` and :class:`TimedeltaIndex` addition and subtraction where operations with numpy arrays raised ``TypeError`` (:issue:`19847`) - Timedelta diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index a68d883f04380..9411428b2e68d 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -31,6 +31,7 @@ is_integer_dtype, is_object_dtype, is_string_dtype, + is_datetime64_dtype, is_period_dtype, is_timedelta64_dtype) from pandas.core.dtypes.generic import ( @@ -676,9 +677,7 @@ def _add_datetimelike_methods(cls): """ def __add__(self, other): - from pandas.core.index import Index - from pandas.core.indexes.timedeltas import TimedeltaIndex - from pandas.tseries.offsets import DateOffset + from pandas import Index, DatetimeIndex, TimedeltaIndex, DateOffset other = lib.item_from_zerodim(other) if isinstance(other, ABCSeries): @@ -710,6 +709,9 @@ def __add__(self, other): .format(typ=type(other))) elif isinstance(other, Index): result = self._add_datelike(other) + elif is_datetime64_dtype(other): + # ndarray[datetime64]; note DatetimeIndex is caught above + return self + DatetimeIndex(other) elif is_integer_dtype(other) and self.freq is None: # GH#19123 raise NullFrequencyError("Cannot shift with no freq") @@ -729,10 +731,7 @@ def __radd__(self, other): cls.__radd__ = __radd__ def __sub__(self, other): - from pandas.core.index import Index - from pandas.core.indexes.datetimes import DatetimeIndex - from pandas.core.indexes.timedeltas import TimedeltaIndex - from pandas.tseries.offsets import DateOffset + from pandas import Index, DatetimeIndex, TimedeltaIndex, DateOffset other = lib.item_from_zerodim(other) if isinstance(other, ABCSeries): @@ -764,6 +763,9 @@ def __sub__(self, other): .format(typ=type(other).__name__)) elif isinstance(other, DatetimeIndex): result = self._sub_datelike(other) + elif is_datetime64_dtype(other): + # ndarray[datetime64]; 
note we caught DatetimeIndex earlier + return self - DatetimeIndex(other) elif isinstance(other, Index): raise TypeError("cannot subtract {typ1} and {typ2}" .format(typ1=type(self).__name__, @@ -782,6 +784,11 @@ def __sub__(self, other): cls.__sub__ = __sub__ def __rsub__(self, other): + if is_datetime64_dtype(other) and is_timedelta64_dtype(self): + # ndarray[datetime64] cannot be subtracted from self, so + # we need to wrap in DatetimeIndex and flip the operation + from pandas import DatetimeIndex + return DatetimeIndex(other) - self return -(self - other) cls.__rsub__ = __rsub__ diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 219adfdb66c82..6f80962eab079 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -377,6 +377,10 @@ def _add_delta(self, delta): new_values = self._add_delta_td(delta) elif isinstance(delta, TimedeltaIndex): new_values = self._add_delta_tdi(delta) + elif is_timedelta64_dtype(delta): + # ndarray[timedelta64] --> wrap in TimedeltaIndex + delta = TimedeltaIndex(delta) + new_values = self._add_delta_tdi(delta) else: raise TypeError("cannot add the type {0} to a TimedeltaIndex" .format(type(delta))) diff --git a/pandas/tests/indexes/datetimes/test_arithmetic.py b/pandas/tests/indexes/datetimes/test_arithmetic.py index 7900c983b6c77..5a7ea44f3698c 100644 --- a/pandas/tests/indexes/datetimes/test_arithmetic.py +++ b/pandas/tests/indexes/datetimes/test_arithmetic.py @@ -571,6 +571,62 @@ def test_add_datetimelike_and_dti_tz(self, addend): with tm.assert_raises_regex(TypeError, msg): addend + dti_tz + # ------------------------------------------------------------- + # __add__/__sub__ with ndarray[datetime64] and ndarray[timedelta64] + + def test_dti_add_dt64_array_raises(self, tz): + dti = pd.date_range('2016-01-01', periods=3, tz=tz) + dtarr = dti.values + + with pytest.raises(TypeError): + dti + dtarr + with pytest.raises(TypeError): + dtarr + dti + + def test_dti_sub_dt64_array_naive(self): + dti = pd.date_range('2016-01-01', periods=3, tz=None) + dtarr = dti.values + + expected = dti - dti + result = dti - dtarr + tm.assert_index_equal(result, expected) + result = dtarr - dti + tm.assert_index_equal(result, expected) + + def test_dti_sub_dt64_array_aware_raises(self, tz): + if tz is None: + return + dti = pd.date_range('2016-01-01', periods=3, tz=tz) + dtarr = dti.values + + with pytest.raises(TypeError): + dti - dtarr + with pytest.raises(TypeError): + dtarr - dti + + def test_dti_add_td64_array(self, tz): + dti = pd.date_range('2016-01-01', periods=3, tz=tz) + tdi = dti - dti.shift(1) + tdarr = tdi.values + + expected = dti + tdi + result = dti + tdarr + tm.assert_index_equal(result, expected) + result = tdarr + dti + tm.assert_index_equal(result, expected) + + def test_dti_sub_td64_array(self, tz): + dti = pd.date_range('2016-01-01', periods=3, tz=tz) + tdi = dti - dti.shift(1) + tdarr = tdi.values + + expected = dti - tdi + result = dti - tdarr + tm.assert_index_equal(result, expected) + + with pytest.raises(TypeError): + tdarr - dti + # ------------------------------------------------------------- def test_sub_dti_dti(self): diff --git a/pandas/tests/indexes/period/test_arithmetic.py b/pandas/tests/indexes/period/test_arithmetic.py index 0c06e6a4963b4..d7bf1e0210f62 100644 --- a/pandas/tests/indexes/period/test_arithmetic.py +++ b/pandas/tests/indexes/period/test_arithmetic.py @@ -255,6 +255,64 @@ def test_comp_nat(self, dtype): class TestPeriodIndexArithmetic(object): + + # 
----------------------------------------------------------------- + # __add__/__sub__ with ndarray[datetime64] and ndarray[timedelta64] + + def test_pi_add_sub_dt64_array_raises(self): + rng = pd.period_range('1/1/2000', freq='D', periods=3) + dti = pd.date_range('2016-01-01', periods=3) + dtarr = dti.values + + with pytest.raises(TypeError): + rng + dtarr + with pytest.raises(TypeError): + dtarr + rng + + with pytest.raises(TypeError): + rng - dtarr + with pytest.raises(TypeError): + dtarr - rng + + def test_pi_add_sub_td64_array_non_tick_raises(self): + rng = pd.period_range('1/1/2000', freq='Q', periods=3) + dti = pd.date_range('2016-01-01', periods=3) + tdi = dti - dti.shift(1) + tdarr = tdi.values + + with pytest.raises(period.IncompatibleFrequency): + rng + tdarr + with pytest.raises(period.IncompatibleFrequency): + tdarr + rng + + with pytest.raises(period.IncompatibleFrequency): + rng - tdarr + with pytest.raises(period.IncompatibleFrequency): + tdarr - rng + + @pytest.mark.xfail(reason='op with TimedeltaIndex raises, with ndarray OK') + def test_pi_add_sub_td64_array_tick(self): + rng = pd.period_range('1/1/2000', freq='Q', periods=3) + dti = pd.date_range('2016-01-01', periods=3) + tdi = dti - dti.shift(1) + tdarr = tdi.values + + expected = rng + tdi + result = rng + tdarr + tm.assert_index_equal(result, expected) + result = tdarr + rng + tm.assert_index_equal(result, expected) + + expected = rng - tdi + result = rng - tdarr + tm.assert_index_equal(result, expected) + + with pytest.raises(TypeError): + tdarr - rng + + # ----------------------------------------------------------------- + # operations with array/Index of DateOffset objects + @pytest.mark.parametrize('box', [np.array, pd.Index]) def test_pi_add_offset_array(self, box): # GH#18849 diff --git a/pandas/tests/indexes/timedeltas/test_arithmetic.py b/pandas/tests/indexes/timedeltas/test_arithmetic.py index 282501860f7e5..6a80b995b6ee9 100644 --- a/pandas/tests/indexes/timedeltas/test_arithmetic.py +++ b/pandas/tests/indexes/timedeltas/test_arithmetic.py @@ -586,6 +586,55 @@ def test_tdi_radd_timestamp(self): expected = DatetimeIndex(['2011-01-02', '2011-01-03']) tm.assert_index_equal(result, expected) + # ------------------------------------------------------------- + # __add__/__sub__ with ndarray[datetime64] and ndarray[timedelta64] + + def test_tdi_sub_dt64_array(self): + dti = pd.date_range('2016-01-01', periods=3) + tdi = dti - dti.shift(1) + dtarr = dti.values + + with pytest.raises(TypeError): + tdi - dtarr + + # TimedeltaIndex.__rsub__ + expected = pd.DatetimeIndex(dtarr) - tdi + result = dtarr - tdi + tm.assert_index_equal(result, expected) + + def test_tdi_add_dt64_array(self): + dti = pd.date_range('2016-01-01', periods=3) + tdi = dti - dti.shift(1) + dtarr = dti.values + + expected = pd.DatetimeIndex(dtarr) + tdi + result = tdi + dtarr + tm.assert_index_equal(result, expected) + result = dtarr + tdi + tm.assert_index_equal(result, expected) + + def test_tdi_add_td64_array(self): + dti = pd.date_range('2016-01-01', periods=3) + tdi = dti - dti.shift(1) + tdarr = tdi.values + + expected = 2 * tdi + result = tdi + tdarr + tm.assert_index_equal(result, expected) + result = tdarr + tdi + tm.assert_index_equal(result, expected) + + def test_tdi_sub_td64_array(self): + dti = pd.date_range('2016-01-01', periods=3) + tdi = dti - dti.shift(1) + tdarr = tdi.values + + expected = 0 * tdi + result = tdi - tdarr + tm.assert_index_equal(result, expected) + result = tdarr - tdi + tm.assert_index_equal(result, expected) + # 
------------------------------------------------------------- def test_subtraction_ops(self): From 8f1dfa74e244622a7f356a496072f93dce032839 Mon Sep 17 00:00:00 2001 From: jjames34 Date: Sun, 25 Feb 2018 10:17:10 -0600 Subject: [PATCH 199/214] Fixed issue with leftover test.json file (#19879) --- doc/source/whatsnew/v0.23.0.txt | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index b7dfdf9cfea1e..99a3773603fc4 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -214,13 +214,22 @@ Please note that the string `index` is not supported with the round trip format, :okwarning: df.index.name = 'index' + df.to_json('test.json', orient='table') new_df = pd.read_json('test.json', orient='table') new_df - print(new_df.index.name) + new_df.dtypes + +.. ipython:: python + :suppress: + + import os + os.remove('test.json') + .. _whatsnew_0230.enhancements.assign_dependent: + ``.assign()`` accepts dependent arguments ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From d40fb54a8a538c9312f29b6042f4d72ee3edb2a8 Mon Sep 17 00:00:00 2001 From: Ming Li <14131823+minggli@users.noreply.github.com> Date: Sun, 25 Feb 2018 23:05:25 +0000 Subject: [PATCH 200/214] ENH: ISO8601-compliant datetime string conversion in `iterrows()` and Series construction. (#19762) --- doc/source/whatsnew/v0.23.0.txt | 2 +- pandas/core/dtypes/cast.py | 17 ++++++++++++----- pandas/core/internals.py | 4 ++-- pandas/tests/dtypes/test_cast.py | 4 ++++ pandas/tests/frame/test_api.py | 15 ++++++++++++++- 5 files changed, 33 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 99a3773603fc4..7f33372f765fb 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -766,7 +766,6 @@ Datetimelike - Bug in :func:`to_datetime` where passing an out-of-bounds datetime with ``errors='coerce'`` and ``utc=True`` would raise ``OutOfBoundsDatetime`` instead of parsing to ``NaT`` (:issue:`19612`) - Bug in :class:`DatetimeIndex` and :class:`TimedeltaIndex` addition and subtraction where name of the returned object was not always set consistently. 
(:issue:`19744`) - Bug in :class:`DatetimeIndex` and :class:`TimedeltaIndex` addition and subtraction where operations with numpy arrays raised ``TypeError`` (:issue:`19847`) -- Timedelta ^^^^^^^^^ @@ -918,6 +917,7 @@ Reshaping - :func:`Series.rename` now accepts ``axis`` as a kwarg (:issue:`18589`) - Comparisons between :class:`Series` and :class:`Index` would return a ``Series`` with an incorrect name, ignoring the ``Index``'s name attribute (:issue:`19582`) - Bug in :func:`qcut` where datetime and timedelta data with ``NaT`` present raised a ``ValueError`` (:issue:`19768`) +- Bug in :func:`DataFrame.iterrows`, which would infers strings not compliant to `ISO8601 `_ to datetimes (:issue:`19671`) Other ^^^^^ diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 352ce29f5c37b..b1d0dc2a2442e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -904,16 +904,23 @@ def maybe_infer_to_datetimelike(value, convert_dates=False): def try_datetime(v): # safe coerce to datetime64 try: - v = tslib.array_to_datetime(v, errors='raise') + # GH19671 + v = tslib.array_to_datetime(v, + require_iso8601=True, + errors='raise') except ValueError: # we might have a sequence of the same-datetimes with tz's # if so coerce to a DatetimeIndex; if they are not the same, - # then these stay as object dtype + # then these stay as object dtype, xref GH19671 try: - from pandas import to_datetime - return to_datetime(v) - except Exception: + from pandas._libs.tslibs import conversion + from pandas import DatetimeIndex + + values, tz = conversion.datetime_to_datetime64(v) + return DatetimeIndex(values).tz_localize( + 'UTC').tz_convert(tz=tz) + except (ValueError, TypeError): pass except Exception: diff --git a/pandas/core/internals.py b/pandas/core/internals.py index d385185fbb558..00ef8f9cef598 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2602,8 +2602,8 @@ def _maybe_coerce_values(self, values): """Input validation for values passed to __init__. Ensure that we have datetime64ns, coercing if necessary. 
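An illustrative sketch of the behaviour this change targets, mirroring the new test cases rather than adding anything beyond them:

    import pandas as pd

    # 'M1701' is not ISO8601-compliant, so the mixed values stay object dtype
    s = pd.Series(['M1701', pd.Timestamp('20130101')])
    s.dtype  # object ('O'), no datetime coercion

As a result, iterating over a frame that mixes such strings with datetimes via iterrows() no longer converts the strings to timestamps.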
- Parametetrs - ----------- + Parameters + ---------- values : array-like Must be convertible to datetime64 diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index 31bd962b67afb..96a9e3227b40b 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -301,6 +301,10 @@ def test_maybe_infer_to_datetimelike(self): [NaT, 'b', 1]])) assert result.size == 6 + # GH19671 + result = Series(['M1701', Timestamp('20130101')]) + assert result.dtype.kind == 'O' + class TestConvert(object): diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 91fe7f99ca681..8ba5469480e64 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -15,7 +15,8 @@ from numpy.random import randn import numpy as np -from pandas import DataFrame, Series, date_range, timedelta_range, Categorical +from pandas import (DataFrame, Series, date_range, timedelta_range, + Categorical, SparseDataFrame) import pandas as pd from pandas.util.testing import (assert_almost_equal, @@ -214,6 +215,18 @@ def test_iterrows(self): exp = self.mixed_frame.loc[k] self._assert_series_equal(v, exp) + def test_iterrows_iso8601(self): + # GH19671 + if self.klass == SparseDataFrame: + pytest.xfail(reason='SparseBlock datetime type not implemented.') + + s = self.klass( + {'non_iso8601': ['M1701', 'M1802', 'M1903', 'M2004'], + 'iso8601': date_range('2000-01-01', periods=4, freq='M')}) + for k, v in s.iterrows(): + exp = s.loc[k] + self._assert_series_equal(v, exp) + def test_itertuples(self): for i, tup in enumerate(self.frame.itertuples()): s = self.klass._constructor_sliced(tup[1:]) From 92dbc78af9f34af83691d86347fc04d2c543b94c Mon Sep 17 00:00:00 2001 From: Ming Li <14131823+minggli@users.noreply.github.com> Date: Sun, 25 Feb 2018 23:06:25 +0000 Subject: [PATCH 201/214] parameterize test_pct_change_periods_freq (#19897) --- pandas/tests/frame/test_timeseries.py | 45 ++++++++++++-------------- pandas/tests/series/test_timeseries.py | 41 ++++++++++++----------- 2 files changed, 40 insertions(+), 46 deletions(-) diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index 9f94439a71a57..e1bc310e1e934 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -122,36 +122,31 @@ def test_pct_change_shift_over_nas(self): edf = DataFrame({'a': expected, 'b': expected}) assert_frame_equal(chg, edf) - def test_pct_change_periods_freq(self): + @pytest.mark.parametrize("freq, periods, fill_method, limit", + [('5B', 5, None, None), + ('3B', 3, None, None), + ('3B', 3, 'bfill', None), + ('7B', 7, 'pad', 1), + ('7B', 7, 'bfill', 3), + ('14B', 14, None, None)]) + def test_pct_change_periods_freq(self, freq, periods, fill_method, limit): # GH 7292 - rs_freq = self.tsframe.pct_change(freq='5B') - rs_periods = self.tsframe.pct_change(5) - assert_frame_equal(rs_freq, rs_periods) - - rs_freq = self.tsframe.pct_change(freq='3B', fill_method=None) - rs_periods = self.tsframe.pct_change(3, fill_method=None) - assert_frame_equal(rs_freq, rs_periods) - - rs_freq = self.tsframe.pct_change(freq='3B', fill_method='bfill') - rs_periods = self.tsframe.pct_change(3, fill_method='bfill') - assert_frame_equal(rs_freq, rs_periods) - - rs_freq = self.tsframe.pct_change(freq='7B', - fill_method='pad', - limit=1) - rs_periods = self.tsframe.pct_change(7, fill_method='pad', limit=1) - assert_frame_equal(rs_freq, rs_periods) - - rs_freq = self.tsframe.pct_change(freq='7B', - fill_method='bfill', - 
limit=3) - rs_periods = self.tsframe.pct_change(7, fill_method='bfill', limit=3) + rs_freq = self.tsframe.pct_change(freq=freq, + fill_method=fill_method, + limit=limit) + rs_periods = self.tsframe.pct_change(periods, + fill_method=fill_method, + limit=limit) assert_frame_equal(rs_freq, rs_periods) empty_ts = DataFrame(index=self.tsframe.index, columns=self.tsframe.columns) - rs_freq = empty_ts.pct_change(freq='14B') - rs_periods = empty_ts.pct_change(14) + rs_freq = empty_ts.pct_change(freq=freq, + fill_method=fill_method, + limit=limit) + rs_periods = empty_ts.pct_change(periods, + fill_method=fill_method, + limit=limit) assert_frame_equal(rs_freq, rs_periods) def test_frame_ctor_datetime64_column(self): diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 63a05ef7de565..baf2619c7b022 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -355,31 +355,30 @@ def test_pct_change_shift_over_nas(self): expected = Series([np.nan, 0.5, 0., 2.5 / 1.5 - 1, .2]) assert_series_equal(chg, expected) - def test_pct_change_periods_freq(self): + @pytest.mark.parametrize("freq, periods, fill_method, limit", + [('5B', 5, None, None), + ('3B', 3, None, None), + ('3B', 3, 'bfill', None), + ('7B', 7, 'pad', 1), + ('7B', 7, 'bfill', 3), + ('14B', 14, None, None)]) + def test_pct_change_periods_freq(self, freq, periods, fill_method, limit): # GH 7292 - rs_freq = self.ts.pct_change(freq='5B') - rs_periods = self.ts.pct_change(5) - assert_series_equal(rs_freq, rs_periods) - - rs_freq = self.ts.pct_change(freq='3B', fill_method=None) - rs_periods = self.ts.pct_change(3, fill_method=None) - assert_series_equal(rs_freq, rs_periods) - - rs_freq = self.ts.pct_change(freq='3B', fill_method='bfill') - rs_periods = self.ts.pct_change(3, fill_method='bfill') - assert_series_equal(rs_freq, rs_periods) - - rs_freq = self.ts.pct_change(freq='7B', fill_method='pad', limit=1) - rs_periods = self.ts.pct_change(7, fill_method='pad', limit=1) - assert_series_equal(rs_freq, rs_periods) - - rs_freq = self.ts.pct_change(freq='7B', fill_method='bfill', limit=3) - rs_periods = self.ts.pct_change(7, fill_method='bfill', limit=3) + rs_freq = self.ts.pct_change(freq=freq, + fill_method=fill_method, + limit=limit) + rs_periods = self.ts.pct_change(periods, + fill_method=fill_method, + limit=limit) assert_series_equal(rs_freq, rs_periods) empty_ts = Series(index=self.ts.index) - rs_freq = empty_ts.pct_change(freq='14B') - rs_periods = empty_ts.pct_change(14) + rs_freq = empty_ts.pct_change(freq=freq, + fill_method=fill_method, + limit=limit) + rs_periods = empty_ts.pct_change(periods, + fill_method=fill_method, + limit=limit) assert_series_equal(rs_freq, rs_periods) def test_autocorr(self): From 2f09f86507d3b2ae62b66a99b5d5c3a60daf719f Mon Sep 17 00:00:00 2001 From: Wilson Lin Date: Mon, 26 Feb 2018 00:43:00 -0800 Subject: [PATCH 202/214] DOC: Make API reference intro section concise (#19846) --- doc/source/api.rst | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index b8aad67e147ba..0e47499a03f3a 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -6,19 +6,18 @@ API Reference ************* This page gives an overview of all public pandas objects, functions and -methods. In general, all classes and functions exposed in the top-level -``pandas.*`` namespace are regarded as public. +methods. All classes and functions exposed in ``pandas.*`` namespace are public. 
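As a rough illustration of the public surface this api.rst intro describes (example imports only, nothing below is introduced by the patch):

    import pandas as pd
    from pandas.api.types import is_numeric_dtype  # public dtype helpers
    from pandas import testing                     # public testing utilities

    is_numeric_dtype(pd.Series([1, 2, 3]))  # True
    testing.assert_frame_equal(pd.DataFrame({'a': [1]}),
                               pd.DataFrame({'a': [1]}))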
-Further some of the subpackages are public, including ``pandas.errors``, -``pandas.plotting``, and ``pandas.testing``. Certain functions in the -``pandas.io`` and ``pandas.tseries`` submodules are public as well (those -mentioned in the documentation). Further, the ``pandas.api.types`` subpackage -holds some public functions related to data types in pandas. +Some subpackages are public which include ``pandas.errors``, +``pandas.plotting``, and ``pandas.testing``. Public functions in +``pandas.io`` and ``pandas.tseries`` submodules are mentioned in +the documentation. ``pandas.api.types`` subpackage holds some +public functions related to data types in pandas. .. warning:: - The ``pandas.core``, ``pandas.compat``, and ``pandas.util`` top-level modules are considered to be PRIVATE. Stability of functionality in those modules in not guaranteed. + The ``pandas.core``, ``pandas.compat``, and ``pandas.util`` top-level modules are PRIVATE. Stable functionality in such modules is not guaranteed. .. _api.functions: From 4ddbcb8afc9f76e8503eeffc7f730a59666c3a20 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 26 Feb 2018 09:46:25 +0100 Subject: [PATCH 203/214] DOC/BLD: unpin sphinx to use sphinx 1.7 (#19687) --- ci/requirements-3.6_DOC.run | 2 +- ci/requirements_dev.txt | 2 +- doc/sphinxext/numpydoc/numpydoc.py | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ci/requirements-3.6_DOC.run b/ci/requirements-3.6_DOC.run index 6c45e3371e9cf..084f38ce17eb2 100644 --- a/ci/requirements-3.6_DOC.run +++ b/ci/requirements-3.6_DOC.run @@ -1,7 +1,7 @@ ipython ipykernel ipywidgets -sphinx=1.5* +sphinx nbconvert nbformat notebook diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt index a474658fa2922..82f8de277c57b 100644 --- a/ci/requirements_dev.txt +++ b/ci/requirements_dev.txt @@ -7,4 +7,4 @@ pytest>=3.1 python-dateutil>=2.5.0 pytz setuptools>=3.3 -sphinx=1.5* +sphinx diff --git a/doc/sphinxext/numpydoc/numpydoc.py b/doc/sphinxext/numpydoc/numpydoc.py index 2bc2d1e91ed3f..4861aa90edce1 100755 --- a/doc/sphinxext/numpydoc/numpydoc.py +++ b/doc/sphinxext/numpydoc/numpydoc.py @@ -26,7 +26,6 @@ raise RuntimeError("Sphinx 1.0.1 or newer is required") from .docscrape_sphinx import get_doc_object, SphinxDocString -from sphinx.util.compat import Directive if sys.version_info[0] >= 3: sixu = lambda s: s From 6751b9c394ff458eee242755fc1f3a32a4933d59 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 26 Feb 2018 09:48:10 +0100 Subject: [PATCH 204/214] DOC: fix numpydoc section titles in misc plotting docstrings (#19899) --- pandas/plotting/_misc.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index d6048f54993e6..45594e9c6ea95 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -149,8 +149,8 @@ def _get_marker_compat(marker): def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): """RadViz - a multivariate data visualization algorithm - Parameters: - ----------- + Parameters + ---------- frame: DataFrame class_column: str Column name containing class names @@ -163,8 +163,8 @@ def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): kwds: keywords Options to pass to matplotlib scatter plotting method - Returns: - -------- + Returns + ------- ax: Matplotlib axis object """ import matplotlib.pyplot as plt @@ -247,8 +247,8 @@ def andrews_curves(frame, class_column, ax=None, samples=200, color=None, 
linearly spaced between -pi and +pi. Each row of frame then corresponds to a single curve. - Parameters: - ----------- + Parameters + ---------- frame : DataFrame Data to be plotted, preferably normalized to (0.0, 1.0) class_column : Name of the column containing class names @@ -262,8 +262,8 @@ def andrews_curves(frame, class_column, ax=None, samples=200, color=None, kwds: keywords Options to pass to matplotlib plotting method - Returns: - -------- + Returns + ------- ax: Matplotlib axis object """ @@ -325,8 +325,8 @@ def f(t): def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): """Bootstrap plot. - Parameters: - ----------- + Parameters + ---------- series: Time series fig: matplotlib figure object, optional size: number of data points to consider during each sampling @@ -334,8 +334,8 @@ def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): kwds: optional keyword arguments for plotting commands, must be accepted by both hist and plot - Returns: - -------- + Returns + ------- fig: matplotlib figure """ import random @@ -503,15 +503,15 @@ def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, def lag_plot(series, lag=1, ax=None, **kwds): """Lag plot for time series. - Parameters: - ----------- + Parameters + ---------- series: Time series lag: lag of the scatter plot, default 1 ax: Matplotlib axis object, optional kwds: Matplotlib scatter method keyword arguments, optional - Returns: - -------- + Returns + ------- ax: Matplotlib axis object """ import matplotlib.pyplot as plt From 15bbb28865fa1a259e4d61b85604dea1c5f2f249 Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 26 Feb 2018 05:34:58 -0500 Subject: [PATCH 205/214] DOC: small typo fix (#19901) --- pandas/core/groupby.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 852ad04cd8a2e..00643614e8803 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1440,7 +1440,7 @@ def resample(self, rule, *args, **kwargs): def rolling(self, *args, **kwargs): """ Return a rolling grouper, providing rolling - functionaility per group + functionality per group """ from pandas.core.window import RollingGroupby @@ -1451,7 +1451,7 @@ def rolling(self, *args, **kwargs): def expanding(self, *args, **kwargs): """ Return an expanding grouper, providing expanding - functionaility per group + functionality per group """ from pandas.core.window import ExpandingGroupby From f105cdf0b322db6127915f13582f1573770c2274 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 26 Feb 2018 03:10:14 -0800 Subject: [PATCH 206/214] cleanup order of operations kludges (#19895) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/indexes/datetimelike.py | 48 +++++++------------ pandas/core/indexes/datetimes.py | 16 +++++-- pandas/core/indexes/timedeltas.py | 30 +++++++----- .../indexes/datetimes/test_arithmetic.py | 4 +- .../indexes/timedeltas/test_arithmetic.py | 2 +- 6 files changed, 49 insertions(+), 52 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 7f33372f765fb..fb22dc40e335f 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -586,6 +586,7 @@ Datetimelike API Changes - Operations between a :class:`Series` with dtype ``dtype='datetime64[ns]'`` and a :class:`PeriodIndex` will correctly raises ``TypeError`` (:issue:`18850`) - Subtraction of :class:`Series` with timezone-aware ``dtype='datetime64[ns]'`` with mis-matched timezones will raise 
``TypeError`` instead of ``ValueError`` (:issue:`18817`) - :func:`pandas.merge` provides a more informative error message when trying to merge on timezone-aware and timezone-naive columns (:issue:`15800`) +- For :class:`DatetimeIndex` and :class:`TimedeltaIndex` with ``freq=None``, addition or subtraction of integer-dtyped array or ``Index`` will raise ``NullFrequencyError`` instead of ``TypeError`` (:issue:`19895`) .. _whatsnew_0230.api.other: diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 9411428b2e68d..8e56fc2775a56 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -32,6 +32,7 @@ is_object_dtype, is_string_dtype, is_datetime64_dtype, + is_datetime64tz_dtype, is_period_dtype, is_timedelta64_dtype) from pandas.core.dtypes.generic import ( @@ -200,8 +201,9 @@ def _evaluate_compare(self, other, op): if is_bool_dtype(result): result[mask] = False return result + + result[mask] = iNaT try: - result[mask] = iNaT return Index(result) except TypeError: return result @@ -349,7 +351,7 @@ def _nat_new(self, box=True): return result attribs = self._get_attributes_dict() - if not isinstance(self, ABCPeriodIndex): + if not is_period_dtype(self): attribs['freq'] = None return self._simple_new(result, **attribs) @@ -631,9 +633,9 @@ def _convert_scalar_indexer(self, key, kind=None): ._convert_scalar_indexer(key, kind=kind)) def _add_datelike(self, other): - raise TypeError("cannot add {0} and {1}" - .format(type(self).__name__, - type(other).__name__)) + raise TypeError("cannot add {cls} and {typ}" + .format(cls=type(self).__name__, + typ=type(other).__name__)) def _sub_datelike(self, other): raise com.AbstractMethodError(self) @@ -677,7 +679,7 @@ def _add_datetimelike_methods(cls): """ def __add__(self, other): - from pandas import Index, DatetimeIndex, TimedeltaIndex, DateOffset + from pandas import DateOffset other = lib.item_from_zerodim(other) if isinstance(other, ABCSeries): @@ -700,18 +702,9 @@ def __add__(self, other): elif is_offsetlike(other): # Array/Index of DateOffset objects result = self._addsub_offset_array(other, operator.add) - elif isinstance(self, TimedeltaIndex) and isinstance(other, Index): - if hasattr(other, '_add_delta'): - # i.e. DatetimeIndex, TimedeltaIndex, or PeriodIndex - result = other._add_delta(self) - else: - raise TypeError("cannot add TimedeltaIndex and {typ}" - .format(typ=type(other))) - elif isinstance(other, Index): - result = self._add_datelike(other) - elif is_datetime64_dtype(other): - # ndarray[datetime64]; note DatetimeIndex is caught above - return self + DatetimeIndex(other) + elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): + # DatetimeIndex, ndarray[datetime64] + return self._add_datelike(other) elif is_integer_dtype(other) and self.freq is None: # GH#19123 raise NullFrequencyError("Cannot shift with no freq") @@ -731,7 +724,7 @@ def __radd__(self, other): cls.__radd__ = __radd__ def __sub__(self, other): - from pandas import Index, DatetimeIndex, TimedeltaIndex, DateOffset + from pandas import Index, DateOffset other = lib.item_from_zerodim(other) if isinstance(other, ABCSeries): @@ -756,20 +749,13 @@ def __sub__(self, other): elif is_offsetlike(other): # Array/Index of DateOffset objects result = self._addsub_offset_array(other, operator.sub) - elif isinstance(self, TimedeltaIndex) and isinstance(other, Index): - # We checked above for timedelta64_dtype(other) so this - # must be invalid. 
- raise TypeError("cannot subtract TimedeltaIndex and {typ}" - .format(typ=type(other).__name__)) - elif isinstance(other, DatetimeIndex): + elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): + # DatetimeIndex, ndarray[datetime64] result = self._sub_datelike(other) - elif is_datetime64_dtype(other): - # ndarray[datetime64]; note we caught DatetimeIndex earlier - return self - DatetimeIndex(other) elif isinstance(other, Index): - raise TypeError("cannot subtract {typ1} and {typ2}" - .format(typ1=type(self).__name__, - typ2=type(other).__name__)) + raise TypeError("cannot subtract {cls} and {typ}" + .format(cls=type(self).__name__, + typ=type(other).__name__)) elif is_integer_dtype(other) and self.freq is None: # GH#19123 raise NullFrequencyError("Cannot shift with no freq") diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 36ea2bffb9531..55d8b7c18a622 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -864,11 +864,16 @@ def _add_datelike(self, other): def _sub_datelike(self, other): # subtract a datetime from myself, yielding a TimedeltaIndex from pandas import TimedeltaIndex - if isinstance(other, DatetimeIndex): + + if isinstance(other, (DatetimeIndex, np.ndarray)): + # if other is an ndarray, we assume it is datetime64-dtype + other = DatetimeIndex(other) + # require tz compat if not self._has_same_tz(other): - raise TypeError("DatetimeIndex subtraction must have the same " - "timezones or no timezones") + raise TypeError("{cls} subtraction must have the same " + "timezones or no timezones" + .format(cls=type(self).__name__)) result = self._sub_datelike_dti(other) elif isinstance(other, (datetime, np.datetime64)): other = Timestamp(other) @@ -885,8 +890,9 @@ def _sub_datelike(self, other): result = self._maybe_mask_results(result, fill_value=libts.iNaT) else: - raise TypeError("cannot subtract DatetimeIndex and {typ}" - .format(typ=type(other).__name__)) + raise TypeError("cannot subtract {cls} and {typ}" + .format(cls=type(self).__name__, + typ=type(other).__name__)) return TimedeltaIndex(result) def _sub_datelike_dti(self, other): diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 6f80962eab079..eebd52d7fb801 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -59,30 +59,28 @@ def _td_index_cmp(opname, cls): nat_result = True if opname == '__ne__' else False def wrapper(self, other): - msg = "cannot compare a TimedeltaIndex with type {0}" + msg = "cannot compare a {cls} with type {typ}" func = getattr(super(TimedeltaIndex, self), opname) if _is_convertible_to_td(other) or other is NaT: try: other = _to_m8(other) except ValueError: # failed to parse as timedelta - raise TypeError(msg.format(type(other))) + raise TypeError(msg.format(cls=type(self).__name__, + typ=type(other).__name__)) result = func(other) if isna(other): result.fill(nat_result) - else: - if not is_list_like(other): - raise TypeError(msg.format(type(other))) + elif not is_list_like(other): + raise TypeError(msg.format(cls=type(self).__name__, + typ=type(other).__name__)) + else: other = TimedeltaIndex(other).values result = func(other) result = com._values_from_object(result) - if isinstance(other, Index): - o_mask = other.values.view('i8') == iNaT - else: - o_mask = other.view('i8') == iNaT - + o_mask = np.array(isna(other)) if o_mask.any(): result[o_mask] = nat_result @@ -416,9 +414,15 @@ def _evaluate_with_timedelta_like(self, other, op): def 
_add_datelike(self, other): # adding a timedeltaindex to a datetimelike from pandas import Timestamp, DatetimeIndex + if other is NaT: # GH#19124 pd.NaT is treated like a timedelta return self._nat_new() + elif isinstance(other, (DatetimeIndex, np.ndarray)): + # if other is an ndarray, we assume it is datetime64-dtype + # defer to implementation in DatetimeIndex + other = DatetimeIndex(other) + return other + self else: other = Timestamp(other) i8 = self.asi8 @@ -434,7 +438,8 @@ def _sub_datelike(self, other): if other is NaT: return self._nat_new() else: - raise TypeError("cannot subtract a datelike from a TimedeltaIndex") + raise TypeError("cannot subtract a datelike from a {cls}" + .format(cls=type(self).__name__)) def _addsub_offset_array(self, other, op): # Add or subtract Array-like of DateOffset objects @@ -962,8 +967,7 @@ def _is_convertible_to_index(other): def _is_convertible_to_td(key): - # TODO: Not all DateOffset objects are convertible to Timedelta - return isinstance(key, (DateOffset, timedelta, Timedelta, + return isinstance(key, (Tick, timedelta, np.timedelta64, compat.string_types)) diff --git a/pandas/tests/indexes/datetimes/test_arithmetic.py b/pandas/tests/indexes/datetimes/test_arithmetic.py index 5a7ea44f3698c..0c56c6b16fb2f 100644 --- a/pandas/tests/indexes/datetimes/test_arithmetic.py +++ b/pandas/tests/indexes/datetimes/test_arithmetic.py @@ -508,7 +508,7 @@ def test_dti_sub_tdi(self, tz): result = dti - tdi tm.assert_index_equal(result, expected) - msg = 'cannot subtract TimedeltaIndex and DatetimeIndex' + msg = 'cannot subtract .*TimedeltaIndex' with tm.assert_raises_regex(TypeError, msg): tdi - dti @@ -531,7 +531,7 @@ def test_dti_isub_tdi(self, tz): result -= tdi tm.assert_index_equal(result, expected) - msg = 'cannot subtract TimedeltaIndex and DatetimeIndex' + msg = 'cannot subtract .*TimedeltaIndex' with tm.assert_raises_regex(TypeError, msg): tdi -= dti diff --git a/pandas/tests/indexes/timedeltas/test_arithmetic.py b/pandas/tests/indexes/timedeltas/test_arithmetic.py index 6a80b995b6ee9..9ffffb6ff06d5 100644 --- a/pandas/tests/indexes/timedeltas/test_arithmetic.py +++ b/pandas/tests/indexes/timedeltas/test_arithmetic.py @@ -792,7 +792,7 @@ def test_addition_ops(self): pytest.raises(ValueError, lambda: tdi[0:1] + dti) # random indexes - pytest.raises(TypeError, lambda: tdi + Int64Index([1, 2, 3])) + pytest.raises(NullFrequencyError, lambda: tdi + Int64Index([1, 2, 3])) # this is a union! 
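As a reading aid for the hunk above, a minimal sketch of the behaviour it now asserts (illustrative only, not part of the patch; it assumes ``NullFrequencyError`` is importable from ``pandas.errors``, as these test modules do):

    import pandas as pd
    from pandas.errors import NullFrequencyError

    tdi = pd.TimedeltaIndex(['1 days', '2 days', '3 days'])  # freq is None
    try:
        tdi + pd.Int64Index([1, 2, 3])
    except NullFrequencyError:
        # GH 19123 / 19895: integer addition means "shift", which needs a freq
        pass
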
# pytest.raises(TypeError, lambda : Int64Index([1,2,3]) + tdi) From 960a7464ea78835d1622ad8d8de9e9b1ee7310c7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 26 Feb 2018 03:37:50 -0800 Subject: [PATCH 207/214] make ops.add_foo take just class (#19828) --- pandas/core/frame.py | 4 +- pandas/core/ops.py | 184 ++++++++++++++++++++++------------- pandas/core/panel.py | 4 +- pandas/core/series.py | 4 +- pandas/core/sparse/array.py | 14 +-- pandas/core/sparse/frame.py | 4 +- pandas/core/sparse/series.py | 5 +- 7 files changed, 127 insertions(+), 92 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 061b69f25e7ac..e4ef1b97882d9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6131,8 +6131,8 @@ def isin(self, values): DataFrame._add_numeric_operations() DataFrame._add_series_or_dataframe_operations() -ops.add_flex_arithmetic_methods(DataFrame, **ops.frame_flex_funcs) -ops.add_special_arithmetic_methods(DataFrame, **ops.frame_special_funcs) +ops.add_flex_arithmetic_methods(DataFrame) +ops.add_special_arithmetic_methods(DataFrame) def _arrays_to_mgr(arrays, arr_names, index, columns, dtype=None): diff --git a/pandas/core/ops.py b/pandas/core/ops.py index b20f208d14dc5..7bdbac66b4f31 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -37,7 +37,7 @@ construct_1d_object_array_from_listlike) from pandas.core.dtypes.generic import ( ABCSeries, - ABCDataFrame, + ABCDataFrame, ABCPanel, ABCIndex, ABCSparseSeries, ABCSparseArray) @@ -711,6 +711,64 @@ def mask_cmp_op(x, y, op, allowed_types): # Functions that add arithmetic methods to objects, given arithmetic factory # methods +def _get_method_wrappers(cls): + """ + Find the appropriate operation-wrappers to use when defining flex/special + arithmetic, boolean, and comparison operations with the given class. 
+ + Parameters + ---------- + cls : class + + Returns + ------- + arith_flex : function or None + comp_flex : function or None + arith_special : function + comp_special : function + bool_special : function + + Notes + ----- + None is only returned for SparseArray + """ + if issubclass(cls, ABCSparseSeries): + # Be sure to catch this before ABCSeries and ABCSparseArray, + # as they will both come see SparseSeries as a subclass + arith_flex = _flex_method_SERIES + comp_flex = _flex_method_SERIES + arith_special = _arith_method_SPARSE_SERIES + comp_special = _arith_method_SPARSE_SERIES + bool_special = _bool_method_SERIES + # TODO: I don't think the functions defined by bool_method are tested + elif issubclass(cls, ABCSeries): + # Just Series; SparseSeries is caught above + arith_flex = _flex_method_SERIES + comp_flex = _flex_method_SERIES + arith_special = _arith_method_SERIES + comp_special = _comp_method_SERIES + bool_special = _bool_method_SERIES + elif issubclass(cls, ABCSparseArray): + arith_flex = None + comp_flex = None + arith_special = _arith_method_SPARSE_ARRAY + comp_special = _arith_method_SPARSE_ARRAY + bool_special = _arith_method_SPARSE_ARRAY + elif issubclass(cls, ABCPanel): + arith_flex = _flex_method_PANEL + comp_flex = _comp_method_PANEL + arith_special = _arith_method_PANEL + comp_special = _comp_method_PANEL + bool_special = _arith_method_PANEL + elif issubclass(cls, ABCDataFrame): + # Same for DataFrame and SparseDataFrame + arith_flex = _arith_method_FRAME + comp_flex = _flex_comp_method_FRAME + arith_special = _arith_method_FRAME + comp_special = _comp_method_FRAME + bool_special = _arith_method_FRAME + return arith_flex, comp_flex, arith_special, comp_special, bool_special + def _create_methods(cls, arith_method, comp_method, bool_method, special=False): @@ -743,16 +801,18 @@ def _create_methods(cls, arith_method, comp_method, bool_method, # yapf: enable new_methods['div'] = new_methods['truediv'] new_methods['rdiv'] = new_methods['rtruediv'] + if have_divmod: + # divmod doesn't have an op that is supported by numexpr + new_methods['divmod'] = arith_method(cls, divmod, special) + + new_methods.update(dict( + eq=comp_method(cls, operator.eq, special), + ne=comp_method(cls, operator.ne, special), + lt=comp_method(cls, operator.lt, special), + gt=comp_method(cls, operator.gt, special), + le=comp_method(cls, operator.le, special), + ge=comp_method(cls, operator.ge, special))) - # Comp methods never had a default axis set - if comp_method: - new_methods.update(dict( - eq=comp_method(cls, operator.eq, special), - ne=comp_method(cls, operator.ne, special), - lt=comp_method(cls, operator.lt, special), - gt=comp_method(cls, operator.gt, special), - le=comp_method(cls, operator.le, special), - ge=comp_method(cls, operator.ge, special))) if bool_method: new_methods.update( dict(and_=bool_method(cls, operator.and_, special), @@ -762,9 +822,6 @@ def _create_methods(cls, arith_method, comp_method, bool_method, rand_=bool_method(cls, rand_, special), ror_=bool_method(cls, ror_, special), rxor=bool_method(cls, rxor, special))) - if have_divmod: - # divmod doesn't have an op that is supported by numexpr - new_methods['divmod'] = arith_method(cls, divmod, special) if special: dunderize = lambda x: '__{name}__'.format(name=x.strip('_')) @@ -788,22 +845,17 @@ def add_methods(cls, new_methods): # ---------------------------------------------------------------------- # Arithmetic -def add_special_arithmetic_methods(cls, arith_method=None, - comp_method=None, bool_method=None): +def 
add_special_arithmetic_methods(cls): """ Adds the full suite of special arithmetic methods (``__add__``, ``__sub__``, etc.) to the class. Parameters ---------- - arith_method : function (optional) - factory for special arithmetic methods: - f(cls, op, special) - comp_method : function (optional) - factory for rich comparison - signature: f(cls, op, special) - bool_method : function (optional) - factory for boolean methods - signature: f(cls, op, special) + cls : class + special methods will be defined and pinned to this class """ + _, _, arith_method, comp_method, bool_method = _get_method_wrappers(cls) new_methods = _create_methods(cls, arith_method, comp_method, bool_method, special=True) # inplace operators (I feel like these should get passed an `inplace=True` @@ -836,28 +888,26 @@ def f(self, other): __ipow__=_wrap_inplace_method(new_methods["__pow__"]))) if not compat.PY3: new_methods["__idiv__"] = _wrap_inplace_method(new_methods["__div__"]) - if bool_method: - new_methods.update( - dict(__iand__=_wrap_inplace_method(new_methods["__and__"]), - __ior__=_wrap_inplace_method(new_methods["__or__"]), - __ixor__=_wrap_inplace_method(new_methods["__xor__"]))) + + new_methods.update( + dict(__iand__=_wrap_inplace_method(new_methods["__and__"]), + __ior__=_wrap_inplace_method(new_methods["__or__"]), + __ixor__=_wrap_inplace_method(new_methods["__xor__"]))) add_methods(cls, new_methods=new_methods) -def add_flex_arithmetic_methods(cls, flex_arith_method, flex_comp_method=None): +def add_flex_arithmetic_methods(cls): """ Adds the full suite of flex arithmetic methods (``pow``, ``mul``, ``add``) to the class. Parameters ---------- - flex_arith_method : function - factory for flex arithmetic methods: - f(cls, op, special) - flex_comp_method : function, optional, - factory for rich comparison - signature: f(cls, op, special) + cls : class + flex methods will be defined and pinned to this class """ + flex_arith_method, flex_comp_method, _, _, _ = _get_method_wrappers(cls) new_methods = _create_methods(cls, flex_arith_method, flex_comp_method, bool_method=None, special=False) @@ -1284,14 +1334,6 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): return flex_wrapper -series_flex_funcs = dict(flex_arith_method=_flex_method_SERIES, - flex_comp_method=_flex_method_SERIES) - -series_special_funcs = dict(arith_method=_arith_method_SERIES, - comp_method=_comp_method_SERIES, - bool_method=_bool_method_SERIES) - - # ----------------------------------------------------------------------------- # DataFrame @@ -1533,14 +1575,6 @@ def f(self, other): return f -frame_flex_funcs = dict(flex_arith_method=_arith_method_FRAME, - flex_comp_method=_flex_comp_method_FRAME) - -frame_special_funcs = dict(arith_method=_arith_method_FRAME, - comp_method=_comp_method_FRAME, - bool_method=_arith_method_FRAME) - - # ----------------------------------------------------------------------------- # Panel @@ -1629,16 +1663,38 @@ def f(self, other, axis=0): return f -panel_special_funcs = dict(arith_method=_arith_method_PANEL, - comp_method=_comp_method_PANEL, - bool_method=_arith_method_PANEL) - -panel_flex_funcs = dict(flex_arith_method=_flex_method_PANEL, - flex_comp_method=_comp_method_PANEL) - # ----------------------------------------------------------------------------- # Sparse +def _cast_sparse_series_op(left, right, opname): + """ + For SparseSeries operation, coerce to float64 if the result is expected + to have NaN or inf values + + Parameters + ---------- + left : SparseArray + right : SparseArray + 
opname : str + + Returns + ------- + left : SparseArray + right : SparseArray + """ + opname = opname.strip('_') + + if is_integer_dtype(left) and is_integer_dtype(right): + # series coerces to float64 if result should have NaN/inf + if opname in ('floordiv', 'mod') and (right.values == 0).any(): + left = left.astype(np.float64) + right = right.astype(np.float64) + elif opname in ('rfloordiv', 'rmod') and (left.values == 0).any(): + left = left.astype(np.float64) + right = right.astype(np.float64) + + return left, right + def _arith_method_SPARSE_SERIES(cls, op, special): """ @@ -1674,8 +1730,8 @@ def _sparse_series_op(left, right, op, name): new_name = get_op_result_name(left, right) from pandas.core.sparse.array import _sparse_array_op - result = _sparse_array_op(left.values, right.values, op, name, - series=True) + lvalues, rvalues = _cast_sparse_series_op(left.values, right.values, name) + result = _sparse_array_op(lvalues, rvalues, op, name) return left._constructor(result, index=new_index, name=new_name) @@ -1697,7 +1753,7 @@ def wrapper(self, other): dtype = getattr(other, 'dtype', None) other = SparseArray(other, fill_value=self.fill_value, dtype=dtype) - return _sparse_array_op(self, other, op, name, series=False) + return _sparse_array_op(self, other, op, name) elif is_scalar(other): with np.errstate(all='ignore'): fill = op(_get_fill(self), np.asarray(other)) @@ -1710,13 +1766,3 @@ def wrapper(self, other): wrapper.__name__ = name return wrapper - - -sparse_array_special_funcs = dict(arith_method=_arith_method_SPARSE_ARRAY, - comp_method=_arith_method_SPARSE_ARRAY, - bool_method=_arith_method_SPARSE_ARRAY) - -sparse_series_special_funcs = dict(arith_method=_arith_method_SPARSE_SERIES, - comp_method=_arith_method_SPARSE_SERIES, - bool_method=_bool_method_SERIES) -# TODO: I don't think the functions defined by bool_method are tested diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 3be1e3ef8734d..fc7fad861df44 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -1527,8 +1527,8 @@ def _extract_axis(self, data, axis=0, intersect=False): slicers={'major_axis': 'index', 'minor_axis': 'columns'}) -ops.add_special_arithmetic_methods(Panel, **ops.panel_special_funcs) -ops.add_flex_arithmetic_methods(Panel, **ops.panel_flex_funcs) +ops.add_special_arithmetic_methods(Panel) +ops.add_flex_arithmetic_methods(Panel) Panel._add_numeric_operations() diff --git a/pandas/core/series.py b/pandas/core/series.py index b42e02bc99237..26b7fd552b062 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3100,8 +3100,8 @@ def to_period(self, freq=None, copy=True): Series._add_series_or_dataframe_operations() # Add arithmetic! 
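To summarise the hunks just below for the reader: the per-class ``*_funcs`` keyword dictionaries disappear and the call sites shrink to a bare class argument. The before/after shapes, shown here as comments and taken from the diff itself:

    # before patch 207:
    #   ops.add_flex_arithmetic_methods(Series, **ops.series_flex_funcs)
    #   ops.add_special_arithmetic_methods(Series, **ops.series_special_funcs)
    # after, _get_method_wrappers(cls) picks the wrapper factories from cls:
    #   ops.add_flex_arithmetic_methods(Series)
    #   ops.add_special_arithmetic_methods(Series)
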
-ops.add_flex_arithmetic_methods(Series, **ops.series_flex_funcs) -ops.add_special_arithmetic_methods(Series, **ops.series_special_funcs) +ops.add_flex_arithmetic_methods(Series) +ops.add_special_arithmetic_methods(Series) # ----------------------------------------------------------------------------- diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index 92c4fe932f066..5532d7522cd2d 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -53,20 +53,11 @@ def _get_fill(arr): return np.asarray(arr.fill_value) -def _sparse_array_op(left, right, op, name, series=False): +def _sparse_array_op(left, right, op, name): if name.startswith('__'): # For lookups in _libs.sparse we need non-dunder op name name = name[2:-2] - if series and is_integer_dtype(left) and is_integer_dtype(right): - # series coerces to float64 if result should have NaN/inf - if name in ('floordiv', 'mod') and (right.values == 0).any(): - left = left.astype(np.float64) - right = right.astype(np.float64) - elif name in ('rfloordiv', 'rmod') and (left.values == 0).any(): - left = left.astype(np.float64) - right = right.astype(np.float64) - # dtype used to find corresponding sparse method if not is_dtype_equal(left.dtype, right.dtype): dtype = find_common_type([left.dtype, right.dtype]) @@ -850,5 +841,4 @@ def _make_index(length, indices, kind): return index -ops.add_special_arithmetic_methods(SparseArray, - **ops.sparse_array_special_funcs) +ops.add_special_arithmetic_methods(SparseArray) diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 872a17d8dbabe..d89b1d681c478 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -1014,5 +1014,5 @@ def homogenize(series_dict): # use unaccelerated ops for sparse objects -ops.add_flex_arithmetic_methods(SparseDataFrame, **ops.frame_flex_funcs) -ops.add_special_arithmetic_methods(SparseDataFrame, **ops.frame_special_funcs) +ops.add_flex_arithmetic_methods(SparseDataFrame) +ops.add_special_arithmetic_methods(SparseDataFrame) diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 26cf9dbadbbf2..7a1496bf11117 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -812,6 +812,5 @@ def from_coo(cls, A, dense_index=False): # overwrite series methods with unaccelerated Sparse-specific versions -ops.add_flex_arithmetic_methods(SparseSeries, **ops.series_flex_funcs) -ops.add_special_arithmetic_methods(SparseSeries, - **ops.sparse_series_special_funcs) +ops.add_flex_arithmetic_methods(SparseSeries) +ops.add_special_arithmetic_methods(SparseSeries) From 1e4c50a56f7e953ab84308f000dff6fc1ac71171 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 26 Feb 2018 04:21:06 -0800 Subject: [PATCH 208/214] Test Decorators and Better Pytest Integration in 'test_excel' (#19829) --- pandas/compat/__init__.py | 14 + pandas/tests/io/test_excel.py | 1619 ++++++++++++++----------------- pandas/util/_test_decorators.py | 6 +- pandas/util/testing.py | 2 +- 4 files changed, 739 insertions(+), 902 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 80a2c05d86971..78aaf4596c8b7 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -365,6 +365,20 @@ def callable(obj): return any("__call__" in klass.__dict__ for klass in type(obj).__mro__) +if sys.version_info[0] < 3: + # In PY2 functools.wraps doesn't provide metadata pytest needs to generate + # decorated tests using parametrization. 
See pytest GH issue #2782 + def wraps(wrapped, assigned=functools.WRAPPER_ASSIGNMENTS, + updated=functools.WRAPPER_UPDATES): + def wrapper(f): + f = functools.wraps(wrapped, assigned, updated)(f) + f.__wrapped__ = wrapped + return f + return wrapper +else: + wraps = functools.wraps + + def add_metaclass(metaclass): """Class decorator for creating a class with a metaclass.""" def wrapper(cls): diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 86cee54665781..fdf9954285db8 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -28,43 +28,6 @@ from pandas.util.testing import ensure_clean, makeCustomDataframe as mkdf -def _skip_if_no_xlrd(): - try: - import xlrd - ver = tuple(map(int, xlrd.__VERSION__.split(".")[:2])) - if ver < (0, 9): - pytest.skip('xlrd < 0.9, skipping') - except ImportError: - pytest.skip('xlrd not installed, skipping') - - -def _skip_if_no_xlwt(): - try: - import xlwt # NOQA - except ImportError: - pytest.skip('xlwt not installed, skipping') - - -def _skip_if_no_openpyxl(): - try: - import openpyxl # NOQA - except ImportError: - pytest.skip('openpyxl not installed, skipping') - - -def _skip_if_no_xlsxwriter(): - try: - import xlsxwriter # NOQA - except ImportError: - pytest.skip('xlsxwriter not installed, skipping') - - -def _skip_if_no_excelsuite(): - _skip_if_no_xlrd() - _skip_if_no_xlwt() - _skip_if_no_openpyxl() - - _seriesd = tm.getSeriesData() _tsd = tm.getTimeSeriesData() _frame = DataFrame(_seriesd)[:10] @@ -74,6 +37,7 @@ def _skip_if_no_excelsuite(): _mixed_frame['foo'] = 'bar' +@td.skip_if_no('xlrd', '0.9') class SharedItems(object): def setup_method(self, method): @@ -103,7 +67,7 @@ def get_csv_refdf(self, basename): dfref = read_csv(pref, index_col=0, parse_dates=True, engine='python') return dfref - def get_excelfile(self, basename): + def get_excelfile(self, basename, ext): """ Return test data ExcelFile instance. Test data path is defined by pandas.util.testing.get_data_path() @@ -119,9 +83,9 @@ def get_excelfile(self, basename): excel : io.excel.ExcelFile """ - return ExcelFile(os.path.join(self.dirpath, basename + self.ext)) + return ExcelFile(os.path.join(self.dirpath, basename + ext)) - def get_exceldf(self, basename, *args, **kwds): + def get_exceldf(self, basename, ext, *args, **kwds): """ Return test data DataFrame. Test data path is defined by pandas.util.testing.get_data_path() @@ -137,36 +101,23 @@ def get_exceldf(self, basename, *args, **kwds): df : DataFrame """ - pth = os.path.join(self.dirpath, basename + self.ext) + pth = os.path.join(self.dirpath, basename + ext) return read_excel(pth, *args, **kwds) class ReadingTestsBase(SharedItems): # This is based on ExcelWriterBase - # - # Base class for test cases to run with different Excel readers. - # To add a reader test, define the following: - # 1. A check_skip function that skips your tests if your reader isn't - # installed. - # 2. Add a property ext, which is the file extension that your reader - # reades from. (needs to start with '.' so it's a valid path) - # 3. Add a property engine_name, which is the name of the reader class. - # For the reader this is not used for anything at the moment. 
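Looping back to the ``wraps`` shim added to ``pandas/compat/__init__.py`` above: a small hedged sketch of why it matters (the decorator and function names below are hypothetical; only ``compat.wraps`` comes from the patch). Under Python 2, ``functools.wraps`` does not set ``__wrapped__``, which pytest needs when introspecting decorated, parametrized tests; the shim adds it, and on Python 3 it is simply ``functools.wraps``.

    from pandas import compat

    def passthrough(func):
        @compat.wraps(func)
        def wrapper(*args, **kwargs):
            return func(*args, **kwargs)
        return wrapper

    @passthrough
    def add(a, b):
        return a + b

    assert add(1, 2) == 3
    assert add.__wrapped__(1, 2) == 3  # metadata preserved on PY2 and PY3
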
- def setup_method(self, method): - self.check_skip() - super(ReadingTestsBase, self).setup_method(method) - - def test_usecols_int(self): + def test_usecols_int(self, ext): dfref = self.get_csv_refdf('test1') dfref = dfref.reindex(columns=['A', 'B', 'C']) - df1 = self.get_exceldf('test1', 'Sheet1', index_col=0, usecols=3) - df2 = self.get_exceldf('test1', 'Sheet2', skiprows=[1], index_col=0, - usecols=3) + df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, usecols=3) + df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + index_col=0, usecols=3) with tm.assert_produces_warning(FutureWarning): - df3 = self.get_exceldf('test1', 'Sheet2', skiprows=[1], + df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], index_col=0, parse_cols=3) # TODO add index to xls file) @@ -174,17 +125,17 @@ def test_usecols_int(self): tm.assert_frame_equal(df2, dfref, check_names=False) tm.assert_frame_equal(df3, dfref, check_names=False) - def test_usecols_list(self): + def test_usecols_list(self, ext): dfref = self.get_csv_refdf('test1') dfref = dfref.reindex(columns=['B', 'C']) - df1 = self.get_exceldf('test1', 'Sheet1', index_col=0, - usecols=[0, 2, 3]) - df2 = self.get_exceldf('test1', 'Sheet2', skiprows=[1], index_col=0, + df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, usecols=[0, 2, 3]) + df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + index_col=0, usecols=[0, 2, 3]) with tm.assert_produces_warning(FutureWarning): - df3 = self.get_exceldf('test1', 'Sheet2', skiprows=[1], + df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], index_col=0, parse_cols=[0, 2, 3]) # TODO add index to xls file) @@ -192,18 +143,18 @@ def test_usecols_list(self): tm.assert_frame_equal(df2, dfref, check_names=False) tm.assert_frame_equal(df3, dfref, check_names=False) - def test_usecols_str(self): + def test_usecols_str(self, ext): dfref = self.get_csv_refdf('test1') df1 = dfref.reindex(columns=['A', 'B', 'C']) - df2 = self.get_exceldf('test1', 'Sheet1', index_col=0, - usecols='A:D') - df3 = self.get_exceldf('test1', 'Sheet2', skiprows=[1], index_col=0, + df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, usecols='A:D') + df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + index_col=0, usecols='A:D') with tm.assert_produces_warning(FutureWarning): - df4 = self.get_exceldf('test1', 'Sheet2', skiprows=[1], + df4 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], index_col=0, parse_cols='A:D') # TODO add index to xls, read xls ignores index name ? 
@@ -212,37 +163,37 @@ def test_usecols_str(self): tm.assert_frame_equal(df4, df1, check_names=False) df1 = dfref.reindex(columns=['B', 'C']) - df2 = self.get_exceldf('test1', 'Sheet1', index_col=0, - usecols='A,C,D') - df3 = self.get_exceldf('test1', 'Sheet2', skiprows=[1], index_col=0, + df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, usecols='A,C,D') + df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + index_col=0, usecols='A,C,D') # TODO add index to xls file tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) df1 = dfref.reindex(columns=['B', 'C']) - df2 = self.get_exceldf('test1', 'Sheet1', index_col=0, - usecols='A,C:D') - df3 = self.get_exceldf('test1', 'Sheet2', skiprows=[1], index_col=0, + df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, usecols='A,C:D') + df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + index_col=0, usecols='A,C:D') tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) - def test_excel_stop_iterator(self): + def test_excel_stop_iterator(self, ext): - parsed = self.get_exceldf('test2', 'Sheet1') + parsed = self.get_exceldf('test2', ext, 'Sheet1') expected = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1']) tm.assert_frame_equal(parsed, expected) - def test_excel_cell_error_na(self): + def test_excel_cell_error_na(self, ext): - parsed = self.get_exceldf('test3', 'Sheet1') + parsed = self.get_exceldf('test3', ext, 'Sheet1') expected = DataFrame([[np.nan]], columns=['Test']) tm.assert_frame_equal(parsed, expected) - def test_excel_passes_na(self): + def test_excel_passes_na(self, ext): - excel = self.get_excelfile('test4') + excel = self.get_excelfile('test4', ext) parsed = read_excel(excel, 'Sheet1', keep_default_na=False, na_values=['apple']) @@ -257,7 +208,7 @@ def test_excel_passes_na(self): tm.assert_frame_equal(parsed, expected) # 13967 - excel = self.get_excelfile('test5') + excel = self.get_excelfile('test5', ext) parsed = read_excel(excel, 'Sheet1', keep_default_na=False, na_values=['apple']) @@ -271,9 +222,9 @@ def test_excel_passes_na(self): columns=['Test']) tm.assert_frame_equal(parsed, expected) - def test_excel_table_sheet_by_index(self): + def test_excel_table_sheet_by_index(self, ext): - excel = self.get_excelfile('test1') + excel = self.get_excelfile('test1', ext) dfref = self.get_csv_refdf('test1') df1 = read_excel(excel, 0, index_col=0) @@ -300,21 +251,22 @@ def test_excel_table_sheet_by_index(self): with pytest.raises(xlrd.XLRDError): read_excel(excel, 'asdf') - def test_excel_table(self): + def test_excel_table(self, ext): dfref = self.get_csv_refdf('test1') - df1 = self.get_exceldf('test1', 'Sheet1', index_col=0) - df2 = self.get_exceldf('test1', 'Sheet2', skiprows=[1], index_col=0) + df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0) + df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + index_col=0) # TODO add index to file tm.assert_frame_equal(df1, dfref, check_names=False) tm.assert_frame_equal(df2, dfref, check_names=False) - df3 = self.get_exceldf('test1', 'Sheet1', index_col=0, + df3 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, skipfooter=1) tm.assert_frame_equal(df3, df1.iloc[:-1]) - def test_reader_special_dtypes(self): + def test_reader_special_dtypes(self, ext): expected = DataFrame.from_dict(OrderedDict([ ("IntCol", [1, 2, -3, 4, 0]), @@ -330,36 +282,36 @@ def test_reader_special_dtypes(self): basename = 'test_types' # should read in correctly and 
infer types - actual = self.get_exceldf(basename, 'Sheet1') + actual = self.get_exceldf(basename, ext, 'Sheet1') tm.assert_frame_equal(actual, expected) # if not coercing number, then int comes in as float float_expected = expected.copy() float_expected["IntCol"] = float_expected["IntCol"].astype(float) float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0 - actual = self.get_exceldf(basename, 'Sheet1', convert_float=False) + actual = self.get_exceldf(basename, ext, 'Sheet1', convert_float=False) tm.assert_frame_equal(actual, float_expected) # check setting Index (assuming xls and xlsx are the same here) for icol, name in enumerate(expected.columns): - actual = self.get_exceldf(basename, 'Sheet1', index_col=icol) + actual = self.get_exceldf(basename, ext, 'Sheet1', index_col=icol) exp = expected.set_index(name) tm.assert_frame_equal(actual, exp) # convert_float and converters should be different but both accepted expected["StrCol"] = expected["StrCol"].apply(str) actual = self.get_exceldf( - basename, 'Sheet1', converters={"StrCol": str}) + basename, ext, 'Sheet1', converters={"StrCol": str}) tm.assert_frame_equal(actual, expected) no_convert_float = float_expected.copy() no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) - actual = self.get_exceldf(basename, 'Sheet1', convert_float=False, + actual = self.get_exceldf(basename, ext, 'Sheet1', convert_float=False, converters={"StrCol": str}) tm.assert_frame_equal(actual, no_convert_float) # GH8212 - support for converters and missing values - def test_reader_converters(self): + def test_reader_converters(self, ext): basename = 'test_converters' @@ -378,13 +330,14 @@ def test_reader_converters(self): # should read in correctly and set types of single cells (not array # dtypes) - actual = self.get_exceldf(basename, 'Sheet1', converters=converters) + actual = self.get_exceldf(basename, ext, 'Sheet1', + converters=converters) tm.assert_frame_equal(actual, expected) - def test_reader_dtype(self): + def test_reader_dtype(self, ext): # GH 8212 basename = 'testdtype' - actual = self.get_exceldf(basename) + actual = self.get_exceldf(basename, ext) expected = DataFrame({ 'a': [1, 2, 3, 4], @@ -395,7 +348,7 @@ def test_reader_dtype(self): tm.assert_frame_equal(actual, expected) - actual = self.get_exceldf(basename, + actual = self.get_exceldf(basename, ext, dtype={'a': 'float64', 'b': 'float32', 'c': str}) @@ -406,14 +359,14 @@ def test_reader_dtype(self): tm.assert_frame_equal(actual, expected) with pytest.raises(ValueError): - actual = self.get_exceldf(basename, dtype={'d': 'int64'}) + actual = self.get_exceldf(basename, ext, dtype={'d': 'int64'}) - def test_reading_all_sheets(self): + def test_reading_all_sheets(self, ext): # Test reading all sheetnames by setting sheetname to None, # Ensure a dict is returned. # See PR #9450 basename = 'test_multisheet' - dfs = self.get_exceldf(basename, sheet_name=None) + dfs = self.get_exceldf(basename, ext, sheet_name=None) # ensure this is not alphabetical to test order preservation expected_keys = ['Charlie', 'Alpha', 'Beta'] tm.assert_contains_all(expected_keys, dfs.keys()) @@ -421,7 +374,7 @@ def test_reading_all_sheets(self): # Ensure sheet order is preserved assert expected_keys == list(dfs.keys()) - def test_reading_multiple_specific_sheets(self): + def test_reading_multiple_specific_sheets(self, ext): # Test reading specific sheetnames by specifying a mixed list # of integers and strings, and confirm that duplicated sheet # references (positions/names) are removed properly. 
@@ -430,42 +383,41 @@ def test_reading_multiple_specific_sheets(self): basename = 'test_multisheet' # Explicitly request duplicates. Only the set should be returned. expected_keys = [2, 'Charlie', 'Charlie'] - dfs = self.get_exceldf(basename, sheet_name=expected_keys) + dfs = self.get_exceldf(basename, ext, sheet_name=expected_keys) expected_keys = list(set(expected_keys)) tm.assert_contains_all(expected_keys, dfs.keys()) assert len(expected_keys) == len(dfs.keys()) - def test_reading_all_sheets_with_blank(self): + def test_reading_all_sheets_with_blank(self, ext): # Test reading all sheetnames by setting sheetname to None, # In the case where some sheets are blank. # Issue #11711 basename = 'blank_with_header' - dfs = self.get_exceldf(basename, sheet_name=None) + dfs = self.get_exceldf(basename, ext, sheet_name=None) expected_keys = ['Sheet1', 'Sheet2', 'Sheet3'] tm.assert_contains_all(expected_keys, dfs.keys()) # GH6403 - def test_read_excel_blank(self): - actual = self.get_exceldf('blank', 'Sheet1') + def test_read_excel_blank(self, ext): + actual = self.get_exceldf('blank', ext, 'Sheet1') tm.assert_frame_equal(actual, DataFrame()) - def test_read_excel_blank_with_header(self): + def test_read_excel_blank_with_header(self, ext): expected = DataFrame(columns=['col_1', 'col_2']) - actual = self.get_exceldf('blank_with_header', 'Sheet1') + actual = self.get_exceldf('blank_with_header', ext, 'Sheet1') tm.assert_frame_equal(actual, expected) + @td.skip_if_no('openpyxl') + @td.skip_if_no('xlwt') # GH 12292 : error when read one empty column from excel file - def test_read_one_empty_col_no_header(self): - _skip_if_no_xlwt() - _skip_if_no_openpyxl() - + def test_read_one_empty_col_no_header(self, ext): df = pd.DataFrame( [["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]] ) - with ensure_clean(self.ext) as path: + with ensure_clean(ext) as path: df.to_excel(path, 'no_header', index=False, header=False) actual_header_none = read_excel( path, @@ -484,17 +436,16 @@ def test_read_one_empty_col_no_header(self): tm.assert_frame_equal(actual_header_none, expected) tm.assert_frame_equal(actual_header_zero, expected) - def test_read_one_empty_col_with_header(self): - _skip_if_no_xlwt() - _skip_if_no_openpyxl() - + @td.skip_if_no('openpyxl') + @td.skip_if_no('xlwt') + def test_read_one_empty_col_with_header(self, ext): df = pd.DataFrame( [["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]] ) - with ensure_clean(self.ext) as path: + with ensure_clean(ext) as path: df.to_excel(path, 'with_header', index=False, header=True) actual_header_none = read_excel( path, @@ -514,16 +465,15 @@ def test_read_one_empty_col_with_header(self): expected_header_zero = DataFrame(columns=[0], dtype='int64') tm.assert_frame_equal(actual_header_zero, expected_header_zero) - def test_set_column_names_in_parameter(self): - _skip_if_no_xlwt() - _skip_if_no_openpyxl() - + @td.skip_if_no('openpyxl') + @td.skip_if_no('xlwt') + def test_set_column_names_in_parameter(self, ext): # GH 12870 : pass down column names associated with # keyword argument names refdf = pd.DataFrame([[1, 'foo'], [2, 'bar'], [3, 'baz']], columns=['a', 'b']) - with ensure_clean(self.ext) as pth: + with ensure_clean(ext) as pth: with ExcelWriter(pth) as writer: refdf.to_excel(writer, 'Data_no_head', header=False, index=False) @@ -540,42 +490,45 @@ def test_set_column_names_in_parameter(self): tm.assert_frame_equal(xlsdf_no_head, refdf) tm.assert_frame_equal(xlsdf_with_head, refdf) - def test_date_conversion_overflow(self): + def 
test_date_conversion_overflow(self, ext): # GH 10001 : pandas.ExcelFile ignore parse_dates=False expected = pd.DataFrame([[pd.Timestamp('2016-03-12'), 'Marc Johnson'], [pd.Timestamp('2016-03-16'), 'Jack Black'], [1e+20, 'Timothy Brown']], columns=['DateColWithBigInt', 'StringCol']) - result = self.get_exceldf('testdateoverflow') + result = self.get_exceldf('testdateoverflow', ext) tm.assert_frame_equal(result, expected) - def test_sheet_name_and_sheetname(self): + def test_sheet_name_and_sheetname(self, ext): # GH10559: Minor improvement: Change "sheet_name" to "sheetname" # GH10969: DOC: Consistent var names (sheetname vs sheet_name) # GH12604: CLN GH10559 Rename sheetname variable to sheet_name dfref = self.get_csv_refdf('test1') - df1 = self.get_exceldf('test1', sheet_name='Sheet1') # doc + df1 = self.get_exceldf('test1', ext, sheet_name='Sheet1') # doc with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df2 = self.get_exceldf('test1', sheetname='Sheet1') # bkwrd compat + df2 = self.get_exceldf('test1', ext, + sheetname='Sheet1') # bkwrd compat tm.assert_frame_equal(df1, dfref, check_names=False) tm.assert_frame_equal(df2, dfref, check_names=False) - def test_sheet_name_both_raises(self): + def test_sheet_name_both_raises(self, ext): with tm.assert_raises_regex(TypeError, "Cannot specify both"): - self.get_exceldf('test1', sheetname='Sheet1', sheet_name='Sheet1') + self.get_exceldf('test1', ext, sheetname='Sheet1', + sheet_name='Sheet1') -class XlrdTests(ReadingTestsBase): +@pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm']) +class TestXlrdReader(ReadingTestsBase): """ This is the base class for the xlrd tests, and 3 different file formats are supported: xls, xlsx, xlsm """ - def test_excel_read_buffer(self): + def test_excel_read_buffer(self, ext): - pth = os.path.join(self.dirpath, 'test1' + self.ext) + pth = os.path.join(self.dirpath, 'test1' + ext) expected = read_excel(pth, 'Sheet1', index_col=0) with open(pth, 'rb') as f: actual = read_excel(f, 'Sheet1', index_col=0) @@ -586,10 +539,10 @@ def test_excel_read_buffer(self): actual = read_excel(xls, 'Sheet1', index_col=0) tm.assert_frame_equal(expected, actual) - def test_read_xlrd_Book(self): - _skip_if_no_xlwt() - + @td.skip_if_no('xlwt') + def test_read_xlrd_Book(self, ext): import xlrd + df = self.frame with ensure_clean('.xls') as pth: df.to_excel(pth, "SheetA") @@ -603,39 +556,39 @@ def test_read_xlrd_Book(self): tm.assert_frame_equal(df, result) @tm.network - def test_read_from_http_url(self): + def test_read_from_http_url(self, ext): url = ('https://raw.github.com/pandas-dev/pandas/master/' - 'pandas/tests/io/data/test1' + self.ext) + 'pandas/tests/io/data/test1' + ext) url_table = read_excel(url) - local_table = self.get_exceldf('test1') + local_table = self.get_exceldf('test1', ext) tm.assert_frame_equal(url_table, local_table) - def test_read_from_s3_url(self): + @td.skip_if_no('s3fs') + def test_read_from_s3_url(self, ext): boto3 = pytest.importorskip('boto3') - pytest.importorskip('s3fs') moto = pytest.importorskip('moto') with moto.mock_s3(): conn = boto3.resource("s3", region_name="us-east-1") conn.create_bucket(Bucket="pandas-test") - file_name = os.path.join(self.dirpath, 'test1' + self.ext) + file_name = os.path.join(self.dirpath, 'test1' + ext) with open(file_name, 'rb') as f: - conn.Bucket("pandas-test").put_object(Key="test1" + self.ext, + conn.Bucket("pandas-test").put_object(Key="test1" + ext, Body=f) - url = ('s3://pandas-test/test1' + self.ext) + url = ('s3://pandas-test/test1' + 
ext) url_table = read_excel(url) - local_table = self.get_exceldf('test1') + local_table = self.get_exceldf('test1', ext) tm.assert_frame_equal(url_table, local_table) @pytest.mark.slow - def test_read_from_file_url(self): + def test_read_from_file_url(self, ext): # FILE if sys.version_info[:2] < (2, 6): pytest.skip("file:// not supported with Python < 2.6") - localtable = os.path.join(self.dirpath, 'test1' + self.ext) + localtable = os.path.join(self.dirpath, 'test1' + ext) local_table = read_excel(localtable) try: @@ -649,37 +602,37 @@ def test_read_from_file_url(self): tm.assert_frame_equal(url_table, local_table) @td.skip_if_no('pathlib') - def test_read_from_pathlib_path(self): + def test_read_from_pathlib_path(self, ext): # GH12655 from pathlib import Path - str_path = os.path.join(self.dirpath, 'test1' + self.ext) + str_path = os.path.join(self.dirpath, 'test1' + ext) expected = read_excel(str_path, 'Sheet1', index_col=0) - path_obj = Path(self.dirpath, 'test1' + self.ext) + path_obj = Path(self.dirpath, 'test1' + ext) actual = read_excel(path_obj, 'Sheet1', index_col=0) tm.assert_frame_equal(expected, actual) @td.skip_if_no('py.path') - def test_read_from_py_localpath(self): + def test_read_from_py_localpath(self, ext): # GH12655 from py.path import local as LocalPath - str_path = os.path.join(self.dirpath, 'test1' + self.ext) + str_path = os.path.join(self.dirpath, 'test1' + ext) expected = read_excel(str_path, 'Sheet1', index_col=0) abs_dir = os.path.abspath(self.dirpath) - path_obj = LocalPath(abs_dir).join('test1' + self.ext) + path_obj = LocalPath(abs_dir).join('test1' + ext) actual = read_excel(path_obj, 'Sheet1', index_col=0) tm.assert_frame_equal(expected, actual) - def test_reader_closes_file(self): + def test_reader_closes_file(self, ext): - pth = os.path.join(self.dirpath, 'test1' + self.ext) + pth = os.path.join(self.dirpath, 'test1' + ext) f = open(pth, 'rb') with ExcelFile(f) as xlsx: # parses okay @@ -687,14 +640,12 @@ def test_reader_closes_file(self): assert f.closed - def test_creating_and_reading_multiple_sheets(self): + @td.skip_if_no('openpyxl') + @td.skip_if_no('xlwt') + def test_creating_and_reading_multiple_sheets(self, ext): # Test reading multiple sheets, from a runtime created excel file # with multiple sheets. # See PR #9450 - - _skip_if_no_xlwt() - _skip_if_no_openpyxl() - def tdf(sheetname): d, i = [11, 22, 33], [1, 2, 3] return DataFrame(d, i, columns=[sheetname]) @@ -704,7 +655,7 @@ def tdf(sheetname): dfs = [tdf(s) for s in sheets] dfs = dict(zip(sheets, dfs)) - with ensure_clean(self.ext) as pth: + with ensure_clean(ext) as pth: with ExcelWriter(pth) as ew: for sheetname, df in iteritems(dfs): df.to_excel(ew, sheetname) @@ -712,10 +663,10 @@ def tdf(sheetname): for s in sheets: tm.assert_frame_equal(dfs[s], dfs_returned[s]) - def test_reader_seconds(self): - # Test reading times with and without milliseconds. GH5945. + def test_reader_seconds(self, ext): import xlrd + # Test reading times with and without milliseconds. GH5945. if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"): # Xlrd >= 0.9.3 can handle Excel milliseconds. 
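A note on the structure of the surrounding refactor: the per-extension subclasses (``TestXlsReaderTests`` and friends, removed further down) give way to one reader class parametrized over ``ext``, with each test taking ``ext`` as an argument instead of reading ``self.ext``. A minimal, hypothetical sketch of that pattern (names below are not from the patch):

    import pytest

    @pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm'])
    class TestToyReader(object):
        # The class-level parametrize feeds ``ext`` to every test method.
        def test_ext_is_a_suffix(self, ext):
            assert ext.startswith('.')
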
expected = DataFrame.from_dict({"Time": [time(1, 2, 3), @@ -743,16 +694,16 @@ def test_reader_seconds(self): time(16, 37, 1), time(18, 20, 54)]}) - actual = self.get_exceldf('times_1900', 'Sheet1') + actual = self.get_exceldf('times_1900', ext, 'Sheet1') tm.assert_frame_equal(actual, expected) - actual = self.get_exceldf('times_1904', 'Sheet1') + actual = self.get_exceldf('times_1904', ext, 'Sheet1') tm.assert_frame_equal(actual, expected) - def test_read_excel_multiindex(self): + def test_read_excel_multiindex(self, ext): # GH 4679 mi = MultiIndex.from_product([['foo', 'bar'], ['a', 'b']]) - mi_file = os.path.join(self.dirpath, 'testmultiindex' + self.ext) + mi_file = os.path.join(self.dirpath, 'testmultiindex' + ext) expected = DataFrame([[1, 2.5, pd.Timestamp('2015-01-01'), True], [2, 3.5, pd.Timestamp('2015-01-02'), False], @@ -806,9 +757,9 @@ def test_read_excel_multiindex(self): header=[0, 1], skiprows=2) tm.assert_frame_equal(actual, expected) - def test_read_excel_multiindex_empty_level(self): + @td.skip_if_no('xlsxwriter') + def test_read_excel_multiindex_empty_level(self, ext): # GH 12453 - _skip_if_no_xlsxwriter() with ensure_clean('.xlsx') as path: df = DataFrame({ ('Zero', ''): {0: 0}, @@ -846,9 +797,9 @@ def test_read_excel_multiindex_empty_level(self): actual = pd.read_excel(path, header=[0, 1]) tm.assert_frame_equal(actual, expected) - def test_excel_multindex_roundtrip(self): + @td.skip_if_no('xlsxwriter') + def test_excel_multindex_roundtrip(self, ext): # GH 4679 - _skip_if_no_xlsxwriter() with ensure_clean('.xlsx') as pth: for c_idx_names in [True, False]: for r_idx_names in [True, False]: @@ -891,9 +842,9 @@ def test_excel_multindex_roundtrip(self): tm.assert_frame_equal( df, act, check_names=check_names) - def test_excel_old_index_format(self): + def test_excel_old_index_format(self, ext): # see gh-4679 - filename = 'test_index_name_pre17' + self.ext + filename = 'test_index_name_pre17' + ext in_file = os.path.join(self.dirpath, filename) # We detect headers to determine if index names exist, so @@ -952,31 +903,30 @@ def test_excel_old_index_format(self): actual = pd.read_excel(in_file, 'multi_no_names', index_col=[0, 1]) tm.assert_frame_equal(actual, expected, check_names=False) - def test_read_excel_bool_header_arg(self): + def test_read_excel_bool_header_arg(self, ext): # GH 6114 for arg in [True, False]: with pytest.raises(TypeError): - pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext), + pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), header=arg) - def test_read_excel_chunksize(self): + def test_read_excel_chunksize(self, ext): # GH 8011 with pytest.raises(NotImplementedError): - pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext), + pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), chunksize=100) - def test_read_excel_parse_dates(self): + @td.skip_if_no('openpyxl') + @td.skip_if_no('xlwt') + def test_read_excel_parse_dates(self, ext): # GH 11544, 12051 - _skip_if_no_openpyxl() - _skip_if_no_xlwt() # for df2.to_excel - df = DataFrame( {'col': [1, 2, 3], 'date_strings': pd.date_range('2012-01-01', periods=3)}) df2 = df.copy() df2['date_strings'] = df2['date_strings'].dt.strftime('%m/%d/%Y') - with ensure_clean(self.ext) as pth: + with ensure_clean(ext) as pth: df2.to_excel(pth) res = read_excel(pth) @@ -995,10 +945,10 @@ def test_read_excel_parse_dates(self): date_parser=dateparser, index_col=0) tm.assert_frame_equal(df, res) - def test_read_excel_skiprows_list(self): + def test_read_excel_skiprows_list(self, ext): # GH 4903 
actual = pd.read_excel(os.path.join(self.dirpath, - 'testskiprows' + self.ext), + 'testskiprows' + ext), 'skiprows_list', skiprows=[0, 2]) expected = DataFrame([[1, 2.5, pd.Timestamp('2015-01-01'), True], [2, 3.5, pd.Timestamp('2015-01-02'), False], @@ -1008,40 +958,40 @@ def test_read_excel_skiprows_list(self): tm.assert_frame_equal(actual, expected) actual = pd.read_excel(os.path.join(self.dirpath, - 'testskiprows' + self.ext), + 'testskiprows' + ext), 'skiprows_list', skiprows=np.array([0, 2])) tm.assert_frame_equal(actual, expected) - def test_read_excel_nrows(self): + def test_read_excel_nrows(self, ext): # GH 16645 num_rows_to_pull = 5 - actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext), + actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), nrows=num_rows_to_pull) expected = pd.read_excel(os.path.join(self.dirpath, - 'test1' + self.ext)) + 'test1' + ext)) expected = expected[:num_rows_to_pull] tm.assert_frame_equal(actual, expected) - def test_read_excel_nrows_greater_than_nrows_in_file(self): + def test_read_excel_nrows_greater_than_nrows_in_file(self, ext): # GH 16645 expected = pd.read_excel(os.path.join(self.dirpath, - 'test1' + self.ext)) + 'test1' + ext)) num_records_in_file = len(expected) num_rows_to_pull = num_records_in_file + 10 - actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext), + actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), nrows=num_rows_to_pull) tm.assert_frame_equal(actual, expected) - def test_read_excel_nrows_non_integer_parameter(self): + def test_read_excel_nrows_non_integer_parameter(self, ext): # GH 16645 msg = "'nrows' must be an integer >=0" with tm.assert_raises_regex(ValueError, msg): - pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext), + pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), nrows='5') - def test_read_excel_squeeze(self): + def test_read_excel_squeeze(self, ext): # GH 12157 - f = os.path.join(self.dirpath, 'test_squeeze' + self.ext) + f = os.path.join(self.dirpath, 'test_squeeze' + ext) actual = pd.read_excel(f, 'two_columns', index_col=0, squeeze=True) expected = pd.Series([2, 3, 4], [4, 5, 6], name='b') @@ -1058,351 +1008,308 @@ def test_read_excel_squeeze(self): tm.assert_series_equal(actual, expected) -class TestXlsReaderTests(XlrdTests): - ext = '.xls' - engine_name = 'xlrd' - check_skip = staticmethod(_skip_if_no_xlrd) - - -class TestXlsxReaderTests(XlrdTests): - ext = '.xlsx' - engine_name = 'xlrd' - check_skip = staticmethod(_skip_if_no_xlrd) +class _WriterBase(SharedItems): + @pytest.fixture(autouse=True) + def set_engine_and_path(self, request, merge_cells, engine, ext): + """Fixture to set engine and open file for use in each test case -class TestXlsmReaderTests(XlrdTests): - ext = '.xlsm' - engine_name = 'xlrd' - check_skip = staticmethod(_skip_if_no_xlrd) + Rather than requiring `engine=...` to be provided explictly as an + argument in each test, this fixture sets a global option to dictate + which engine should be used to write Excel files. After executing + the test it rolls back said change to the global option. 
+ It also uses a context manager to open a temporary excel file for + the function to write to, accessible via `self.path` -class ExcelWriterBase(SharedItems): + Notes + ----- + This fixture will run as part of each test method defined in the + class and any subclasses, on account of the `autouse=True` + argument + """ + option_name = 'io.excel.{ext}.writer'.format(ext=ext.strip('.')) + prev_engine = get_option(option_name) + set_option(option_name, engine) + with ensure_clean(ext) as path: + self.path = path + yield + set_option(option_name, prev_engine) # Roll back option change + + +@pytest.mark.parametrize("merge_cells", [True, False]) +@pytest.mark.parametrize("engine,ext", [ + pytest.param('openpyxl', '.xlsx', marks=pytest.mark.skipif( + not td.safe_import('openpyxl'), reason='No openpyxl')), + pytest.param('openpyxl', '.xlsm', marks=pytest.mark.skipif( + not td.safe_import('openpyxl'), reason='No openpyxl')), + pytest.param('xlwt', '.xls', marks=pytest.mark.skipif( + not td.safe_import('xlwt'), reason='No xlwt')), + pytest.param('xlsxwriter', '.xlsx', marks=pytest.mark.skipif( + not td.safe_import('xlsxwriter'), reason='No xlsxwriter')) +]) +class TestExcelWriter(_WriterBase): # Base class for test cases to run with different Excel writers. - # To add a writer test, define the following: - # 1. A check_skip function that skips your tests if your writer isn't - # installed. - # 2. Add a property ext, which is the file extension that your writer - # writes to. (needs to start with '.' so it's a valid path) - # 3. Add a property engine_name, which is the name of the writer class. - - # Test with MultiIndex and Hierarchical Rows as merged cells. - merge_cells = True - - def setup_method(self, method): - self.check_skip() - super(ExcelWriterBase, self).setup_method(method) - self.option_name = 'io.excel.%s.writer' % self.ext.strip('.') - self.prev_engine = get_option(self.option_name) - set_option(self.option_name, self.engine_name) - - def teardown_method(self, method): - set_option(self.option_name, self.prev_engine) - def test_excel_sheet_by_name_raise(self): - _skip_if_no_xlrd() + def test_excel_sheet_by_name_raise(self, merge_cells, engine, ext): import xlrd - with ensure_clean(self.ext) as pth: - gt = DataFrame(np.random.randn(10, 2)) - gt.to_excel(pth) - xl = ExcelFile(pth) - df = read_excel(xl, 0) - tm.assert_frame_equal(gt, df) - - with pytest.raises(xlrd.XLRDError): - read_excel(xl, '0') - - def test_excelwriter_contextmanager(self): - _skip_if_no_xlrd() - - with ensure_clean(self.ext) as pth: - with ExcelWriter(pth) as writer: - self.frame.to_excel(writer, 'Data1') - self.frame2.to_excel(writer, 'Data2') - - with ExcelFile(pth) as reader: - found_df = read_excel(reader, 'Data1') - found_df2 = read_excel(reader, 'Data2') - tm.assert_frame_equal(found_df, self.frame) - tm.assert_frame_equal(found_df2, self.frame2) - - def test_roundtrip(self): - _skip_if_no_xlrd() - - with ensure_clean(self.ext) as path: - self.frame['A'][:5] = nan - - self.frame.to_excel(path, 'test1') - self.frame.to_excel(path, 'test1', columns=['A', 'B']) - self.frame.to_excel(path, 'test1', header=False) - self.frame.to_excel(path, 'test1', index=False) - - # test roundtrip - self.frame.to_excel(path, 'test1') - recons = read_excel(path, 'test1', index_col=0) - tm.assert_frame_equal(self.frame, recons) - - self.frame.to_excel(path, 'test1', index=False) - recons = read_excel(path, 'test1', index_col=None) - recons.index = self.frame.index - tm.assert_frame_equal(self.frame, recons) - - 
self.frame.to_excel(path, 'test1', na_rep='NA') - recons = read_excel(path, 'test1', index_col=0, na_values=['NA']) - tm.assert_frame_equal(self.frame, recons) - - # GH 3611 - self.frame.to_excel(path, 'test1', na_rep='88') - recons = read_excel(path, 'test1', index_col=0, na_values=['88']) - tm.assert_frame_equal(self.frame, recons) - - self.frame.to_excel(path, 'test1', na_rep='88') - recons = read_excel(path, 'test1', index_col=0, - na_values=[88, 88.0]) - tm.assert_frame_equal(self.frame, recons) - - # GH 6573 - self.frame.to_excel(path, 'Sheet1') - recons = read_excel(path, index_col=0) - tm.assert_frame_equal(self.frame, recons) - - self.frame.to_excel(path, '0') - recons = read_excel(path, index_col=0) - tm.assert_frame_equal(self.frame, recons) - - # GH 8825 Pandas Series should provide to_excel method - s = self.frame["A"] - s.to_excel(path) - recons = read_excel(path, index_col=0) - tm.assert_frame_equal(s.to_frame(), recons) - - def test_mixed(self): - _skip_if_no_xlrd() - - with ensure_clean(self.ext) as path: - self.mixed_frame.to_excel(path, 'test1') - reader = ExcelFile(path) - recons = read_excel(reader, 'test1', index_col=0) - tm.assert_frame_equal(self.mixed_frame, recons) - - def test_tsframe(self): - _skip_if_no_xlrd() + gt = DataFrame(np.random.randn(10, 2)) + gt.to_excel(self.path) + xl = ExcelFile(self.path) + df = read_excel(xl, 0) + tm.assert_frame_equal(gt, df) + with pytest.raises(xlrd.XLRDError): + read_excel(xl, '0') + + def test_excelwriter_contextmanager(self, merge_cells, engine, ext): + with ExcelWriter(self.path) as writer: + self.frame.to_excel(writer, 'Data1') + self.frame2.to_excel(writer, 'Data2') + + with ExcelFile(self.path) as reader: + found_df = read_excel(reader, 'Data1') + found_df2 = read_excel(reader, 'Data2') + tm.assert_frame_equal(found_df, self.frame) + tm.assert_frame_equal(found_df2, self.frame2) + + def test_roundtrip(self, merge_cells, engine, ext): + self.frame['A'][:5] = nan + + self.frame.to_excel(self.path, 'test1') + self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) + self.frame.to_excel(self.path, 'test1', header=False) + self.frame.to_excel(self.path, 'test1', index=False) + + # test roundtrip + self.frame.to_excel(self.path, 'test1') + recons = read_excel(self.path, 'test1', index_col=0) + tm.assert_frame_equal(self.frame, recons) + + self.frame.to_excel(self.path, 'test1', index=False) + recons = read_excel(self.path, 'test1', index_col=None) + recons.index = self.frame.index + tm.assert_frame_equal(self.frame, recons) + + self.frame.to_excel(self.path, 'test1', na_rep='NA') + recons = read_excel(self.path, 'test1', index_col=0, na_values=['NA']) + tm.assert_frame_equal(self.frame, recons) + + # GH 3611 + self.frame.to_excel(self.path, 'test1', na_rep='88') + recons = read_excel(self.path, 'test1', index_col=0, na_values=['88']) + tm.assert_frame_equal(self.frame, recons) + + self.frame.to_excel(self.path, 'test1', na_rep='88') + recons = read_excel(self.path, 'test1', index_col=0, + na_values=[88, 88.0]) + tm.assert_frame_equal(self.frame, recons) + + # GH 6573 + self.frame.to_excel(self.path, 'Sheet1') + recons = read_excel(self.path, index_col=0) + tm.assert_frame_equal(self.frame, recons) + + self.frame.to_excel(self.path, '0') + recons = read_excel(self.path, index_col=0) + tm.assert_frame_equal(self.frame, recons) + + # GH 8825 Pandas Series should provide to_excel method + s = self.frame["A"] + s.to_excel(self.path) + recons = read_excel(self.path, index_col=0) + tm.assert_frame_equal(s.to_frame(), recons) + + 
def test_mixed(self, merge_cells, engine, ext): + self.mixed_frame.to_excel(self.path, 'test1') + reader = ExcelFile(self.path) + recons = read_excel(reader, 'test1', index_col=0) + tm.assert_frame_equal(self.mixed_frame, recons) + + def test_tsframe(self, merge_cells, engine, ext): df = tm.makeTimeDataFrame()[:5] - with ensure_clean(self.ext) as path: - df.to_excel(path, 'test1') - reader = ExcelFile(path) - recons = read_excel(reader, 'test1') - tm.assert_frame_equal(df, recons) - - def test_basics_with_nan(self): - _skip_if_no_xlrd() - with ensure_clean(self.ext) as path: - self.frame['A'][:5] = nan - self.frame.to_excel(path, 'test1') - self.frame.to_excel(path, 'test1', columns=['A', 'B']) - self.frame.to_excel(path, 'test1', header=False) - self.frame.to_excel(path, 'test1', index=False) - - def test_int_types(self): - _skip_if_no_xlrd() - - for np_type in (np.int8, np.int16, np.int32, np.int64): - - with ensure_clean(self.ext) as path: - # Test np.int values read come back as int (rather than float - # which is Excel's format). - frame = DataFrame(np.random.randint(-10, 10, size=(10, 2)), - dtype=np_type) - frame.to_excel(path, 'test1') - reader = ExcelFile(path) - recons = read_excel(reader, 'test1') - int_frame = frame.astype(np.int64) - tm.assert_frame_equal(int_frame, recons) - recons2 = read_excel(path, 'test1') - tm.assert_frame_equal(int_frame, recons2) - - # test with convert_float=False comes back as float - float_frame = frame.astype(float) - recons = read_excel(path, 'test1', convert_float=False) - tm.assert_frame_equal(recons, float_frame, - check_index_type=False, - check_column_type=False) - - def test_float_types(self): - _skip_if_no_xlrd() - - for np_type in (np.float16, np.float32, np.float64): - with ensure_clean(self.ext) as path: - # Test np.float values read come back as float. - frame = DataFrame(np.random.random_sample(10), dtype=np_type) - frame.to_excel(path, 'test1') - reader = ExcelFile(path) - recons = read_excel(reader, 'test1').astype(np_type) - tm.assert_frame_equal(frame, recons, check_dtype=False) - - def test_bool_types(self): - _skip_if_no_xlrd() - - for np_type in (np.bool8, np.bool_): - with ensure_clean(self.ext) as path: - # Test np.bool values read come back as float. - frame = (DataFrame([1, 0, True, False], dtype=np_type)) - frame.to_excel(path, 'test1') - reader = ExcelFile(path) - recons = read_excel(reader, 'test1').astype(np_type) - tm.assert_frame_equal(frame, recons) - - def test_inf_roundtrip(self): - _skip_if_no_xlrd() - + df.to_excel(self.path, 'test1') + reader = ExcelFile(self.path) + recons = read_excel(reader, 'test1') + tm.assert_frame_equal(df, recons) + + def test_basics_with_nan(self, merge_cells, engine, ext): + self.frame['A'][:5] = nan + self.frame.to_excel(self.path, 'test1') + self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) + self.frame.to_excel(self.path, 'test1', header=False) + self.frame.to_excel(self.path, 'test1', index=False) + + @pytest.mark.parametrize("np_type", [ + np.int8, np.int16, np.int32, np.int64]) + def test_int_types(self, merge_cells, engine, ext, np_type): + # Test np.int values read come back as int (rather than float + # which is Excel's format). 
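# --- Illustrative sketch (assumptions, not taken from the patch): the option
# --- round-trip that the ``set_engine_and_path`` fixture docstring above
# --- describes, written out by hand.  The engine choice and scratch file name
# --- are invented for illustration and assume xlsxwriter is installed.
import pandas as pd

opt = 'io.excel.xlsx.writer'
prev = pd.get_option(opt)
pd.set_option(opt, 'xlsxwriter')    # plain DataFrame.to_excel now uses xlsxwriter
try:
    pd.DataFrame({'A': [1, 2]}).to_excel('scratch.xlsx')
finally:
    pd.set_option(opt, prev)        # roll back, as the fixture does after yield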
+ frame = DataFrame(np.random.randint(-10, 10, size=(10, 2)), + dtype=np_type) + frame.to_excel(self.path, 'test1') + reader = ExcelFile(self.path) + recons = read_excel(reader, 'test1') + int_frame = frame.astype(np.int64) + tm.assert_frame_equal(int_frame, recons) + recons2 = read_excel(self.path, 'test1') + tm.assert_frame_equal(int_frame, recons2) + + # test with convert_float=False comes back as float + float_frame = frame.astype(float) + recons = read_excel(self.path, 'test1', convert_float=False) + tm.assert_frame_equal(recons, float_frame, + check_index_type=False, + check_column_type=False) + + @pytest.mark.parametrize("np_type", [ + np.float16, np.float32, np.float64]) + def test_float_types(self, merge_cells, engine, ext, np_type): + # Test np.float values read come back as float. + frame = DataFrame(np.random.random_sample(10), dtype=np_type) + frame.to_excel(self.path, 'test1') + reader = ExcelFile(self.path) + recons = read_excel(reader, 'test1').astype(np_type) + tm.assert_frame_equal(frame, recons, check_dtype=False) + + @pytest.mark.parametrize("np_type", [np.bool8, np.bool_]) + def test_bool_types(self, merge_cells, engine, ext, np_type): + # Test np.bool values read come back as float. + frame = (DataFrame([1, 0, True, False], dtype=np_type)) + frame.to_excel(self.path, 'test1') + reader = ExcelFile(self.path) + recons = read_excel(reader, 'test1').astype(np_type) + tm.assert_frame_equal(frame, recons) + + def test_inf_roundtrip(self, merge_cells, engine, ext): frame = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)]) - with ensure_clean(self.ext) as path: - frame.to_excel(path, 'test1') - reader = ExcelFile(path) - recons = read_excel(reader, 'test1') - tm.assert_frame_equal(frame, recons) - - def test_sheets(self): - _skip_if_no_xlrd() - - with ensure_clean(self.ext) as path: - self.frame['A'][:5] = nan - - self.frame.to_excel(path, 'test1') - self.frame.to_excel(path, 'test1', columns=['A', 'B']) - self.frame.to_excel(path, 'test1', header=False) - self.frame.to_excel(path, 'test1', index=False) - - # Test writing to separate sheets - writer = ExcelWriter(path) - self.frame.to_excel(writer, 'test1') - self.tsframe.to_excel(writer, 'test2') - writer.save() - reader = ExcelFile(path) - recons = read_excel(reader, 'test1', index_col=0) - tm.assert_frame_equal(self.frame, recons) - recons = read_excel(reader, 'test2', index_col=0) - tm.assert_frame_equal(self.tsframe, recons) - assert 2 == len(reader.sheet_names) - assert 'test1' == reader.sheet_names[0] - assert 'test2' == reader.sheet_names[1] - - def test_colaliases(self): - _skip_if_no_xlrd() - - with ensure_clean(self.ext) as path: - self.frame['A'][:5] = nan - - self.frame.to_excel(path, 'test1') - self.frame.to_excel(path, 'test1', columns=['A', 'B']) - self.frame.to_excel(path, 'test1', header=False) - self.frame.to_excel(path, 'test1', index=False) - - # column aliases - col_aliases = Index(['AA', 'X', 'Y', 'Z']) - self.frame2.to_excel(path, 'test1', header=col_aliases) - reader = ExcelFile(path) - rs = read_excel(reader, 'test1', index_col=0) - xp = self.frame2.copy() - xp.columns = col_aliases - tm.assert_frame_equal(xp, rs) - - def test_roundtrip_indexlabels(self): - _skip_if_no_xlrd() - - with ensure_clean(self.ext) as path: - - self.frame['A'][:5] = nan - - self.frame.to_excel(path, 'test1') - self.frame.to_excel(path, 'test1', columns=['A', 'B']) - self.frame.to_excel(path, 'test1', header=False) - self.frame.to_excel(path, 'test1', index=False) - - # test index_label - frame = (DataFrame(np.random.randn(10, 
2)) >= 0) - frame.to_excel(path, 'test1', - index_label=['test'], - merge_cells=self.merge_cells) - reader = ExcelFile(path) - recons = read_excel(reader, 'test1', - index_col=0, - ).astype(np.int64) - frame.index.names = ['test'] - assert frame.index.names == recons.index.names - - frame = (DataFrame(np.random.randn(10, 2)) >= 0) - frame.to_excel(path, - 'test1', - index_label=['test', 'dummy', 'dummy2'], - merge_cells=self.merge_cells) - reader = ExcelFile(path) - recons = read_excel(reader, 'test1', - index_col=0, - ).astype(np.int64) - frame.index.names = ['test'] - assert frame.index.names == recons.index.names - - frame = (DataFrame(np.random.randn(10, 2)) >= 0) - frame.to_excel(path, - 'test1', - index_label='test', - merge_cells=self.merge_cells) - reader = ExcelFile(path) - recons = read_excel(reader, 'test1', - index_col=0, - ).astype(np.int64) - frame.index.names = ['test'] - tm.assert_frame_equal(frame, recons.astype(bool)) - - with ensure_clean(self.ext) as path: - - self.frame.to_excel(path, - 'test1', - columns=['A', 'B', 'C', 'D'], - index=False, merge_cells=self.merge_cells) - # take 'A' and 'B' as indexes (same row as cols 'C', 'D') - df = self.frame.copy() - df = df.set_index(['A', 'B']) - - reader = ExcelFile(path) - recons = read_excel(reader, 'test1', index_col=[0, 1]) - tm.assert_frame_equal(df, recons, check_less_precise=True) - - def test_excel_roundtrip_indexname(self): - _skip_if_no_xlrd() - + frame.to_excel(self.path, 'test1') + reader = ExcelFile(self.path) + recons = read_excel(reader, 'test1') + tm.assert_frame_equal(frame, recons) + + def test_sheets(self, merge_cells, engine, ext): + self.frame['A'][:5] = nan + + self.frame.to_excel(self.path, 'test1') + self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) + self.frame.to_excel(self.path, 'test1', header=False) + self.frame.to_excel(self.path, 'test1', index=False) + + # Test writing to separate sheets + writer = ExcelWriter(self.path) + self.frame.to_excel(writer, 'test1') + self.tsframe.to_excel(writer, 'test2') + writer.save() + reader = ExcelFile(self.path) + recons = read_excel(reader, 'test1', index_col=0) + tm.assert_frame_equal(self.frame, recons) + recons = read_excel(reader, 'test2', index_col=0) + tm.assert_frame_equal(self.tsframe, recons) + assert 2 == len(reader.sheet_names) + assert 'test1' == reader.sheet_names[0] + assert 'test2' == reader.sheet_names[1] + + def test_colaliases(self, merge_cells, engine, ext): + self.frame['A'][:5] = nan + + self.frame.to_excel(self.path, 'test1') + self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) + self.frame.to_excel(self.path, 'test1', header=False) + self.frame.to_excel(self.path, 'test1', index=False) + + # column aliases + col_aliases = Index(['AA', 'X', 'Y', 'Z']) + self.frame2.to_excel(self.path, 'test1', header=col_aliases) + reader = ExcelFile(self.path) + rs = read_excel(reader, 'test1', index_col=0) + xp = self.frame2.copy() + xp.columns = col_aliases + tm.assert_frame_equal(xp, rs) + + def test_roundtrip_indexlabels(self, merge_cells, engine, ext): + self.frame['A'][:5] = nan + + self.frame.to_excel(self.path, 'test1') + self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) + self.frame.to_excel(self.path, 'test1', header=False) + self.frame.to_excel(self.path, 'test1', index=False) + + # test index_label + frame = (DataFrame(np.random.randn(10, 2)) >= 0) + frame.to_excel(self.path, 'test1', + index_label=['test'], + merge_cells=merge_cells) + reader = ExcelFile(self.path) + recons = read_excel(reader, 'test1', + 
index_col=0, + ).astype(np.int64) + frame.index.names = ['test'] + assert frame.index.names == recons.index.names + + frame = (DataFrame(np.random.randn(10, 2)) >= 0) + frame.to_excel(self.path, + 'test1', + index_label=['test', 'dummy', 'dummy2'], + merge_cells=merge_cells) + reader = ExcelFile(self.path) + recons = read_excel(reader, 'test1', + index_col=0, + ).astype(np.int64) + frame.index.names = ['test'] + assert frame.index.names == recons.index.names + + frame = (DataFrame(np.random.randn(10, 2)) >= 0) + frame.to_excel(self.path, + 'test1', + index_label='test', + merge_cells=merge_cells) + reader = ExcelFile(self.path) + recons = read_excel(reader, 'test1', + index_col=0, + ).astype(np.int64) + frame.index.names = ['test'] + tm.assert_frame_equal(frame, recons.astype(bool)) + + self.frame.to_excel(self.path, + 'test1', + columns=['A', 'B', 'C', 'D'], + index=False, merge_cells=merge_cells) + # take 'A' and 'B' as indexes (same row as cols 'C', 'D') + df = self.frame.copy() + df = df.set_index(['A', 'B']) + + reader = ExcelFile(self.path) + recons = read_excel(reader, 'test1', index_col=[0, 1]) + tm.assert_frame_equal(df, recons, check_less_precise=True) + + def test_excel_roundtrip_indexname(self, merge_cells, engine, ext): df = DataFrame(np.random.randn(10, 4)) df.index.name = 'foo' - with ensure_clean(self.ext) as path: - df.to_excel(path, merge_cells=self.merge_cells) + df.to_excel(self.path, merge_cells=merge_cells) - xf = ExcelFile(path) - result = read_excel(xf, xf.sheet_names[0], - index_col=0) + xf = ExcelFile(self.path) + result = read_excel(xf, xf.sheet_names[0], + index_col=0) - tm.assert_frame_equal(result, df) - assert result.index.name == 'foo' - - def test_excel_roundtrip_datetime(self): - _skip_if_no_xlrd() + tm.assert_frame_equal(result, df) + assert result.index.name == 'foo' + def test_excel_roundtrip_datetime(self, merge_cells, engine, ext): # datetime.date, not sure what to test here exactly tsf = self.tsframe.copy() - with ensure_clean(self.ext) as path: - tsf.index = [x.date() for x in self.tsframe.index] - tsf.to_excel(path, 'test1', merge_cells=self.merge_cells) - reader = ExcelFile(path) - recons = read_excel(reader, 'test1') - tm.assert_frame_equal(self.tsframe, recons) + tsf.index = [x.date() for x in self.tsframe.index] + tsf.to_excel(self.path, 'test1', merge_cells=merge_cells) + reader = ExcelFile(self.path) + recons = read_excel(reader, 'test1') + tm.assert_frame_equal(self.tsframe, recons) # GH4133 - excel output format strings - def test_excel_date_datetime_format(self): - _skip_if_no_xlrd() + def test_excel_date_datetime_format(self, merge_cells, engine, ext): df = DataFrame([[date(2014, 1, 31), date(1999, 9, 24)], [datetime(1998, 5, 26, 23, 33, 4), @@ -1414,133 +1321,117 @@ def test_excel_date_datetime_format(self): datetime(2014, 2, 28, 13, 5, 13)]], index=['DATE', 'DATETIME'], columns=['X', 'Y']) - with ensure_clean(self.ext) as filename1: - with ensure_clean(self.ext) as filename2: - writer1 = ExcelWriter(filename1) - writer2 = ExcelWriter(filename2, - date_format='DD.MM.YYYY', - datetime_format='DD.MM.YYYY HH-MM-SS') + with ensure_clean(ext) as filename2: + writer1 = ExcelWriter(self.path) + writer2 = ExcelWriter(filename2, + date_format='DD.MM.YYYY', + datetime_format='DD.MM.YYYY HH-MM-SS') - df.to_excel(writer1, 'test1') - df.to_excel(writer2, 'test1') + df.to_excel(writer1, 'test1') + df.to_excel(writer2, 'test1') - writer1.close() - writer2.close() + writer1.close() + writer2.close() - reader1 = ExcelFile(filename1) - reader2 = 
ExcelFile(filename2) + reader1 = ExcelFile(self.path) + reader2 = ExcelFile(filename2) - rs1 = read_excel(reader1, 'test1', index_col=None) - rs2 = read_excel(reader2, 'test1', index_col=None) + rs1 = read_excel(reader1, 'test1', index_col=None) + rs2 = read_excel(reader2, 'test1', index_col=None) - tm.assert_frame_equal(rs1, rs2) + tm.assert_frame_equal(rs1, rs2) - # since the reader returns a datetime object for dates, we need - # to use df_expected to check the result - tm.assert_frame_equal(rs2, df_expected) + # since the reader returns a datetime object for dates, we need + # to use df_expected to check the result + tm.assert_frame_equal(rs2, df_expected) - def test_to_excel_interval_no_labels(self): + def test_to_excel_interval_no_labels(self, merge_cells, engine, ext): # GH19242 - test writing Interval without labels - _skip_if_no_xlrd() - - with ensure_clean(self.ext) as path: - frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), - dtype=np.int64) - expected = frame.copy() - frame['new'] = pd.cut(frame[0], 10) - expected['new'] = pd.cut(expected[0], 10).astype(str) - frame.to_excel(path, 'test1') - reader = ExcelFile(path) - recons = read_excel(reader, 'test1') - tm.assert_frame_equal(expected, recons) - - def test_to_excel_interval_labels(self): + frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), + dtype=np.int64) + expected = frame.copy() + frame['new'] = pd.cut(frame[0], 10) + expected['new'] = pd.cut(expected[0], 10).astype(str) + frame.to_excel(self.path, 'test1') + reader = ExcelFile(self.path) + recons = read_excel(reader, 'test1') + tm.assert_frame_equal(expected, recons) + + def test_to_excel_interval_labels(self, merge_cells, engine, ext): # GH19242 - test writing Interval with labels - _skip_if_no_xlrd() - - with ensure_clean(self.ext) as path: - frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), - dtype=np.int64) - expected = frame.copy() - intervals = pd.cut(frame[0], 10, labels=['A', 'B', 'C', 'D', 'E', - 'F', 'G', 'H', 'I', 'J']) - frame['new'] = intervals - expected['new'] = pd.Series(list(intervals)) - frame.to_excel(path, 'test1') - reader = ExcelFile(path) - recons = read_excel(reader, 'test1') - tm.assert_frame_equal(expected, recons) - - def test_to_excel_timedelta(self): + frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), + dtype=np.int64) + expected = frame.copy() + intervals = pd.cut(frame[0], 10, labels=['A', 'B', 'C', 'D', 'E', + 'F', 'G', 'H', 'I', 'J']) + frame['new'] = intervals + expected['new'] = pd.Series(list(intervals)) + frame.to_excel(self.path, 'test1') + reader = ExcelFile(self.path) + recons = read_excel(reader, 'test1') + tm.assert_frame_equal(expected, recons) + + def test_to_excel_timedelta(self, merge_cells, engine, ext): # GH 19242, GH9155 - test writing timedelta to xls - _skip_if_no_xlrd() - - with ensure_clean('.xls') as path: - frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), - columns=['A'], - dtype=np.int64 - ) - expected = frame.copy() - frame['new'] = frame['A'].apply(lambda x: timedelta(seconds=x)) - expected['new'] = expected['A'].apply( - lambda x: timedelta(seconds=x).total_seconds() / float(86400)) - frame.to_excel(path, 'test1') - reader = ExcelFile(path) - recons = read_excel(reader, 'test1') - tm.assert_frame_equal(expected, recons) - - def test_to_excel_periodindex(self): - _skip_if_no_xlrd() - + if engine == 'openpyxl': + pytest.xfail('Timedelta roundtrip broken with openpyxl') + if engine == 'xlsxwriter' and (sys.version_info[0] == 2 and + 
sys.platform.startswith('linux')): + pytest.xfail('Not working on linux with Py2 and xlsxwriter') + frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), + columns=['A'], + dtype=np.int64 + ) + expected = frame.copy() + frame['new'] = frame['A'].apply(lambda x: timedelta(seconds=x)) + expected['new'] = expected['A'].apply( + lambda x: timedelta(seconds=x).total_seconds() / float(86400)) + frame.to_excel(self.path, 'test1') + reader = ExcelFile(self.path) + recons = read_excel(reader, 'test1') + tm.assert_frame_equal(expected, recons) + + def test_to_excel_periodindex(self, merge_cells, engine, ext): frame = self.tsframe xp = frame.resample('M', kind='period').mean() - with ensure_clean(self.ext) as path: - xp.to_excel(path, 'sht1') - - reader = ExcelFile(path) - rs = read_excel(reader, 'sht1', index_col=0) - tm.assert_frame_equal(xp, rs.to_period('M')) + xp.to_excel(self.path, 'sht1') - def test_to_excel_multiindex(self): - _skip_if_no_xlrd() + reader = ExcelFile(self.path) + rs = read_excel(reader, 'sht1', index_col=0) + tm.assert_frame_equal(xp, rs.to_period('M')) + def test_to_excel_multiindex(self, merge_cells, engine, ext): frame = self.frame arrays = np.arange(len(frame.index) * 2).reshape(2, -1) new_index = MultiIndex.from_arrays(arrays, names=['first', 'second']) frame.index = new_index - with ensure_clean(self.ext) as path: - frame.to_excel(path, 'test1', header=False) - frame.to_excel(path, 'test1', columns=['A', 'B']) + frame.to_excel(self.path, 'test1', header=False) + frame.to_excel(self.path, 'test1', columns=['A', 'B']) - # round trip - frame.to_excel(path, 'test1', merge_cells=self.merge_cells) - reader = ExcelFile(path) - df = read_excel(reader, 'test1', index_col=[0, 1]) - tm.assert_frame_equal(frame, df) + # round trip + frame.to_excel(self.path, 'test1', merge_cells=merge_cells) + reader = ExcelFile(self.path) + df = read_excel(reader, 'test1', index_col=[0, 1]) + tm.assert_frame_equal(frame, df) # GH13511 - def test_to_excel_multiindex_nan_label(self): - _skip_if_no_xlrd() - + def test_to_excel_multiindex_nan_label(self, merge_cells, engine, ext): frame = pd.DataFrame({'A': [None, 2, 3], 'B': [10, 20, 30], 'C': np.random.sample(3)}) frame = frame.set_index(['A', 'B']) - with ensure_clean(self.ext) as path: - frame.to_excel(path, merge_cells=self.merge_cells) - df = read_excel(path, index_col=[0, 1]) - tm.assert_frame_equal(frame, df) + frame.to_excel(self.path, merge_cells=merge_cells) + df = read_excel(self.path, index_col=[0, 1]) + tm.assert_frame_equal(frame, df) # Test for Issue 11328. 
If column indices are integers, make # sure they are handled correctly for either setting of # merge_cells - def test_to_excel_multiindex_cols(self): - _skip_if_no_xlrd() - + def test_to_excel_multiindex_cols(self, merge_cells, engine, ext): frame = self.frame arrays = np.arange(len(frame.index) * 2).reshape(2, -1) new_index = MultiIndex.from_arrays(arrays, @@ -1551,42 +1442,37 @@ def test_to_excel_multiindex_cols(self): (50, 1), (50, 2)]) frame.columns = new_cols_index header = [0, 1] - if not self.merge_cells: + if not merge_cells: header = 0 - with ensure_clean(self.ext) as path: - # round trip - frame.to_excel(path, 'test1', merge_cells=self.merge_cells) - reader = ExcelFile(path) - df = read_excel(reader, 'test1', header=header, - index_col=[0, 1]) - if not self.merge_cells: - fm = frame.columns.format(sparsify=False, - adjoin=False, names=False) - frame.columns = [".".join(map(str, q)) for q in zip(*fm)] - tm.assert_frame_equal(frame, df) - - def test_to_excel_multiindex_dates(self): - _skip_if_no_xlrd() - + # round trip + frame.to_excel(self.path, 'test1', merge_cells=merge_cells) + reader = ExcelFile(self.path) + df = read_excel(reader, 'test1', header=header, + index_col=[0, 1]) + if not merge_cells: + fm = frame.columns.format(sparsify=False, + adjoin=False, names=False) + frame.columns = [".".join(map(str, q)) for q in zip(*fm)] + tm.assert_frame_equal(frame, df) + + def test_to_excel_multiindex_dates(self, merge_cells, engine, ext): # try multiindex with dates tsframe = self.tsframe.copy() new_index = [tsframe.index, np.arange(len(tsframe.index))] tsframe.index = MultiIndex.from_arrays(new_index) - with ensure_clean(self.ext) as path: - tsframe.index.names = ['time', 'foo'] - tsframe.to_excel(path, 'test1', merge_cells=self.merge_cells) - reader = ExcelFile(path) - recons = read_excel(reader, 'test1', - index_col=[0, 1]) - - tm.assert_frame_equal(tsframe, recons) - assert recons.index.names == ('time', 'foo') + tsframe.index.names = ['time', 'foo'] + tsframe.to_excel(self.path, 'test1', merge_cells=merge_cells) + reader = ExcelFile(self.path) + recons = read_excel(reader, 'test1', + index_col=[0, 1]) - def test_to_excel_multiindex_no_write_index(self): - _skip_if_no_xlrd() + tm.assert_frame_equal(tsframe, recons) + assert recons.index.names == ('time', 'foo') + def test_to_excel_multiindex_no_write_index(self, merge_cells, engine, + ext): # Test writing and re-reading a MI witout the index. GH 5616. # Initial non-MI frame. @@ -1597,53 +1483,44 @@ def test_to_excel_multiindex_no_write_index(self): multi_index = MultiIndex.from_tuples([(70, 80), (90, 100)]) frame2.index = multi_index - with ensure_clean(self.ext) as path: + # Write out to Excel without the index. + frame2.to_excel(self.path, 'test1', index=False) - # Write out to Excel without the index. - frame2.to_excel(path, 'test1', index=False) + # Read it back in. + reader = ExcelFile(self.path) + frame3 = read_excel(reader, 'test1') - # Read it back in. - reader = ExcelFile(path) - frame3 = read_excel(reader, 'test1') - - # Test that it is the same as the initial frame. - tm.assert_frame_equal(frame1, frame3) - - def test_to_excel_float_format(self): - _skip_if_no_xlrd() + # Test that it is the same as the initial frame. 
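# --- Illustrative sketch (assumptions, not taken from the patch): the column
# --- flattening applied in test_to_excel_multiindex_cols above when
# --- merge_cells is off; the expected result shown is an assumption.
import pandas as pd

cols = pd.MultiIndex.from_tuples([(40, 1), (40, 2), (50, 1)])
fm = cols.format(sparsify=False, adjoin=False, names=False)
flat = [".".join(map(str, q)) for q in zip(*fm)]
# flat is expected to be ['40.1', '40.2', '50.1']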
+ tm.assert_frame_equal(frame1, frame3) + def test_to_excel_float_format(self, merge_cells, engine, ext): df = DataFrame([[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], index=['A', 'B'], columns=['X', 'Y', 'Z']) - with ensure_clean(self.ext) as filename: - df.to_excel(filename, 'test1', float_format='%.2f') + df.to_excel(self.path, 'test1', float_format='%.2f') - reader = ExcelFile(filename) - rs = read_excel(reader, 'test1', index_col=None) - xp = DataFrame([[0.12, 0.23, 0.57], - [12.32, 123123.20, 321321.20]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) - tm.assert_frame_equal(rs, xp) - - def test_to_excel_output_encoding(self): - _skip_if_no_xlrd() + reader = ExcelFile(self.path) + rs = read_excel(reader, 'test1', index_col=None) + xp = DataFrame([[0.12, 0.23, 0.57], + [12.32, 123123.20, 321321.20]], + index=['A', 'B'], columns=['X', 'Y', 'Z']) + tm.assert_frame_equal(rs, xp) + def test_to_excel_output_encoding(self, merge_cells, engine, ext): # avoid mixed inferred_type df = DataFrame([[u'\u0192', u'\u0193', u'\u0194'], [u'\u0195', u'\u0196', u'\u0197']], index=[u'A\u0192', u'B'], columns=[u'X\u0193', u'Y', u'Z']) - with ensure_clean('__tmp_to_excel_float_format__.' + self.ext)\ - as filename: + with ensure_clean('__tmp_to_excel_float_format__.' + ext) as filename: df.to_excel(filename, sheet_name='TestSheet', encoding='utf8') result = read_excel(filename, 'TestSheet', encoding='utf8') tm.assert_frame_equal(result, df) - def test_to_excel_unicode_filename(self): - _skip_if_no_xlrd() - with ensure_clean(u('\u0192u.') + self.ext) as filename: + def test_to_excel_unicode_filename(self, merge_cells, engine, ext): + with ensure_clean(u('\u0192u.') + ext) as filename: try: f = open(filename, 'wb') except UnicodeEncodeError: @@ -1664,7 +1541,7 @@ def test_to_excel_unicode_filename(self): index=['A', 'B'], columns=['X', 'Y', 'Z']) tm.assert_frame_equal(rs, xp) - # def test_to_excel_header_styling_xls(self): + # def test_to_excel_header_styling_xls(self, merge_cells, engine, ext): # import StringIO # s = StringIO( @@ -1711,7 +1588,7 @@ def test_to_excel_unicode_filename(self): # assert 1 == cell_xf.border.left_line_style # assert 2 == cell_xf.alignment.hor_align # os.remove(filename) - # def test_to_excel_header_styling_xlsx(self): + # def test_to_excel_header_styling_xlsx(self, merge_cells, engine, ext): # import StringIO # s = StringIO( # """Date,ticker,type,value @@ -1764,10 +1641,8 @@ def test_to_excel_unicode_filename(self): # assert ws.cell(maddr).merged # os.remove(filename) - def test_excel_010_hemstring(self): - _skip_if_no_xlrd() - - if self.merge_cells: + def test_excel_010_hemstring(self, merge_cells, engine, ext): + if merge_cells: pytest.skip('Skip tests for merged MI format.') from pandas.util.testing import makeCustomDataframe as mkdf @@ -1776,12 +1651,11 @@ def test_excel_010_hemstring(self): def roundtrip(df, header=True, parser_hdr=0, index=True): - with ensure_clean(self.ext) as path: - df.to_excel(path, header=header, - merge_cells=self.merge_cells, index=index) - xf = ExcelFile(path) - res = read_excel(xf, xf.sheet_names[0], header=parser_hdr) - return res + df.to_excel(self.path, header=header, + merge_cells=merge_cells, index=index) + xf = ExcelFile(self.path) + res = read_excel(xf, xf.sheet_names[0], header=parser_hdr) + return res nrows = 5 ncols = 3 @@ -1817,12 +1691,11 @@ def roundtrip(df, header=True, parser_hdr=0, index=True): assert res.shape == (1, 2) assert res.iloc[0, 0] is not np.nan - def 
test_excel_010_hemstring_raises_NotImplementedError(self): + def test_excel_010_hemstring_raises_NotImplementedError(self, merge_cells, + engine, ext): # This test was failing only for j>1 and header=False, # So I reproduced a simple test. - _skip_if_no_xlrd() - - if self.merge_cells: + if merge_cells: pytest.skip('Skip tests for merged MI format.') from pandas.util.testing import makeCustomDataframe as mkdf @@ -1831,12 +1704,11 @@ def test_excel_010_hemstring_raises_NotImplementedError(self): def roundtrip2(df, header=True, parser_hdr=0, index=True): - with ensure_clean(self.ext) as path: - df.to_excel(path, header=header, - merge_cells=self.merge_cells, index=index) - xf = ExcelFile(path) - res = read_excel(xf, xf.sheet_names[0], header=parser_hdr) - return res + df.to_excel(self.path, header=header, + merge_cells=merge_cells, index=index) + xf = ExcelFile(self.path) + res = read_excel(xf, xf.sheet_names[0], header=parser_hdr) + return res nrows = 5 ncols = 3 @@ -1846,134 +1718,119 @@ def roundtrip2(df, header=True, parser_hdr=0, index=True): with pytest.raises(NotImplementedError): roundtrip2(df, header=False, index=False) - def test_duplicated_columns(self): + def test_duplicated_columns(self, merge_cells, engine, ext): # Test for issue #5235 - _skip_if_no_xlrd() - - with ensure_clean(self.ext) as path: - write_frame = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]]) - colnames = ['A', 'B', 'B'] - - write_frame.columns = colnames - write_frame.to_excel(path, 'test1') - - read_frame = read_excel(path, 'test1') - read_frame.columns = colnames - tm.assert_frame_equal(write_frame, read_frame) - - # 11007 / #10970 - write_frame = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], - columns=['A', 'B', 'A', 'B']) - write_frame.to_excel(path, 'test1') - read_frame = read_excel(path, 'test1') - read_frame.columns = ['A', 'B', 'A', 'B'] - tm.assert_frame_equal(write_frame, read_frame) - - # 10982 - write_frame.to_excel(path, 'test1', index=False, header=False) - read_frame = read_excel(path, 'test1', header=None) - write_frame.columns = [0, 1, 2, 3] - tm.assert_frame_equal(write_frame, read_frame) - - def test_swapped_columns(self): - # Test for issue #5427. - _skip_if_no_xlrd() + write_frame = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]]) + colnames = ['A', 'B', 'B'] - with ensure_clean(self.ext) as path: - write_frame = DataFrame({'A': [1, 1, 1], - 'B': [2, 2, 2]}) - write_frame.to_excel(path, 'test1', columns=['B', 'A']) + write_frame.columns = colnames + write_frame.to_excel(self.path, 'test1') - read_frame = read_excel(path, 'test1', header=0) + read_frame = read_excel(self.path, 'test1') + read_frame.columns = colnames + tm.assert_frame_equal(write_frame, read_frame) - tm.assert_series_equal(write_frame['A'], read_frame['A']) - tm.assert_series_equal(write_frame['B'], read_frame['B']) + # 11007 / #10970 + write_frame = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], + columns=['A', 'B', 'A', 'B']) + write_frame.to_excel(self.path, 'test1') + read_frame = read_excel(self.path, 'test1') + read_frame.columns = ['A', 'B', 'A', 'B'] + tm.assert_frame_equal(write_frame, read_frame) - def test_invalid_columns(self): # 10982 - _skip_if_no_xlrd() + write_frame.to_excel(self.path, 'test1', index=False, header=False) + read_frame = read_excel(self.path, 'test1', header=None) + write_frame.columns = [0, 1, 2, 3] + tm.assert_frame_equal(write_frame, read_frame) + + def test_swapped_columns(self, merge_cells, engine, ext): + # Test for issue #5427. 
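# --- Illustrative sketch (assumptions, not taken from the patch): why
# --- test_duplicated_columns above reassigns read_frame.columns -- the Excel
# --- reader, like read_csv, de-duplicates repeated labels on the way back in.
# --- The behaviour and scratch file name below are assumptions for illustration.
import pandas as pd

df = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'B'])
df.to_excel('dupes.xlsx', 'test1')
back = pd.read_excel('dupes.xlsx', 'test1')
list(back.columns)    # expected to come back as ['A', 'B', 'B.1']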
+ write_frame = DataFrame({'A': [1, 1, 1], + 'B': [2, 2, 2]}) + write_frame.to_excel(self.path, 'test1', columns=['B', 'A']) + + read_frame = read_excel(self.path, 'test1', header=0) + + tm.assert_series_equal(write_frame['A'], read_frame['A']) + tm.assert_series_equal(write_frame['B'], read_frame['B']) - with ensure_clean(self.ext) as path: - write_frame = DataFrame({'A': [1, 1, 1], - 'B': [2, 2, 2]}) + def test_invalid_columns(self, merge_cells, engine, ext): + # 10982 + write_frame = DataFrame({'A': [1, 1, 1], + 'B': [2, 2, 2]}) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - write_frame.to_excel(path, 'test1', columns=['B', 'C']) - expected = write_frame.reindex(columns=['B', 'C']) - read_frame = read_excel(path, 'test1') - tm.assert_frame_equal(expected, read_frame) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + write_frame.to_excel(self.path, 'test1', columns=['B', 'C']) + expected = write_frame.reindex(columns=['B', 'C']) + read_frame = read_excel(self.path, 'test1') + tm.assert_frame_equal(expected, read_frame) - with pytest.raises(KeyError): - write_frame.to_excel(path, 'test1', columns=['C', 'D']) + with pytest.raises(KeyError): + write_frame.to_excel(self.path, 'test1', columns=['C', 'D']) - def test_comment_arg(self): + def test_comment_arg(self, merge_cells, engine, ext): # Re issue #18735 # Test the comment argument functionality to read_excel - with ensure_clean(self.ext) as path: - - # Create file to read in - df = DataFrame({'A': ['one', '#one', 'one'], - 'B': ['two', 'two', '#two']}) - df.to_excel(path, 'test_c') - - # Read file without comment arg - result1 = read_excel(path, 'test_c') - result1.iloc[1, 0] = None - result1.iloc[1, 1] = None - result1.iloc[2, 1] = None - result2 = read_excel(path, 'test_c', comment='#') - tm.assert_frame_equal(result1, result2) - - def test_comment_default(self): + + # Create file to read in + df = DataFrame({'A': ['one', '#one', 'one'], + 'B': ['two', 'two', '#two']}) + df.to_excel(self.path, 'test_c') + + # Read file without comment arg + result1 = read_excel(self.path, 'test_c') + result1.iloc[1, 0] = None + result1.iloc[1, 1] = None + result1.iloc[2, 1] = None + result2 = read_excel(self.path, 'test_c', comment='#') + tm.assert_frame_equal(result1, result2) + + def test_comment_default(self, merge_cells, engine, ext): # Re issue #18735 # Test the comment argument default to read_excel - with ensure_clean(self.ext) as path: - # Create file to read in - df = DataFrame({'A': ['one', '#one', 'one'], - 'B': ['two', 'two', '#two']}) - df.to_excel(path, 'test_c') + # Create file to read in + df = DataFrame({'A': ['one', '#one', 'one'], + 'B': ['two', 'two', '#two']}) + df.to_excel(self.path, 'test_c') - # Read file with default and explicit comment=None - result1 = read_excel(path, 'test_c') - result2 = read_excel(path, 'test_c', comment=None) - tm.assert_frame_equal(result1, result2) + # Read file with default and explicit comment=None + result1 = read_excel(self.path, 'test_c') + result2 = read_excel(self.path, 'test_c', comment=None) + tm.assert_frame_equal(result1, result2) - def test_comment_used(self): + def test_comment_used(self, merge_cells, engine, ext): # Re issue #18735 # Test the comment argument is working as expected when used - with ensure_clean(self.ext) as path: - # Create file to read in - df = DataFrame({'A': ['one', '#one', 'one'], - 'B': ['two', 'two', '#two']}) - df.to_excel(path, 'test_c') + # Create file to read in + df = DataFrame({'A': ['one', '#one', 
'one'], + 'B': ['two', 'two', '#two']}) + df.to_excel(self.path, 'test_c') - # Test read_frame_comment against manually produced expected output - expected = DataFrame({'A': ['one', None, 'one'], - 'B': ['two', None, None]}) - result = read_excel(path, 'test_c', comment='#') - tm.assert_frame_equal(result, expected) + # Test read_frame_comment against manually produced expected output + expected = DataFrame({'A': ['one', None, 'one'], + 'B': ['two', None, None]}) + result = read_excel(self.path, 'test_c', comment='#') + tm.assert_frame_equal(result, expected) - def test_comment_emptyline(self): + def test_comment_emptyline(self, merge_cells, engine, ext): # Re issue #18735 # Test that read_excel ignores commented lines at the end of file - with ensure_clean(self.ext) as path: - df = DataFrame({'a': ['1', '#2'], 'b': ['2', '3']}) - df.to_excel(path, index=False) + df = DataFrame({'a': ['1', '#2'], 'b': ['2', '3']}) + df.to_excel(self.path, index=False) - # Test that all-comment lines at EoF are ignored - expected = DataFrame({'a': [1], 'b': [2]}) - result = read_excel(path, comment='#') - tm.assert_frame_equal(result, expected) + # Test that all-comment lines at EoF are ignored + expected = DataFrame({'a': [1], 'b': [2]}) + result = read_excel(self.path, comment='#') + tm.assert_frame_equal(result, expected) - def test_datetimes(self): + def test_datetimes(self, merge_cells, engine, ext): # Test writing and reading datetimes. For issue #9139. (xref #9185) - _skip_if_no_xlrd() - datetimes = [datetime(2013, 1, 13, 1, 2, 3), datetime(2013, 1, 13, 2, 45, 56), datetime(2013, 1, 13, 4, 29, 49), @@ -1986,21 +1843,18 @@ def test_datetimes(self): datetime(2013, 1, 13, 16, 37, 0), datetime(2013, 1, 13, 18, 20, 52)] - with ensure_clean(self.ext) as path: - write_frame = DataFrame({'A': datetimes}) - write_frame.to_excel(path, 'Sheet1') - read_frame = read_excel(path, 'Sheet1', header=0) + write_frame = DataFrame({'A': datetimes}) + write_frame.to_excel(self.path, 'Sheet1') + read_frame = read_excel(self.path, 'Sheet1', header=0) - tm.assert_series_equal(write_frame['A'], read_frame['A']) + tm.assert_series_equal(write_frame['A'], read_frame['A']) # GH7074 - def test_bytes_io(self): - _skip_if_no_xlrd() - + def test_bytes_io(self, merge_cells, engine, ext): bio = BytesIO() df = DataFrame(np.random.randn(10, 2)) # pass engine explicitly as there is no file path to infer from - writer = ExcelWriter(bio, engine=self.engine_name) + writer = ExcelWriter(bio, engine=engine) df.to_excel(writer) writer.save() bio.seek(0) @@ -2008,62 +1862,59 @@ def test_bytes_io(self): tm.assert_frame_equal(df, reread_df) # GH8188 - def test_write_lists_dict(self): - _skip_if_no_xlrd() - + def test_write_lists_dict(self, merge_cells, engine, ext): df = DataFrame({'mixed': ['a', ['b', 'c'], {'d': 'e', 'f': 2}], 'numeric': [1, 2, 3.0], 'str': ['apple', 'banana', 'cherry']}) expected = df.copy() expected.mixed = expected.mixed.apply(str) expected.numeric = expected.numeric.astype('int64') - with ensure_clean(self.ext) as path: - df.to_excel(path, 'Sheet1') - read = read_excel(path, 'Sheet1', header=0) - tm.assert_frame_equal(read, expected) + + df.to_excel(self.path, 'Sheet1') + read = read_excel(self.path, 'Sheet1', header=0) + tm.assert_frame_equal(read, expected) # GH13347 - def test_true_and_false_value_options(self): + def test_true_and_false_value_options(self, merge_cells, engine, ext): df = pd.DataFrame([['foo', 'bar']], columns=['col1', 'col2']) expected = df.replace({'foo': True, 'bar': False}) - with 
ensure_clean(self.ext) as path: - df.to_excel(path) - read_frame = read_excel(path, true_values=['foo'], - false_values=['bar']) - tm.assert_frame_equal(read_frame, expected) - def test_freeze_panes(self): + df.to_excel(self.path) + read_frame = read_excel(self.path, true_values=['foo'], + false_values=['bar']) + tm.assert_frame_equal(read_frame, expected) + + def test_freeze_panes(self, merge_cells, engine, ext): # GH15160 expected = DataFrame([[1, 2], [3, 4]], columns=['col1', 'col2']) - with ensure_clean(self.ext) as path: - expected.to_excel(path, "Sheet1", freeze_panes=(1, 1)) - result = read_excel(path) - tm.assert_frame_equal(expected, result) + expected.to_excel(self.path, "Sheet1", freeze_panes=(1, 1)) + result = read_excel(self.path) + tm.assert_frame_equal(expected, result) - def test_path_pathlib(self): + def test_path_pathlib(self, merge_cells, engine, ext): df = tm.makeDataFrame() - writer = partial(df.to_excel, engine=self.engine_name) + writer = partial(df.to_excel, engine=engine) reader = partial(pd.read_excel) result = tm.round_trip_pathlib(writer, reader, - path="foo.{}".format(self.ext)) + path="foo.{}".format(ext)) tm.assert_frame_equal(df, result) - def test_path_localpath(self): + def test_path_localpath(self, merge_cells, engine, ext): df = tm.makeDataFrame() - writer = partial(df.to_excel, engine=self.engine_name) + writer = partial(df.to_excel, engine=engine) reader = partial(pd.read_excel) result = tm.round_trip_pathlib(writer, reader, - path="foo.{}".format(self.ext)) + path="foo.{}".format(ext)) tm.assert_frame_equal(df, result) -class TestOpenpyxlTests(ExcelWriterBase): - engine_name = 'openpyxl' - ext = '.xlsx' - check_skip = staticmethod(_skip_if_no_openpyxl) +@td.skip_if_no('openpyxl') +@pytest.mark.parametrize("merge_cells,ext,engine", [ + (None, '.xlsx', 'openpyxl')]) +class TestOpenpyxlTests(_WriterBase): - def test_to_excel_styleconverter(self): + def test_to_excel_styleconverter(self, merge_cells, ext, engine): from openpyxl import styles hstyle = { @@ -2117,7 +1968,7 @@ def test_to_excel_styleconverter(self): assert kw['number_format'] == number_format assert kw['protection'] == protection - def test_write_cells_merge_styled(self): + def test_write_cells_merge_styled(self, merge_cells, ext, engine): from pandas.io.formats.excel import ExcelCell sheet_name = 'merge_styled' @@ -2138,7 +1989,7 @@ def test_write_cells_merge_styled(self): mergestart=1, mergeend=1, style=sty_merged), ] - with ensure_clean('.xlsx') as path: + with ensure_clean(ext) as path: writer = _OpenpyxlWriter(path) writer.write_cells(initial_cells, sheet_name=sheet_name) writer.write_cells(merge_cells, sheet_name=sheet_name) @@ -2150,44 +2001,41 @@ def test_write_cells_merge_styled(self): assert xcell_a2.font == openpyxl_sty_merged -class TestXlwtTests(ExcelWriterBase): - ext = '.xls' - engine_name = 'xlwt' - check_skip = staticmethod(_skip_if_no_xlwt) +@td.skip_if_no('xlwt') +@pytest.mark.parametrize("merge_cells,ext,engine", [ + (None, '.xls', 'xlwt')]) +class TestXlwtTests(_WriterBase): - def test_excel_raise_error_on_multiindex_columns_and_no_index(self): - _skip_if_no_xlwt() + def test_excel_raise_error_on_multiindex_columns_and_no_index( + self, merge_cells, ext, engine): # MultiIndex as columns is not yet implemented 9794 cols = MultiIndex.from_tuples([('site', ''), ('2014', 'height'), ('2014', 'weight')]) df = DataFrame(np.random.randn(10, 3), columns=cols) with pytest.raises(NotImplementedError): - with ensure_clean(self.ext) as path: + with ensure_clean(ext) as path: 
df.to_excel(path, index=False) - def test_excel_multiindex_columns_and_index_true(self): - _skip_if_no_xlwt() + def test_excel_multiindex_columns_and_index_true(self, merge_cells, ext, + engine): cols = MultiIndex.from_tuples([('site', ''), ('2014', 'height'), ('2014', 'weight')]) df = pd.DataFrame(np.random.randn(10, 3), columns=cols) - with ensure_clean(self.ext) as path: + with ensure_clean(ext) as path: df.to_excel(path, index=True) - def test_excel_multiindex_index(self): - _skip_if_no_xlwt() + def test_excel_multiindex_index(self, merge_cells, ext, engine): # MultiIndex as index works so assert no error #9794 cols = MultiIndex.from_tuples([('site', ''), ('2014', 'height'), ('2014', 'weight')]) df = DataFrame(np.random.randn(3, 10), index=cols) - with ensure_clean(self.ext) as path: + with ensure_clean(ext) as path: df.to_excel(path, index=False) - def test_to_excel_styleconverter(self): - _skip_if_no_xlwt() - + def test_to_excel_styleconverter(self, merge_cells, ext, engine): import xlwt hstyle = {"font": {"bold": True}, @@ -2207,23 +2055,21 @@ def test_to_excel_styleconverter(self): assert xlwt.Alignment.VERT_TOP == xls_style.alignment.vert -class TestXlsxWriterTests(ExcelWriterBase): - ext = '.xlsx' - engine_name = 'xlsxwriter' - check_skip = staticmethod(_skip_if_no_xlsxwriter) +@td.skip_if_no('xlsxwriter') +@pytest.mark.parametrize("merge_cells,ext,engine", [ + (None, '.xlsx', 'xlsxwriter')]) +class TestXlsxWriterTests(_WriterBase): - def test_column_format(self): + @td.skip_if_no('openpyxl') + def test_column_format(self, merge_cells, ext, engine): # Test that column formats are applied to cells. Test for issue #9167. # Applicable to xlsxwriter only. - _skip_if_no_xlsxwriter() - with warnings.catch_warnings(): # Ignore the openpyxl lxml warning. warnings.simplefilter("ignore") - _skip_if_no_openpyxl() import openpyxl - with ensure_clean(self.ext) as path: + with ensure_clean(ext) as path: frame = DataFrame({'A': [123456, 123456], 'B': [123456, 123456]}) @@ -2260,54 +2106,28 @@ def test_column_format(self): assert read_num_format == num_format -class TestOpenpyxlTests_NoMerge(ExcelWriterBase): - ext = '.xlsx' - engine_name = 'openpyxl' - check_skip = staticmethod(_skip_if_no_openpyxl) - - # Test < 0.13 non-merge behaviour for MultiIndex and Hierarchical Rows. - merge_cells = False - - -class TestXlwtTests_NoMerge(ExcelWriterBase): - ext = '.xls' - engine_name = 'xlwt' - check_skip = staticmethod(_skip_if_no_xlwt) - - # Test < 0.13 non-merge behaviour for MultiIndex and Hierarchical Rows. - merge_cells = False - - -class TestXlsxWriterTests_NoMerge(ExcelWriterBase): - ext = '.xlsx' - engine_name = 'xlsxwriter' - check_skip = staticmethod(_skip_if_no_xlsxwriter) - - # Test < 0.13 non-merge behaviour for MultiIndex and Hierarchical Rows. 
- merge_cells = False - - class TestExcelWriterEngineTests(object): - def test_ExcelWriter_dispatch(self): - with tm.assert_raises_regex(ValueError, 'No engine'): - ExcelWriter('nothing') - - try: - import xlsxwriter # noqa - writer_klass = _XlsxWriter - except ImportError: - _skip_if_no_openpyxl() - writer_klass = _OpenpyxlWriter - - with ensure_clean('.xlsx') as path: + @pytest.mark.parametrize('klass,ext', [ + pytest.param(_XlsxWriter, '.xlsx', marks=pytest.mark.skipif( + not td.safe_import('xlsxwriter'), reason='No xlsxwriter')), + pytest.param(_OpenpyxlWriter, '.xlsx', marks=pytest.mark.skipif( + not td.safe_import('openpyxl'), reason='No openpyxl')), + pytest.param(_XlwtWriter, '.xls', marks=pytest.mark.skipif( + not td.safe_import('xlwt'), reason='No xlwt')) + ]) + def test_ExcelWriter_dispatch(self, klass, ext): + with ensure_clean(ext) as path: writer = ExcelWriter(path) - assert isinstance(writer, writer_klass) + if ext == '.xlsx' and td.safe_import('xlsxwriter'): + # xlsxwriter has preference over openpyxl if both installed + assert isinstance(writer, _XlsxWriter) + else: + assert isinstance(writer, klass) - _skip_if_no_xlwt() - with ensure_clean('.xls') as path: - writer = ExcelWriter(path) - assert isinstance(writer, _XlwtWriter) + def test_ExcelWriter_dispatch_raises(self): + with tm.assert_raises_regex(ValueError, 'No engine'): + ExcelWriter('nothing') def test_register_writer(self): # some awkward mocking to test out dispatch and such actually works @@ -2498,11 +2318,11 @@ def custom_converter(css): assert n_cells == (10 + 1) * (3 + 1) +@td.skip_if_no('openpyxl') class TestFSPath(object): @pytest.mark.skipif(sys.version_info < (3, 6), reason='requires fspath') def test_excelfile_fspath(self): - _skip_if_no_openpyxl() with tm.ensure_clean('foo.xlsx') as path: df = DataFrame({"A": [1, 2]}) df.to_excel(path) @@ -2513,7 +2333,6 @@ def test_excelfile_fspath(self): @pytest.mark.skipif(sys.version_info < (3, 6), reason='requires fspath') # @pytest.mark.xfail def test_excelwriter_fspath(self): - _skip_if_no_openpyxl() with tm.ensure_clean('foo.xlsx') as path: writer = ExcelWriter(path) assert os.fspath(writer) == str(path) diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 0fd5648739e5c..b2745ab5eec77 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -57,7 +57,11 @@ def safe_import(mod_name, min_version=None): return mod else: import sys - version = getattr(sys.modules[mod_name], '__version__') + try: + version = getattr(sys.modules[mod_name], '__version__') + except AttributeError: + # xlrd uses a capitalized attribute name + version = getattr(sys.modules[mod_name], '__VERSION__') if version: from distutils.version import LooseVersion if LooseVersion(version) >= LooseVersion(min_version): diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 0009e26f8b100..942416408e4f0 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -2165,7 +2165,7 @@ def network(t, url="http://www.google.com", from pytest import skip t.network = True - @wraps(t) + @compat.wraps(t) def wrapper(*args, **kwargs): if check_before_test and not raise_on_error: if not can_connect(url, error_classes): From e51800b2545f315ebfebd7f7a2a3cbbd5b374cc4 Mon Sep 17 00:00:00 2001 From: cbertinato Date: Mon, 26 Feb 2018 20:13:00 -0500 Subject: [PATCH 209/214] BUG: Fix Series constructor for Categorical with index (#19714) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/series.py | 13 ++++++++++- 
pandas/tests/io/formats/test_style.py | 2 +- pandas/tests/series/test_constructors.py | 28 ++++++++++++++++++++++++ 4 files changed, 42 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index fb22dc40e335f..5330f7e7e998b 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -919,6 +919,7 @@ Reshaping - Comparisons between :class:`Series` and :class:`Index` would return a ``Series`` with an incorrect name, ignoring the ``Index``'s name attribute (:issue:`19582`) - Bug in :func:`qcut` where datetime and timedelta data with ``NaT`` present raised a ``ValueError`` (:issue:`19768`) - Bug in :func:`DataFrame.iterrows`, which would infers strings not compliant to `ISO8601 `_ to datetimes (:issue:`19671`) +- Bug in :class:`Series` constructor with ``Categorical`` where a ```ValueError`` is not raised when an index of different length is given (:issue:`19342`) Other ^^^^^ diff --git a/pandas/core/series.py b/pandas/core/series.py index 26b7fd552b062..8053651a4877a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -212,7 +212,6 @@ def __init__(self, data=None, index=None, dtype=None, name=None, 'be False.') elif is_extension_array_dtype(data) and dtype is not None: - # GH12574: Allow dtype=category only, otherwise error if not data.dtype.is_dtype(dtype): raise ValueError("Cannot specify a dtype '{}' with an " "extension array of a different " @@ -235,6 +234,18 @@ def __init__(self, data=None, index=None, dtype=None, name=None, if not is_list_like(data): data = [data] index = com._default_index(len(data)) + elif is_list_like(data): + + # a scalar numpy array is list-like but doesn't + # have a proper length + try: + if len(index) != len(data): + raise ValueError( + 'Length of passed values is {val}, ' + 'index implies {ind}' + .format(val=len(data), ind=len(index))) + except TypeError: + pass # create/copy the manager if isinstance(data, SingleBlockManager): diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index bedb11d4fc4ae..adf8e14b756c2 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -24,7 +24,7 @@ def setup_method(self, method): def h(x, foo='bar'): return pd.Series( - ['color: {foo}'.format(foo=foo)], index=x.index, name=x.name) + 'color: {foo}'.format(foo=foo), index=x.index, name=x.name) self.h = h self.styler = Styler(self.df) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 77f9dfcce686d..25f425ffa0021 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -400,6 +400,34 @@ def test_constructor_default_index(self): s = Series([0, 1, 2]) tm.assert_index_equal(s.index, pd.Index(np.arange(3))) + @pytest.mark.parametrize('input', [[1, 2, 3], + (1, 2, 3), + list(range(3)), + pd.Categorical(['a', 'b', 'a']), + (i for i in range(3)), + map(lambda x: x, range(3))]) + def test_constructor_index_mismatch(self, input): + # GH 19342 + # test that construction of a Series with an index of different length + # raises an error + msg = 'Length of passed values is 3, index implies 4' + with pytest.raises(ValueError, message=msg): + Series(input, index=np.arange(4)) + + def test_constructor_numpy_scalar(self): + # GH 19342 + # construction with a numpy scalar + # should not raise + result = Series(np.array(100), index=np.arange(4), dtype='int64') + expected = Series(100, index=np.arange(4), dtype='int64') + 
tm.assert_series_equal(result, expected) + + def test_constructor_broadcast_list(self): + # GH 19342 + # construction with single-element container and index + # should raise + pytest.raises(ValueError, Series, ['foo'], index=['a', 'b', 'c']) + def test_constructor_corner(self): df = tm.makeTimeDataFrame() objs = [df, df] From ceb90319f736736b47e79f9fd330d78b98acc872 Mon Sep 17 00:00:00 2001 From: Jaume Bonet Date: Tue, 27 Feb 2018 02:15:31 +0100 Subject: [PATCH 210/214] CLN: Remove Series._from_array (#19893) --- pandas/core/dtypes/concat.py | 21 +++++++++++++++++++++ pandas/core/frame.py | 8 ++++---- pandas/core/series.py | 18 ++---------------- pandas/core/sparse/series.py | 6 ------ 4 files changed, 27 insertions(+), 26 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index d306d0d78f1f4..0501493e718d0 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -101,6 +101,27 @@ def _get_frame_result_type(result, objs): ABCSparseDataFrame)) +def _get_sliced_frame_result_type(data, obj): + """ + return appropriate class of Series. When data is sparse + it will return a SparseSeries, otherwise it will return + the Series. + + Parameters + ---------- + data : array-like + obj : DataFrame + + Returns + ------- + Series or SparseSeries + """ + if is_sparse(data): + from pandas.core.sparse.api import SparseSeries + return SparseSeries + return obj._constructor_sliced + + def _concat_compat(to_concat, axis=0): """ provide concatenation of an array of arrays each of which is a single diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e4ef1b97882d9..1f26a367334c6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -60,6 +60,7 @@ is_iterator, is_sequence, is_named_tuple) +from pandas.core.dtypes.concat import _get_sliced_frame_result_type from pandas.core.dtypes.missing import isna, notna @@ -2166,8 +2167,7 @@ def _ixs(self, i, axis=0): if index_len and not len(values): values = np.array([np.nan] * index_len, dtype=object) - result = self._constructor_sliced._from_array( - values, index=self.index, name=label, fastpath=True) + result = self._box_col_values(values, label) # this is a cached value, mark it so result._set_as_cached(label, self) @@ -2563,8 +2563,8 @@ def _box_item_values(self, key, values): def _box_col_values(self, values, items): """ provide boxed values for a column """ - return self._constructor_sliced._from_array(values, index=self.index, - name=items, fastpath=True) + klass = _get_sliced_frame_result_type(values, self) + return klass(values, index=self.index, name=items, fastpath=True) def __setitem__(self, key, value): key = com._apply_if_callable(key, self) diff --git a/pandas/core/series.py b/pandas/core/series.py index 8053651a4877a..6822f1f6b58b5 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -316,25 +316,11 @@ def from_array(cls, arr, index=None, name=None, dtype=None, copy=False, warnings.warn("'from_array' is deprecated and will be removed in a " "future version. Please use the pd.Series(..) " "constructor instead.", FutureWarning, stacklevel=2) - return cls._from_array(arr, index=index, name=name, dtype=dtype, - copy=copy, fastpath=fastpath) - - @classmethod - def _from_array(cls, arr, index=None, name=None, dtype=None, copy=False, - fastpath=False): - """ - Internal method used in DataFrame.__setitem__/__getitem__. - Difference with Series(..) is that this method checks if a sparse - array is passed. 
- - """ - # return a sparse series here if isinstance(arr, ABCSparseArray): from pandas.core.sparse.series import SparseSeries cls = SparseSeries - - return cls(arr, index=index, name=name, dtype=dtype, copy=copy, - fastpath=fastpath) + return cls(arr, index=index, name=name, dtype=dtype, + copy=copy, fastpath=fastpath) @property def _constructor(self): diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 7a1496bf11117..f8b98a1a40081 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -216,12 +216,6 @@ def from_array(cls, arr, index=None, name=None, copy=False, warnings.warn("'from_array' is deprecated and will be removed in a " "future version. Please use the pd.SparseSeries(..) " "constructor instead.", FutureWarning, stacklevel=2) - return cls._from_array(arr, index=index, name=name, copy=copy, - fill_value=fill_value, fastpath=fastpath) - - @classmethod - def _from_array(cls, arr, index=None, name=None, copy=False, - fill_value=None, fastpath=False): return cls(arr, index=index, name=name, copy=copy, fill_value=fill_value, fastpath=fastpath) From 74dbfd0aa9b44936a4db7969bfe25f526fef53ce Mon Sep 17 00:00:00 2001 From: Anh Le Date: Mon, 26 Feb 2018 20:24:57 -0500 Subject: [PATCH 211/214] DOC fix incorrect example in DataFrame.to_dict docstring. Close GH19868 (#19915) --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1f26a367334c6..ae8fb48a61fce 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -958,8 +958,8 @@ def to_dict(self, orient='dict', into=dict): {'col1': [1, 2], 'col2': [0.5, 0.75]}, index=['a', 'b']) >>> df col1 col2 - a 1 0.1 - b 2 0.2 + a 1 0.50 + b 2 0.75 >>> df.to_dict() {'col1': {'a': 1, 'b': 2}, 'col2': {'a': 0.5, 'b': 0.75}} From f4c9d966fb20d9ca3221d31bd9eb31117065808c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 27 Feb 2018 03:36:22 -0800 Subject: [PATCH 212/214] handle NaT add/sub in one place (#19903) --- pandas/core/indexes/datetimelike.py | 47 +++++++++++++++++++++++++---- pandas/core/indexes/datetimes.py | 20 +++--------- pandas/core/indexes/period.py | 17 +---------- pandas/core/indexes/timedeltas.py | 18 +++-------- 4 files changed, 52 insertions(+), 50 deletions(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 8e56fc2775a56..4c6effc65a4d3 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- """ Base and utility classes for tseries type pandas objects. """ @@ -640,6 +641,28 @@ def _add_datelike(self, other): def _sub_datelike(self, other): raise com.AbstractMethodError(self) + def _add_nat(self): + """Add pd.NaT to self""" + if is_period_dtype(self): + raise TypeError('Cannot add {cls} and {typ}' + .format(cls=type(self).__name__, + typ=type(NaT).__name__)) + + # GH#19124 pd.NaT is treated like a timedelta for both timedelta + # and datetime dtypes + return self._nat_new(box=True) + + def _sub_nat(self): + """Subtract pd.NaT from self""" + # GH#19124 Timedelta - datetime is not in general well-defined. + # We make an exception for pd.NaT, which in this case quacks + # like a timedelta. + # For datetime64 dtypes by convention we treat NaT as a datetime, so + # this subtraction returns a timedelta64 dtype. + # For period dtype, timedelta64 is a close-enough return dtype. 
+ result = self._nat_new(box=False) + return result.view('timedelta64[ns]') + def _sub_period(self, other): return NotImplemented @@ -686,6 +709,8 @@ def __add__(self, other): return NotImplemented # scalar others + elif other is NaT: + result = self._add_nat() elif isinstance(other, (DateOffset, timedelta, np.timedelta64)): result = self._add_delta(other) elif isinstance(other, (datetime, np.datetime64)): @@ -711,9 +736,13 @@ def __add__(self, other): else: # pragma: no cover return NotImplemented - if result is not NotImplemented: - res_name = ops.get_op_result_name(self, other) - result.name = res_name + if result is NotImplemented: + return NotImplemented + elif not isinstance(result, Index): + # Index.__new__ will choose appropriate subclass for dtype + result = Index(result) + res_name = ops.get_op_result_name(self, other) + result.name = res_name return result cls.__add__ = __add__ @@ -731,6 +760,8 @@ def __sub__(self, other): return NotImplemented # scalar others + elif other is NaT: + result = self._sub_nat() elif isinstance(other, (DateOffset, timedelta, np.timedelta64)): result = self._add_delta(-other) elif isinstance(other, (datetime, np.datetime64)): @@ -762,9 +793,13 @@ def __sub__(self, other): else: # pragma: no cover return NotImplemented - if result is not NotImplemented: - res_name = ops.get_op_result_name(self, other) - result.name = res_name + if result is NotImplemented: + return NotImplemented + elif not isinstance(result, Index): + # Index.__new__ will choose appropriate subclass for dtype + result = Index(result) + res_name = ops.get_op_result_name(self, other) + result.name = res_name return result cls.__sub__ = __sub__ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 55d8b7c18a622..eb8133a1bbf97 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -853,22 +853,11 @@ def __setstate__(self, state): raise Exception("invalid pickle state") _unpickle_compat = __setstate__ - def _add_datelike(self, other): - # adding a timedeltaindex to a datetimelike - if other is libts.NaT: - return self._nat_new(box=True) - raise TypeError("cannot add {0} and {1}" - .format(type(self).__name__, - type(other).__name__)) - def _sub_datelike(self, other): - # subtract a datetime from myself, yielding a TimedeltaIndex - from pandas import TimedeltaIndex - + # subtract a datetime from myself, yielding a ndarray[timedelta64[ns]] if isinstance(other, (DatetimeIndex, np.ndarray)): # if other is an ndarray, we assume it is datetime64-dtype other = DatetimeIndex(other) - # require tz compat if not self._has_same_tz(other): raise TypeError("{cls} subtraction must have the same " @@ -876,9 +865,10 @@ def _sub_datelike(self, other): .format(cls=type(self).__name__)) result = self._sub_datelike_dti(other) elif isinstance(other, (datetime, np.datetime64)): + assert other is not libts.NaT other = Timestamp(other) if other is libts.NaT: - result = self._nat_new(box=False) + return self - libts.NaT # require tz compat elif not self._has_same_tz(other): raise TypeError("Timestamp subtraction must have the same " @@ -893,7 +883,7 @@ def _sub_datelike(self, other): raise TypeError("cannot subtract {cls} and {typ}" .format(cls=type(self).__name__, typ=type(other).__name__)) - return TimedeltaIndex(result) + return result.view('timedelta64[ns]') def _sub_datelike_dti(self, other): """subtraction of two DatetimeIndexes""" @@ -906,7 +896,7 @@ def _sub_datelike_dti(self, other): if self.hasnans or other.hasnans: mask = (self._isnan) | 
(other._isnan) new_values[mask] = libts.iNaT - return new_values.view('i8') + return new_values.view('timedelta64[ns]') def _maybe_update_attributes(self, attrs): """ Update Index attributes (e.g. freq) depending on op """ diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index f0567c9c963af..b936a4e26af60 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -12,7 +12,6 @@ is_scalar, is_datetime64_dtype, is_datetime64_any_dtype, - is_timedelta64_dtype, is_period_dtype, is_bool_dtype, pandas_dtype, @@ -23,7 +22,6 @@ import pandas.tseries.frequencies as frequencies from pandas.tseries.frequencies import get_freq_code as _gfc from pandas.core.indexes.datetimes import DatetimeIndex, Int64Index, Index -from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.indexes.datetimelike import DatelikeOps, DatetimeIndexOpsMixin from pandas.core.tools.datetimes import parse_time_string import pandas.tseries.offsets as offsets @@ -700,16 +698,6 @@ def _maybe_convert_timedelta(self, other): return other.n msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - elif isinstance(other, np.ndarray): - if is_integer_dtype(other): - return other - elif is_timedelta64_dtype(other): - offset = frequencies.to_offset(self.freq) - if isinstance(offset, offsets.Tick): - nanos = delta_to_nanoseconds(other) - offset_nanos = delta_to_nanoseconds(offset) - if (nanos % offset_nanos).all() == 0: - return nanos // offset_nanos elif is_integer(other): # integer is passed to .shift via # _add_datetimelike_methods basically @@ -724,10 +712,7 @@ def _add_delta(self, other): return self.shift(ordinal_delta) def _sub_datelike(self, other): - if other is tslib.NaT: - new_data = np.empty(len(self), dtype=np.int64) - new_data.fill(tslib.iNaT) - return TimedeltaIndex(new_data) + assert other is not tslib.NaT return NotImplemented def _sub_period(self, other): diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index eebd52d7fb801..c42c0656c585a 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -414,16 +414,13 @@ def _evaluate_with_timedelta_like(self, other, op): def _add_datelike(self, other): # adding a timedeltaindex to a datetimelike from pandas import Timestamp, DatetimeIndex - - if other is NaT: - # GH#19124 pd.NaT is treated like a timedelta - return self._nat_new() - elif isinstance(other, (DatetimeIndex, np.ndarray)): + if isinstance(other, (DatetimeIndex, np.ndarray)): # if other is an ndarray, we assume it is datetime64-dtype # defer to implementation in DatetimeIndex other = DatetimeIndex(other) return other + self else: + assert other is not NaT other = Timestamp(other) i8 = self.asi8 result = checked_add_with_arr(i8, other.value, @@ -432,14 +429,9 @@ def _add_datelike(self, other): return DatetimeIndex(result) def _sub_datelike(self, other): - # GH#19124 Timedelta - datetime is not in general well-defined. - # We make an exception for pd.NaT, which in this case quacks - # like a timedelta. 
- if other is NaT: - return self._nat_new() - else: - raise TypeError("cannot subtract a datelike from a {cls}" - .format(cls=type(self).__name__)) + assert other is not NaT + raise TypeError("cannot subtract a datelike from a {cls}" + .format(cls=type(self).__name__)) def _addsub_offset_array(self, other, op): # Add or subtract Array-like of DateOffset objects From 169af2ccd34bad1222984677840d14034d444c7b Mon Sep 17 00:00:00 2001 From: William Ayd Date: Tue, 27 Feb 2018 13:28:22 -0800 Subject: [PATCH 213/214] ASV: Added seek to buffer to fix xlwt asv failure (#19926) * Added seek to buffer to fix xlwt asv failure * Added conditional to check for seek on xlrd object --- pandas/io/excel.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 0d3d4286f5a3c..78af86cc00f7f 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -387,6 +387,10 @@ def __init__(self, io, **kwds): self.book = io elif not isinstance(io, xlrd.Book) and hasattr(io, "read"): # N.B. xlrd.Book has a read attribute too + if hasattr(io, 'seek'): + # GH 19779 + io.seek(0) + data = io.read() self.book = xlrd.open_workbook(file_contents=data) elif isinstance(self._io, compat.string_types): From 7561eb8a2a20e7e896cf82a5b8796ae93ccd5574 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 27 Feb 2018 15:31:55 -0600 Subject: [PATCH 214/214] TST: Debug flaky plotting test (#19925) --- pandas/tests/plotting/test_datetimelike.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 94adf349fe2cd..08a047a2e7707 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -689,14 +689,17 @@ def test_mixed_freq_regular_first(self): s2 = s1[[0, 5, 10, 11, 12, 13, 14, 15]] # it works! - s1.plot() + _, ax = self.plt.subplots() + s1.plot(ax=ax) - ax2 = s2.plot(style='g') + ax2 = s2.plot(style='g', ax=ax) lines = ax2.get_lines() idx1 = PeriodIndex(lines[0].get_xdata()) idx2 = PeriodIndex(lines[1].get_xdata()) - assert idx1.equals(s1.index.to_period('B')) - assert idx2.equals(s2.index.to_period('B')) + + tm.assert_index_equal(idx1, s1.index.to_period('B')) + tm.assert_index_equal(idx2, s2.index.to_period('B')) + left, right = ax2.get_xlim() pidx = s1.index.to_period() assert left <= pidx[0].ordinal
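
The consolidated NaT handling introduced in ``DatetimeIndexOpsMixin.__add__``/``__sub__`` above can be sketched with a short usage example. This is an editorial illustration rather than part of the patch series, and the exact behaviour shown assumes a pandas build that contains these changes:

```python
import pandas as pd

dti = pd.DatetimeIndex(['2018-01-01', '2018-01-02'])

# NaT is treated like a timedelta on addition, so the result stays datetime-like
# (per _add_nat above).
print(dti + pd.NaT)   # DatetimeIndex(['NaT', 'NaT'], dtype='datetime64[ns]', ...)

# Subtracting NaT yields timedelta64[ns] values (per _sub_nat above).
print(dti - pd.NaT)   # TimedeltaIndex(['NaT', 'NaT'], dtype='timedelta64[ns]', ...)

# Period dtype is excluded from the addition path, so this raises
# (per the is_period_dtype branch in _add_nat).
pi = pd.period_range('2018-01', periods=2, freq='M')
try:
    pi + pd.NaT
except TypeError as exc:
    print(exc)  # e.g. "Cannot add PeriodIndex and NaTType"
```

Centralising the NaT branch in the mixin is why the subclass methods touched above (``DatetimeIndex._sub_datelike``, ``TimedeltaIndex._add_datelike``/``_sub_datelike``, ``PeriodIndex._sub_datelike``) now simply ``assert other is not NaT`` instead of carrying their own special cases.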