From 3be2de63e4c6cfbd04671f86d07869dfc984e9ed Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 10 Jul 2017 03:12:50 -0700 Subject: [PATCH 01/54] MAINT: Drop the get_offset_name method (#16863) Deprecated since 0.18.0 xref gh-11834 --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/tseries/frequencies.py | 14 -------------- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index d5cc3d6ddca8e..43bfebd0c2e59 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -126,6 +126,7 @@ Removal of prior version deprecations/changes - ``Index`` has dropped the ``.sym_diff()`` method in favor of ``.symmetric_difference()`` (:issue:`12591`) - ``Categorical`` has dropped the ``.order()`` and ``.sort()`` methods in favor of ``.sort_values()`` (:issue:`12882`) - :func:`eval` and :method:`DataFrame.eval` have changed the default of ``inplace`` from ``None`` to ``False`` (:issue:`11149`) +- The function ``get_offset_name`` has been dropped in favor of the ``.freqstr`` attribute for an offset (:issue:`11834`) .. _whatsnew_0210.performance: diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 8640f106a048a..c5f6c00a4005a 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -637,20 +637,6 @@ def get_offset(name): getOffset = get_offset -def get_offset_name(offset): - """ - Return rule name associated with a DateOffset object - - Examples - -------- - get_offset_name(BMonthEnd(1)) --> 'EOM' - """ - - msg = "get_offset_name(offset) is deprecated. Use offset.freqstr instead" - warnings.warn(msg, FutureWarning, stacklevel=2) - return offset.freqstr - - def get_standard_freq(freq): """ Return the standardized frequency string From a5477b760d939a1f62ab5d38c75bf9d802a2bcf3 Mon Sep 17 00:00:00 2001 From: Adrian Liaw Date: Mon, 10 Jul 2017 18:13:49 +0800 Subject: [PATCH 02/54] DOC: Fix missing parentheses in documentation (#16862) --- doc/source/groupby.rst | 2 +- doc/source/io.rst | 4 ++-- doc/source/whatsnew/v0.13.0.txt | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 61f43146aba85..937d682d238b3 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -933,7 +933,7 @@ The dimension of the returned result can also change: d = pd.DataFrame({"a":["x", "y"], "b":[1,2]}) def identity(df): - print df + print(df) return df d.groupby("a").apply(identity) diff --git a/doc/source/io.rst b/doc/source/io.rst index e1e82f686f182..9bf84e5419ffa 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -3194,7 +3194,7 @@ You can pass ``iterator=True`` to iterate over the unpacked results .. ipython:: python for o in pd.read_msgpack('foo.msg',iterator=True): - print o + print(o) You can pass ``append=True`` to the writer to append to an existing pack @@ -3912,7 +3912,7 @@ chunks. evens = [2,4,6,8,10] coordinates = store.select_as_coordinates('dfeq','number=evens') for c in chunks(coordinates, 2): - print store.select('dfeq',where=c) + print(store.select('dfeq',where=c)) Advanced Queries ++++++++++++++++ diff --git a/doc/source/whatsnew/v0.13.0.txt b/doc/source/whatsnew/v0.13.0.txt index 3347b05a5df37..f440be1ddd56e 100644 --- a/doc/source/whatsnew/v0.13.0.txt +++ b/doc/source/whatsnew/v0.13.0.txt @@ -790,7 +790,7 @@ Experimental .. ipython:: python for o in pd.read_msgpack('foo.msg',iterator=True): - print o + print(o) .. 
ipython:: python :suppress: From a43c1576ce3d94bc82f7cdd63531280ced5a9fa0 Mon Sep 17 00:00:00 2001 From: Guillem Borrell Date: Mon, 10 Jul 2017 12:15:08 +0200 Subject: [PATCH 03/54] BUG: rolling.quantile does not return an interpolated result (#16247) --- asv_bench/benchmarks/rolling.py | 185 ++++++++++++++++++++++++++++++++ doc/source/whatsnew/v0.21.0.txt | 5 +- pandas/_libs/window.pyx | 15 ++- pandas/core/window.py | 11 +- pandas/tests/test_window.py | 41 ++++++- 5 files changed, 249 insertions(+), 8 deletions(-) create mode 100644 asv_bench/benchmarks/rolling.py diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py new file mode 100644 index 0000000000000..9da9d0b855323 --- /dev/null +++ b/asv_bench/benchmarks/rolling.py @@ -0,0 +1,185 @@ +from .pandas_vb_common import * +import pandas as pd +import numpy as np + + +class DataframeRolling(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.Ns = 10000 + self.df = pd.DataFrame({'a': np.random.random(self.N)}) + self.dfs = pd.DataFrame({'a': np.random.random(self.Ns)}) + self.wins = 10 + self.winl = 1000 + + def time_rolling_quantile_0(self): + (self.df.rolling(self.wins).quantile(0.0)) + + def time_rolling_quantile_1(self): + (self.df.rolling(self.wins).quantile(1.0)) + + def time_rolling_quantile_median(self): + (self.df.rolling(self.wins).quantile(0.5)) + + def time_rolling_median(self): + (self.df.rolling(self.wins).median()) + + def time_rolling_median(self): + (self.df.rolling(self.wins).mean()) + + def time_rolling_max(self): + (self.df.rolling(self.wins).max()) + + def time_rolling_min(self): + (self.df.rolling(self.wins).min()) + + def time_rolling_std(self): + (self.df.rolling(self.wins).std()) + + def time_rolling_count(self): + (self.df.rolling(self.wins).count()) + + def time_rolling_skew(self): + (self.df.rolling(self.wins).skew()) + + def time_rolling_kurt(self): + (self.df.rolling(self.wins).kurt()) + + def time_rolling_sum(self): + (self.df.rolling(self.wins).sum()) + + def time_rolling_corr(self): + (self.dfs.rolling(self.wins).corr()) + + def time_rolling_cov(self): + (self.dfs.rolling(self.wins).cov()) + + def time_rolling_quantile_0_l(self): + (self.df.rolling(self.winl).quantile(0.0)) + + def time_rolling_quantile_1_l(self): + (self.df.rolling(self.winl).quantile(1.0)) + + def time_rolling_quantile_median_l(self): + (self.df.rolling(self.winl).quantile(0.5)) + + def time_rolling_median_l(self): + (self.df.rolling(self.winl).median()) + + def time_rolling_median_l(self): + (self.df.rolling(self.winl).mean()) + + def time_rolling_max_l(self): + (self.df.rolling(self.winl).max()) + + def time_rolling_min_l(self): + (self.df.rolling(self.winl).min()) + + def time_rolling_std_l(self): + (self.df.rolling(self.wins).std()) + + def time_rolling_count_l(self): + (self.df.rolling(self.wins).count()) + + def time_rolling_skew_l(self): + (self.df.rolling(self.wins).skew()) + + def time_rolling_kurt_l(self): + (self.df.rolling(self.wins).kurt()) + + def time_rolling_sum_l(self): + (self.df.rolling(self.wins).sum()) + + +class SeriesRolling(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.Ns = 10000 + self.df = pd.DataFrame({'a': np.random.random(self.N)}) + self.dfs = pd.DataFrame({'a': np.random.random(self.Ns)}) + self.sr = self.df.a + self.srs = self.dfs.a + self.wins = 10 + self.winl = 1000 + + def time_rolling_quantile_0(self): + (self.sr.rolling(self.wins).quantile(0.0)) + + def time_rolling_quantile_1(self): + (self.sr.rolling(self.wins).quantile(1.0)) + 
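An aside before the remaining benchmarks: the change these quantile benchmarks exercise lives in the pandas/_libs/window.pyx and pandas/core/window.py hunks further below. roll_quantile previously returned the order statistic at the floored rank int(quantile * (nobs - 1)); it now interpolates linearly between the two neighbouring order statistics, matching the default interpolation of Series.quantile and DataFrame.quantile, while quantiles 0.0 and 1.0 are routed to roll_min and roll_max. A minimal NumPy sketch of the new rule (illustrative only: interpolated_quantile is not pandas API, and NaN/min_periods handling is omitted):

    import numpy as np

    def interpolated_quantile(values, q):
        # Sort the window, find the lower neighbour at rank q * (n - 1),
        # then weight the two neighbouring order statistics by the
        # fractional remainder of that rank.
        vals = np.sort(np.asarray(values, dtype=float))
        n = len(vals)
        pos = q * (n - 1)
        idx = int(pos)
        if n == 1 or idx == n - 1:
            return vals[idx]
        return vals[idx] + (vals[idx + 1] - vals[idx]) * (pos - idx)

For example, interpolated_quantile([1, 2, 3, 4], 0.5) gives 2.5, agreeing with pd.Series([1, 2, 3, 4]).quantile(0.5), where the old floor-rank lookup returned 2.0.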
+ def time_rolling_quantile_median(self): + (self.sr.rolling(self.wins).quantile(0.5)) + + def time_rolling_median(self): + (self.sr.rolling(self.wins).median()) + + def time_rolling_median(self): + (self.sr.rolling(self.wins).mean()) + + def time_rolling_max(self): + (self.sr.rolling(self.wins).max()) + + def time_rolling_min(self): + (self.sr.rolling(self.wins).min()) + + def time_rolling_std(self): + (self.sr.rolling(self.wins).std()) + + def time_rolling_count(self): + (self.sr.rolling(self.wins).count()) + + def time_rolling_skew(self): + (self.sr.rolling(self.wins).skew()) + + def time_rolling_kurt(self): + (self.sr.rolling(self.wins).kurt()) + + def time_rolling_sum(self): + (self.sr.rolling(self.wins).sum()) + + def time_rolling_corr(self): + (self.srs.rolling(self.wins).corr()) + + def time_rolling_cov(self): + (self.srs.rolling(self.wins).cov()) + + def time_rolling_quantile_0_l(self): + (self.sr.rolling(self.winl).quantile(0.0)) + + def time_rolling_quantile_1_l(self): + (self.sr.rolling(self.winl).quantile(1.0)) + + def time_rolling_quantile_median_l(self): + (self.sr.rolling(self.winl).quantile(0.5)) + + def time_rolling_median_l(self): + (self.sr.rolling(self.winl).median()) + + def time_rolling_median_l(self): + (self.sr.rolling(self.winl).mean()) + + def time_rolling_max_l(self): + (self.sr.rolling(self.winl).max()) + + def time_rolling_min_l(self): + (self.sr.rolling(self.winl).min()) + + def time_rolling_std_l(self): + (self.sr.rolling(self.wins).std()) + + def time_rolling_count_l(self): + (self.sr.rolling(self.wins).count()) + + def time_rolling_skew_l(self): + (self.sr.rolling(self.wins).skew()) + + def time_rolling_kurt_l(self): + (self.sr.rolling(self.wins).kurt()) + + def time_rolling_sum_l(self): + (self.sr.rolling(self.wins).sum()) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 43bfebd0c2e59..1edbf1638d233 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -168,9 +168,11 @@ Plotting Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug in ``DataFrame.resample().size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`) +- Bug in ``DataFrame.resample().size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`) - Bug in ``infer_freq`` causing indices with 2-day gaps during the working week to be wrongly inferred as business daily (:issue:`16624`) +- Bug in ``.rolling.quantile()`` which incorrectly used different defaults than :func:`Series.quantile()` and :func:`DataFrame.quantile()` (:issue:`9413`, :issue:`16211`) + Sparse ^^^^^^ @@ -191,6 +193,7 @@ Categorical ^^^^^^^^^^^ + Other ^^^^^ - Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`) diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 3bb8abe26c781..2450eea5500cd 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1348,8 +1348,9 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, bint is_variable ndarray[int64_t] start, end ndarray[double_t] output + double vlow, vhigh - if quantile < 0.0 or quantile > 1.0: + if quantile <= 0.0 or quantile >= 1.0: raise ValueError("quantile value {0} not in [0, 1]".format(quantile)) # we use the Fixed/Variable Indexer here as the @@ -1391,7 +1392,17 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, if nobs >= minp: idx = int(quantile * (nobs - 1)) - output[i] = skiplist.get(idx) + + # Single value in skip list + if nobs == 1: + 
output[i] = skiplist.get(0) + + # Interpolated quantile + else: + vlow = skiplist.get(idx) + vhigh = skiplist.get(idx + 1) + output[i] = (vlow + (vhigh - vlow) * + (quantile * (nobs - 1) - idx)) else: output[i] = NaN diff --git a/pandas/core/window.py b/pandas/core/window.py index 02b508bb94e4c..57611794c375f 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -975,8 +975,15 @@ def quantile(self, quantile, **kwargs): def f(arg, *args, **kwargs): minp = _use_window(self.min_periods, window) - return _window.roll_quantile(arg, window, minp, indexi, - self.closed, quantile) + if quantile == 1.0: + return _window.roll_max(arg, window, minp, indexi, + self.closed) + elif quantile == 0.0: + return _window.roll_min(arg, window, minp, indexi, + self.closed) + else: + return _window.roll_quantile(arg, window, minp, indexi, + self.closed, quantile) return self._apply(f, 'quantile', quantile=quantile, **kwargs) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 9c3765ffdb716..3ba5d2065cddf 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -1122,8 +1122,19 @@ def test_rolling_quantile(self): def scoreatpercentile(a, per): values = np.sort(a, axis=0) - idx = per / 1. * (values.shape[0] - 1) - return values[int(idx)] + idx = int(per / 1. * (values.shape[0] - 1)) + + if idx == values.shape[0] - 1: + retval = values[-1] + + else: + qlow = float(idx) / float(values.shape[0] - 1) + qhig = float(idx + 1) / float(values.shape[0] - 1) + vlow = values[idx] + vhig = values[idx + 1] + retval = vlow + (vhig - vlow) * (per - qlow) / (qhig - qlow) + + return retval for q in qs: @@ -1138,6 +1149,30 @@ def alt(x): self._check_moment_func(f, alt, name='quantile', quantile=q) + def test_rolling_quantile_np_percentile(self): + # #9413: Tests that rolling window's quantile default behavior + # is analogus to Numpy's percentile + row = 10 + col = 5 + idx = pd.date_range(20100101, periods=row, freq='B') + df = pd.DataFrame(np.random.rand(row * col).reshape((row, -1)), + index=idx) + + df_quantile = df.quantile([0.25, 0.5, 0.75], axis=0) + np_percentile = np.percentile(df, [25, 50, 75], axis=0) + + tm.assert_almost_equal(df_quantile.values, np.array(np_percentile)) + + def test_rolling_quantile_series(self): + # #16211: Tests that rolling window's quantile default behavior + # is analogus to pd.Series' quantile + arr = np.arange(100) + s = pd.Series(arr) + q1 = s.quantile(0.1) + q2 = s.rolling(100).quantile(0.1).iloc[-1] + + tm.assert_almost_equal(q1, q2) + def test_rolling_quantile_param(self): ser = Series([0.0, .1, .5, .9, 1.0]) @@ -3558,7 +3593,7 @@ def test_ragged_quantile(self): result = df.rolling(window='2s', min_periods=1).quantile(0.5) expected = df.copy() - expected['B'] = [0.0, 1, 1.0, 3.0, 3.0] + expected['B'] = [0.0, 1, 1.5, 3.0, 3.5] tm.assert_frame_equal(result, expected) def test_ragged_std(self): From 3e20eab7ad5639810b4824790cd559367b326b0b Mon Sep 17 00:00:00 2001 From: Keiron Pizzey Date: Mon, 10 Jul 2017 11:36:01 +0100 Subject: [PATCH 04/54] ENH - Modify Dataframe.select_dtypes to accept scalar values (#16860) --- doc/source/basics.rst | 4 - doc/source/style.ipynb | 2 +- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/frame.py | 26 +++--- pandas/tests/frame/test_dtypes.py | 130 +++++++++++++++++++++++++----- 5 files changed, 130 insertions(+), 33 deletions(-) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 134cc5106015b..d8b1602fb104d 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -2229,7 +2229,3 @@ 
All numpy dtypes are subclasses of ``numpy.generic``: Pandas also defines the types ``category``, and ``datetime64[ns, tz]``, which are not integrated into the normal numpy hierarchy and wont show up with the above function. - -.. note:: - - The ``include`` and ``exclude`` parameters must be non-string sequences. diff --git a/doc/source/style.ipynb b/doc/source/style.ipynb index 4eeda491426b1..c250787785e14 100644 --- a/doc/source/style.ipynb +++ b/doc/source/style.ipynb @@ -935,7 +935,7 @@ "\n", "*Experimental: This is a new feature and still under development. We'll be adding features and possibly making breaking changes in future releases. We'd love to hear your feedback.*\n", "\n", - "Some support is available for exporting styled `DataFrames` to Excel worksheets using the `OpenPyXL` engine. CSS2.2 properties handled include:\n", + "Some support is available for exporting styled `DataFrames` to Excel worksheets using the `OpenPyXL` engine. CSS2.2 properties handled include:\n", "\n", "- `background-color`\n", "- `border-style`, `border-width`, `border-color` and their {`top`, `right`, `bottom`, `left` variants}\n", diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 1edbf1638d233..8c71681582063 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -39,6 +39,7 @@ Other Enhancements - :func:`read_feather` has gained the ``nthreads`` parameter for multi-threaded operations (:issue:`16359`) - :func:`DataFrame.clip()` and :func:`Series.clip()` have gained an ``inplace`` argument. (:issue:`15388`) - :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when ``margins=True``. (:issue:`15972`) +- :func:`Dataframe.select_dtypes` now accepts scalar values for include/exclude as well as list-like. (:issue:`16855`) .. _whatsnew_0210.api_breaking: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 80cdebc24c39d..6559fc4c24ce2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2285,9 +2285,9 @@ def select_dtypes(self, include=None, exclude=None): Parameters ---------- - include, exclude : list-like - A list of dtypes or strings to be included/excluded. You must pass - in a non-empty sequence for at least one of these. + include, exclude : scalar or list-like + A selection of dtypes or strings to be included/excluded. At least + one of these parameters must be supplied. Raises ------ @@ -2295,8 +2295,6 @@ def select_dtypes(self, include=None, exclude=None): * If both of ``include`` and ``exclude`` are empty * If ``include`` and ``exclude`` have overlapping elements * If any kind of string dtype is passed in. 
- TypeError - * If either of ``include`` or ``exclude`` is not a sequence Returns ------- @@ -2331,6 +2329,14 @@ def select_dtypes(self, include=None, exclude=None): 3 0.0764 False 2 4 -0.9703 True 1 5 -1.2094 False 2 + >>> df.select_dtypes(include='bool') + c + 0 True + 1 False + 2 True + 3 False + 4 True + 5 False >>> df.select_dtypes(include=['float64']) c 0 1 @@ -2348,10 +2354,12 @@ def select_dtypes(self, include=None, exclude=None): 4 True 5 False """ - include, exclude = include or (), exclude or () - if not (is_list_like(include) and is_list_like(exclude)): - raise TypeError('include and exclude must both be non-string' - ' sequences') + + if not is_list_like(include): + include = (include,) if include is not None else () + if not is_list_like(exclude): + exclude = (exclude,) if exclude is not None else () + selection = tuple(map(frozenset, (include, exclude))) if not any(selection): diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 335b76ff2aade..065580d56a683 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -104,7 +104,7 @@ def test_dtypes_are_correct_after_column_slice(self): ('b', np.float_), ('c', np.float_)]))) - def test_select_dtypes_include(self): + def test_select_dtypes_include_using_list_like(self): df = DataFrame({'a': list('abc'), 'b': list(range(1, 4)), 'c': np.arange(3, 6).astype('u1'), @@ -145,14 +145,10 @@ def test_select_dtypes_include(self): ei = df[['h', 'i']] assert_frame_equal(ri, ei) - ri = df.select_dtypes(include=['timedelta']) - ei = df[['k']] - assert_frame_equal(ri, ei) - pytest.raises(NotImplementedError, lambda: df.select_dtypes(include=['period'])) - def test_select_dtypes_exclude(self): + def test_select_dtypes_exclude_using_list_like(self): df = DataFrame({'a': list('abc'), 'b': list(range(1, 4)), 'c': np.arange(3, 6).astype('u1'), @@ -162,7 +158,7 @@ def test_select_dtypes_exclude(self): ee = df[['a', 'e']] assert_frame_equal(re, ee) - def test_select_dtypes_exclude_include(self): + def test_select_dtypes_exclude_include_using_list_like(self): df = DataFrame({'a': list('abc'), 'b': list(range(1, 4)), 'c': np.arange(3, 6).astype('u1'), @@ -181,6 +177,114 @@ def test_select_dtypes_exclude_include(self): e = df[['b', 'e']] assert_frame_equal(r, e) + def test_select_dtypes_include_using_scalars(self): + df = DataFrame({'a': list('abc'), + 'b': list(range(1, 4)), + 'c': np.arange(3, 6).astype('u1'), + 'd': np.arange(4.0, 7.0, dtype='float64'), + 'e': [True, False, True], + 'f': pd.Categorical(list('abc')), + 'g': pd.date_range('20130101', periods=3), + 'h': pd.date_range('20130101', periods=3, + tz='US/Eastern'), + 'i': pd.date_range('20130101', periods=3, + tz='CET'), + 'j': pd.period_range('2013-01', periods=3, + freq='M'), + 'k': pd.timedelta_range('1 day', periods=3)}) + + ri = df.select_dtypes(include=np.number) + ei = df[['b', 'c', 'd', 'k']] + assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include='datetime') + ei = df[['g']] + assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include='datetime64') + ei = df[['g']] + assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include='category') + ei = df[['f']] + assert_frame_equal(ri, ei) + + pytest.raises(NotImplementedError, + lambda: df.select_dtypes(include='period')) + + def test_select_dtypes_exclude_using_scalars(self): + df = DataFrame({'a': list('abc'), + 'b': list(range(1, 4)), + 'c': np.arange(3, 6).astype('u1'), + 'd': np.arange(4.0, 7.0, dtype='float64'), + 'e': [True, False, True], + 'f': 
pd.Categorical(list('abc')), + 'g': pd.date_range('20130101', periods=3), + 'h': pd.date_range('20130101', periods=3, + tz='US/Eastern'), + 'i': pd.date_range('20130101', periods=3, + tz='CET'), + 'j': pd.period_range('2013-01', periods=3, + freq='M'), + 'k': pd.timedelta_range('1 day', periods=3)}) + + ri = df.select_dtypes(exclude=np.number) + ei = df[['a', 'e', 'f', 'g', 'h', 'i', 'j']] + assert_frame_equal(ri, ei) + + ri = df.select_dtypes(exclude='category') + ei = df[['a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'j', 'k']] + assert_frame_equal(ri, ei) + + pytest.raises(NotImplementedError, + lambda: df.select_dtypes(exclude='period')) + + def test_select_dtypes_include_exclude_using_scalars(self): + df = DataFrame({'a': list('abc'), + 'b': list(range(1, 4)), + 'c': np.arange(3, 6).astype('u1'), + 'd': np.arange(4.0, 7.0, dtype='float64'), + 'e': [True, False, True], + 'f': pd.Categorical(list('abc')), + 'g': pd.date_range('20130101', periods=3), + 'h': pd.date_range('20130101', periods=3, + tz='US/Eastern'), + 'i': pd.date_range('20130101', periods=3, + tz='CET'), + 'j': pd.period_range('2013-01', periods=3, + freq='M'), + 'k': pd.timedelta_range('1 day', periods=3)}) + + ri = df.select_dtypes(include=np.number, exclude='floating') + ei = df[['b', 'c', 'k']] + assert_frame_equal(ri, ei) + + def test_select_dtypes_include_exclude_mixed_scalars_lists(self): + df = DataFrame({'a': list('abc'), + 'b': list(range(1, 4)), + 'c': np.arange(3, 6).astype('u1'), + 'd': np.arange(4.0, 7.0, dtype='float64'), + 'e': [True, False, True], + 'f': pd.Categorical(list('abc')), + 'g': pd.date_range('20130101', periods=3), + 'h': pd.date_range('20130101', periods=3, + tz='US/Eastern'), + 'i': pd.date_range('20130101', periods=3, + tz='CET'), + 'j': pd.period_range('2013-01', periods=3, + freq='M'), + 'k': pd.timedelta_range('1 day', periods=3)}) + + ri = df.select_dtypes(include=np.number, + exclude=['floating', 'timedelta']) + ei = df[['b', 'c']] + assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include=[np.number, 'category'], + exclude='floating') + ei = df[['b', 'c', 'f', 'k']] + assert_frame_equal(ri, ei) + def test_select_dtypes_not_an_attr_but_still_valid_dtype(self): df = DataFrame({'a': list('abc'), 'b': list(range(1, 4)), @@ -205,18 +309,6 @@ def test_select_dtypes_empty(self): 'must be nonempty'): df.select_dtypes() - def test_select_dtypes_raises_on_string(self): - df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))}) - with tm.assert_raises_regex(TypeError, 'include and exclude ' - '.+ non-'): - df.select_dtypes(include='object') - with tm.assert_raises_regex(TypeError, 'include and exclude ' - '.+ non-'): - df.select_dtypes(exclude='object') - with tm.assert_raises_regex(TypeError, 'include and exclude ' - '.+ non-'): - df.select_dtypes(include=int, exclude='object') - def test_select_dtypes_bad_datetime64(self): df = DataFrame({'a': list('abc'), 'b': list(range(1, 4)), From f4b12d8488434d5f9a45fba1cbe7ad5a77c776ff Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 10 Jul 2017 06:36:32 -0400 Subject: [PATCH 05/54] COMPAT: moar 32-bit compat for testing of indexers (#16869) xref #16826 --- pandas/tests/indexes/test_category.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 9dc2cfdecb98f..14f344acbefb2 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -393,7 +393,7 @@ def test_reindex_dtype(self): res, indexer = c.reindex(['a', 'c']) 
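        # Annotation, not part of the hunk above: np.intp is C's ssize_t,
        # i.e. 32 bits on 32-bit builds and 64 bits on 64-bit builds.
        # Indexer arrays returned by pandas follow the platform pointer
        # size, so the expectation below is built with dtype=np.intp
        # rather than a hard-coded np.int64; np.dtype(np.intp).itemsize * 8
        # evaluates to 32 or 64 depending on the build.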
         tm.assert_index_equal(res, Index(['a', 'a', 'c']), exact=True)
         tm.assert_numpy_array_equal(indexer,
-                                    np.array([0, 3, 2], dtype=np.int64))
+                                    np.array([0, 3, 2], dtype=np.intp))

         c = CategoricalIndex(['a', 'b', 'c', 'a'])
         res, indexer = c.reindex(Categorical(['a', 'c']))

From 114feb9290c684b5e5b3a2456307f9116372e89f Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 11 Jul 2017 03:01:12 -0700
Subject: [PATCH 06/54] Confirm that select was *not* clearer in 0.12 (#16878)

---
 pandas/core/generic.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 7d1a8adf381fe..5722539b87aec 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2098,7 +2098,6 @@ def xs(self, key, axis=0, level=None, drop_level=True):

     _xs = xs

-    # TODO: Check if this was clearer in 0.12
     def select(self, crit, axis=0):
         """
         Return data corresponding to axis labels matching criteria

From 6a85e88bee498e7e218f0eeb766f15b9d78e9eaa Mon Sep 17 00:00:00 2001
From: topper-123
Date: Tue, 11 Jul 2017 11:08:57 +0100
Subject: [PATCH 07/54] Added tests for _get_dtype (#16845)

---
 pandas/tests/dtypes/test_common.py | 39 ++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
index ba510e68f9a21..c32e8590c5675 100644
--- a/pandas/tests/dtypes/test_common.py
+++ b/pandas/tests/dtypes/test_common.py
@@ -524,3 +524,42 @@ def test_is_complex_dtype():

     assert com.is_complex_dtype(np.complex)
     assert com.is_complex_dtype(np.array([1 + 1j, 5]))
+
+
+@pytest.mark.parametrize('input_param,result', [
+    (int, np.dtype(int)),
+    ('int32', np.dtype('int32')),
+    (float, np.dtype(float)),
+    ('float64', np.dtype('float64')),
+    (np.dtype('float64'), np.dtype('float64')),
+    pytest.mark.xfail((str, np.dtype(' Date: Tue, 11 Jul 2017 12:40:50 +0200
Subject: [PATCH 08/54] BUG: Series.isin fails on categoricals (#16858)

---
 doc/source/whatsnew/v0.21.0.txt |  2 +-
 pandas/core/algorithms.py       |  4 ++--
 pandas/tests/test_algos.py      | 10 ++++++++++
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index 8c71681582063..015fdf1f45f47 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -192,7 +192,7 @@ Numeric
 Categorical
 ^^^^^^^^^^^

-
+- Bug in :func:`Series.isin` when called with a categorical (:issue:`16639`)


 Other

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index d74c5e66ea1a9..b490bf787a037 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -38,7 +38,6 @@
 # --------------- #
 # dtype access    #
 # --------------- #
-
 def _ensure_data(values, dtype=None):
     """
     routine to ensure that our data is of the correct
@@ -113,7 +112,8 @@ def _ensure_data(values, dtype=None):

         return values.asi8, dtype, 'int64'

-    elif is_categorical_dtype(values) or is_categorical_dtype(dtype):
+    elif (is_categorical_dtype(values) and
+          (is_categorical_dtype(dtype) or dtype is None)):
         values = getattr(values, 'values', values)
         values = values.codes
         dtype = 'category'

diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 063dcea5c76d6..9504d2a9426f0 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -586,6 +586,16 @@ def test_large(self):
         expected[1] = True
         tm.assert_numpy_array_equal(result, expected)

+    def test_categorical_from_codes(self):
+        # GH 16639
+        vals = np.array([0, 1, 2, 0])
+        cats = ['a', 'b', 'c']
+        Sd = 
pd.Series(pd.Categorical(1).from_codes(vals, cats)) + St = pd.Series(pd.Categorical(1).from_codes(np.array([0, 1]), cats)) + expected = np.array([True, True, False, True]) + result = algos.isin(Sd, St) + tm.assert_numpy_array_equal(expected, result) + class TestValueCounts(object): From 55af1ab626baf62dbbc00c2521c20be29b819a06 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 11 Jul 2017 12:39:39 -0400 Subject: [PATCH 09/54] COMPAT with dateutil 2.6.1, fixed ambiguous tz dst behavior (#16880) --- ci/requirements-3.5.run | 1 - ci/requirements-3.5.sh | 4 ++++ ci/requirements-3.6_NUMPY_DEV.run | 1 - pandas/tests/tseries/test_offsets.py | 5 ++++- pandas/tests/tseries/test_timezones.py | 21 +++++++++++++++++---- 5 files changed, 25 insertions(+), 7 deletions(-) diff --git a/ci/requirements-3.5.run b/ci/requirements-3.5.run index 43e6814ed6c8e..52828b5220997 100644 --- a/ci/requirements-3.5.run +++ b/ci/requirements-3.5.run @@ -1,4 +1,3 @@ -python-dateutil pytz numpy=1.11.3 openpyxl diff --git a/ci/requirements-3.5.sh b/ci/requirements-3.5.sh index d0f0b81802dc6..917439a8765a2 100644 --- a/ci/requirements-3.5.sh +++ b/ci/requirements-3.5.sh @@ -5,3 +5,7 @@ source activate pandas echo "install 35" conda install -n pandas -c conda-forge feather-format + +# pip install python-dateutil to get latest +conda remove -n pandas python-dateutil --force +pip install python-dateutil diff --git a/ci/requirements-3.6_NUMPY_DEV.run b/ci/requirements-3.6_NUMPY_DEV.run index 0aa987baefb1d..af44f198c687e 100644 --- a/ci/requirements-3.6_NUMPY_DEV.run +++ b/ci/requirements-3.6_NUMPY_DEV.run @@ -1,2 +1 @@ -python-dateutil pytz diff --git a/pandas/tests/tseries/test_offsets.py b/pandas/tests/tseries/test_offsets.py index 47b15a2b66fc4..e03b3e0a85e5e 100644 --- a/pandas/tests/tseries/test_offsets.py +++ b/pandas/tests/tseries/test_offsets.py @@ -4844,7 +4844,7 @@ def test_fallback_plural(self): hrs_pre = utc_offsets['utc_offset_daylight'] hrs_post = utc_offsets['utc_offset_standard'] - if dateutil.__version__ != LooseVersion('2.6.0'): + if dateutil.__version__ < LooseVersion('2.6.0'): # buggy ambiguous behavior in 2.6.0 # GH 14621 # https://github.com/dateutil/dateutil/issues/321 @@ -4852,6 +4852,9 @@ def test_fallback_plural(self): n=3, tstart=self._make_timestamp(self.ts_pre_fallback, hrs_pre, tz), expected_utc_offset=hrs_post) + elif dateutil.__version__ > LooseVersion('2.6.0'): + # fixed, but skip the test + continue def test_springforward_plural(self): # test moving from standard to daylight savings diff --git a/pandas/tests/tseries/test_timezones.py b/pandas/tests/tseries/test_timezones.py index de6978d52968b..c034a9c60ef1b 100644 --- a/pandas/tests/tseries/test_timezones.py +++ b/pandas/tests/tseries/test_timezones.py @@ -552,8 +552,16 @@ def f(): tz=tz, ambiguous='infer') assert times[0] == Timestamp('2013-10-26 23:00', tz=tz, freq="H") - if dateutil.__version__ != LooseVersion('2.6.0'): - # see gh-14621 + if str(tz).startswith('dateutil'): + if dateutil.__version__ < LooseVersion('2.6.0'): + # see gh-14621 + assert times[-1] == Timestamp('2013-10-27 01:00:00+0000', + tz=tz, freq="H") + elif dateutil.__version__ > LooseVersion('2.6.0'): + # fixed ambiguous behavior + assert times[-1] == Timestamp('2013-10-27 01:00:00+0100', + tz=tz, freq="H") + else: assert times[-1] == Timestamp('2013-10-27 01:00:00+0000', tz=tz, freq="H") @@ -1233,13 +1241,18 @@ def test_ambiguous_compat(self): assert result_pytz.value == result_dateutil.value assert result_pytz.value == 1382835600000000000 - # dateutil 2.6 
buggy w.r.t. ambiguous=0 - if dateutil.__version__ != LooseVersion('2.6.0'): + if dateutil.__version__ < LooseVersion('2.6.0'): + # dateutil 2.6 buggy w.r.t. ambiguous=0 # see gh-14621 # see https://github.com/dateutil/dateutil/issues/321 assert (result_pytz.to_pydatetime().tzname() == result_dateutil.to_pydatetime().tzname()) assert str(result_pytz) == str(result_dateutil) + elif dateutil.__version__ > LooseVersion('2.6.0'): + # fixed ambiguous behavior + assert result_pytz.to_pydatetime().tzname() == 'GMT' + assert result_dateutil.to_pydatetime().tzname() == 'BST' + assert str(result_pytz) != str(result_dateutil) # 1 hour difference result_pytz = (Timestamp('2013-10-27 01:00:00') From a9421af1aac906cc38d025ed5db4a2b55cb8b9bc Mon Sep 17 00:00:00 2001 From: Jean Helie Date: Tue, 11 Jul 2017 17:40:20 +0100 Subject: [PATCH 10/54] fix wrongly named method (#16881) --- asv_bench/benchmarks/rolling.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 9da9d0b855323..899349cd21f84 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -26,7 +26,7 @@ def time_rolling_quantile_median(self): def time_rolling_median(self): (self.df.rolling(self.wins).median()) - def time_rolling_median(self): + def time_rolling_mean(self): (self.df.rolling(self.wins).mean()) def time_rolling_max(self): @@ -68,7 +68,7 @@ def time_rolling_quantile_median_l(self): def time_rolling_median_l(self): (self.df.rolling(self.winl).median()) - def time_rolling_median_l(self): + def time_rolling_mean_l(self): (self.df.rolling(self.winl).mean()) def time_rolling_max_l(self): @@ -118,7 +118,7 @@ def time_rolling_quantile_median(self): def time_rolling_median(self): (self.sr.rolling(self.wins).median()) - def time_rolling_median(self): + def time_rolling_mean(self): (self.sr.rolling(self.wins).mean()) def time_rolling_max(self): @@ -160,7 +160,7 @@ def time_rolling_quantile_median_l(self): def time_rolling_median_l(self): (self.sr.rolling(self.winl).median()) - def time_rolling_median_l(self): + def time_rolling_mean_l(self): (self.sr.rolling(self.winl).mean()) def time_rolling_max_l(self): From 9d13227345882daaa90f03078c09a9b44a18ce72 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 12 Jul 2017 10:51:07 -0500 Subject: [PATCH 11/54] TST/PKG: Removed pandas.util.testing.slow definition (#16852) --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/tests/computation/test_eval.py | 16 +-- pandas/tests/frame/test_repr_info.py | 5 +- pandas/tests/frame/test_to_csv.py | 8 +- pandas/tests/indexing/test_indexing_slow.py | 7 +- pandas/tests/io/parser/common.py | 2 +- pandas/tests/io/test_excel.py | 2 +- pandas/tests/io/test_html.py | 36 ++--- pandas/tests/plotting/test_boxplot_method.py | 23 ++-- pandas/tests/plotting/test_datetimelike.py | 102 +++++++------- pandas/tests/plotting/test_deprecated.py | 10 +- pandas/tests/plotting/test_frame.py | 137 +++++++++---------- pandas/tests/plotting/test_hist_method.py | 35 +++-- pandas/tests/plotting/test_misc.py | 17 ++- pandas/tests/plotting/test_series.py | 59 ++++---- pandas/tests/series/test_indexing.py | 5 +- pandas/tests/test_expressions.py | 10 +- pandas/tests/test_window.py | 6 +- pandas/util/testing.py | 7 - 19 files changed, 239 insertions(+), 249 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 015fdf1f45f47..a5ee0e0ce2653 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt 
@@ -101,6 +101,7 @@ the target. Now, a ``ValueError`` will be raised when such an input is passed in - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`). - Compression defaults in HDF stores now follow pytable standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`) - ``Index.get_indexer_non_unique()`` now returns a ndarray indexer rather than an ``Index``; this is consistent with ``Index.get_indexer()`` (:issue:`16819`) +- Removed the ``@slow`` decorator from ``pandas.util.testing``, which caused issues for some downstream packages' test suites. Use ``@pytest.mark.slow`` instead, which achieves the same thing (:issue:`16850`) .. _whatsnew_0210.api: diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 589f612802fb9..7fc091ebb1892 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -30,7 +30,7 @@ import pandas.util.testing as tm from pandas.util.testing import (assert_frame_equal, randbool, assert_numpy_array_equal, assert_series_equal, - assert_produces_warning, slow) + assert_produces_warning) from pandas.compat import PY3, reduce _series_frame_incompatible = _bool_ops_syms @@ -144,7 +144,7 @@ def teardown_method(self, method): del self.lhses, self.rhses, self.scalar_rhses, self.scalar_lhses del self.pandas_rhses, self.pandas_lhses, self.current_engines - @slow + @pytest.mark.slow def test_complex_cmp_ops(self): cmp_ops = ('!=', '==', '<=', '>=', '<', '>') cmp2_ops = ('>', '<') @@ -161,7 +161,7 @@ def test_simple_cmp_ops(self): for lhs, rhs, cmp_op in product(bool_lhses, bool_rhses, self.cmp_ops): self.check_simple_cmp_op(lhs, cmp_op, rhs) - @slow + @pytest.mark.slow def test_binary_arith_ops(self): for lhs, op, rhs in product(self.lhses, self.arith_ops, self.rhses): self.check_binary_arith_op(lhs, op, rhs) @@ -181,17 +181,17 @@ def test_pow(self): for lhs, rhs in product(self.lhses, self.rhses): self.check_pow(lhs, '**', rhs) - @slow + @pytest.mark.slow def test_single_invert_op(self): for lhs, op, rhs in product(self.lhses, self.cmp_ops, self.rhses): self.check_single_invert_op(lhs, op, rhs) - @slow + @pytest.mark.slow def test_compound_invert_op(self): for lhs, op, rhs in product(self.lhses, self.cmp_ops, self.rhses): self.check_compound_invert_op(lhs, op, rhs) - @slow + @pytest.mark.slow def test_chained_cmp_op(self): mids = self.lhses cmp_ops = '<', '>' @@ -870,7 +870,7 @@ def test_frame_comparison(self, engine, parser): res = pd.eval('df < df3', engine=engine, parser=parser) assert_frame_equal(res, df < df3) - @slow + @pytest.mark.slow def test_medium_complex_frame_alignment(self, engine, parser): args = product(self.lhs_index_types, self.index_types, self.index_types, self.index_types) @@ -974,7 +974,7 @@ def test_series_frame_commutativity(self, engine, parser): if engine == 'numexpr': assert_frame_equal(a, b) - @slow + @pytest.mark.slow def test_complex_series_frame_alignment(self, engine, parser): import random args = product(self.lhs_index_types, self.index_types, diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index cc37f8cc3cb02..c317ad542659a 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -8,6 +8,7 @@ from numpy import nan import numpy as np +import pytest from pandas import (DataFrame, compat, option_context) from 
pandas.compat import StringIO, lrange, u @@ -40,7 +41,7 @@ def test_repr_mixed(self): foo = repr(self.mixed_frame) # noqa self.mixed_frame.info(verbose=False, buf=buf) - @tm.slow + @pytest.mark.slow def test_repr_mixed_big(self): # big mixed biggie = DataFrame({'A': np.random.randn(200), @@ -87,7 +88,7 @@ def test_repr_dimensions(self): with option_context('display.show_dimensions', 'truncate'): assert "2 rows x 2 columns" not in repr(df) - @tm.slow + @pytest.mark.slow def test_repr_big(self): # big one biggie = DataFrame(np.zeros((200, 4)), columns=lrange(4), diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 69bd2b008416f..6a4b1686a31e2 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -17,7 +17,7 @@ from pandas.util.testing import (assert_almost_equal, assert_series_equal, assert_frame_equal, - ensure_clean, slow, + ensure_clean, makeCustomDataframe as mkdf) import pandas.util.testing as tm @@ -205,7 +205,7 @@ def _check_df(df, cols=None): cols = ['b', 'a'] _check_df(df, cols) - @slow + @pytest.mark.slow def test_to_csv_dtnat(self): # GH3437 from pandas import NaT @@ -236,7 +236,7 @@ def make_dtnat_arr(n, nnat=None): assert_frame_equal(df, recons, check_names=False, check_less_precise=True) - @slow + @pytest.mark.slow def test_to_csv_moar(self): def _do_test(df, r_dtype=None, c_dtype=None, @@ -728,7 +728,7 @@ def test_to_csv_chunking(self): rs = read_csv(filename, index_col=0) assert_frame_equal(rs, aa) - @slow + @pytest.mark.slow def test_to_csv_wide_frame_formatting(self): # Issue #8621 df = DataFrame(np.random.randn(1, 100010), columns=None, index=None) diff --git a/pandas/tests/indexing/test_indexing_slow.py b/pandas/tests/indexing/test_indexing_slow.py index 08d390a6a213e..1b3fb18d9ff1d 100644 --- a/pandas/tests/indexing/test_indexing_slow.py +++ b/pandas/tests/indexing/test_indexing_slow.py @@ -6,11 +6,12 @@ import pandas as pd from pandas.core.api import Series, DataFrame, MultiIndex import pandas.util.testing as tm +import pytest class TestIndexingSlow(object): - @tm.slow + @pytest.mark.slow def test_multiindex_get_loc(self): # GH7724, GH2646 with warnings.catch_warnings(record=True): @@ -80,7 +81,7 @@ def loop(mi, df, keys): assert not mi.index.lexsort_depth < i loop(mi, df, keys) - @tm.slow + @pytest.mark.slow def test_large_dataframe_indexing(self): # GH10692 result = DataFrame({'x': range(10 ** 6)}, dtype='int64') @@ -88,7 +89,7 @@ def test_large_dataframe_indexing(self): expected = DataFrame({'x': range(10 ** 6 + 1)}, dtype='int64') tm.assert_frame_equal(result, expected) - @tm.slow + @pytest.mark.slow def test_large_mi_dataframe_indexing(self): # GH10645 result = MultiIndex.from_arrays([range(10 ** 6), range(10 ** 6)]) diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index 4b4f44b44c163..584a6561b505b 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -664,7 +664,7 @@ def test_url(self): tm.assert_frame_equal(url_table, local_table) # TODO: ftp testing - @tm.slow + @pytest.mark.slow def test_file(self): dirpath = tm.get_data_path() localtable = os.path.join(dirpath, 'salaries.csv') diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index abe3757ec64f3..856e8d6466526 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -614,7 +614,7 @@ def test_read_from_s3_url(self): local_table = self.get_exceldf('test1') tm.assert_frame_equal(url_table, local_table) - @tm.slow + 
@pytest.mark.slow def test_read_from_file_url(self): # FILE diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 1e1d653cf94d1..4ef265dcd5113 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -130,7 +130,7 @@ def test_spam_url(self): assert_framelist_equal(df1, df2) - @tm.slow + @pytest.mark.slow def test_banklist(self): df1 = self.read_html(self.banklist_data, '.*Florida.*', attrs={'id': 'table'}) @@ -292,7 +292,7 @@ def test_invalid_url(self): except ValueError as e: assert str(e) == 'No tables found' - @tm.slow + @pytest.mark.slow def test_file_url(self): url = self.banklist_data dfs = self.read_html(file_path_to_url(url), 'First', @@ -301,7 +301,7 @@ def test_file_url(self): for df in dfs: assert isinstance(df, DataFrame) - @tm.slow + @pytest.mark.slow def test_invalid_table_attrs(self): url = self.banklist_data with tm.assert_raises_regex(ValueError, 'No tables found'): @@ -312,39 +312,39 @@ def _bank_data(self, *args, **kwargs): return self.read_html(self.banklist_data, 'Metcalf', attrs={'id': 'table'}, *args, **kwargs) - @tm.slow + @pytest.mark.slow def test_multiindex_header(self): df = self._bank_data(header=[0, 1])[0] assert isinstance(df.columns, MultiIndex) - @tm.slow + @pytest.mark.slow def test_multiindex_index(self): df = self._bank_data(index_col=[0, 1])[0] assert isinstance(df.index, MultiIndex) - @tm.slow + @pytest.mark.slow def test_multiindex_header_index(self): df = self._bank_data(header=[0, 1], index_col=[0, 1])[0] assert isinstance(df.columns, MultiIndex) assert isinstance(df.index, MultiIndex) - @tm.slow + @pytest.mark.slow def test_multiindex_header_skiprows_tuples(self): df = self._bank_data(header=[0, 1], skiprows=1, tupleize_cols=True)[0] assert isinstance(df.columns, Index) - @tm.slow + @pytest.mark.slow def test_multiindex_header_skiprows(self): df = self._bank_data(header=[0, 1], skiprows=1)[0] assert isinstance(df.columns, MultiIndex) - @tm.slow + @pytest.mark.slow def test_multiindex_header_index_skiprows(self): df = self._bank_data(header=[0, 1], index_col=[0, 1], skiprows=1)[0] assert isinstance(df.index, MultiIndex) assert isinstance(df.columns, MultiIndex) - @tm.slow + @pytest.mark.slow def test_regex_idempotency(self): url = self.banklist_data dfs = self.read_html(file_path_to_url(url), @@ -372,7 +372,7 @@ def test_python_docs_table(self): zz = [df.iloc[0, 0][0:4] for df in dfs] assert sorted(zz) == sorted(['Repo', 'What']) - @tm.slow + @pytest.mark.slow def test_thousands_macau_stats(self): all_non_nan_table_index = -2 macau_data = os.path.join(DATA_PATH, 'macau.html') @@ -382,7 +382,7 @@ def test_thousands_macau_stats(self): assert not any(s.isnull().any() for _, s in df.iteritems()) - @tm.slow + @pytest.mark.slow def test_thousands_macau_index_col(self): all_non_nan_table_index = -2 macau_data = os.path.join(DATA_PATH, 'macau.html') @@ -523,7 +523,7 @@ def test_nyse_wsj_commas_table(self): assert df.shape[0] == nrows tm.assert_index_equal(df.columns, columns) - @tm.slow + @pytest.mark.slow def test_banklist_header(self): from pandas.io.html import _remove_whitespace @@ -562,7 +562,7 @@ def try_remove_ws(x): coerce=True) tm.assert_frame_equal(converted, gtnew) - @tm.slow + @pytest.mark.slow def test_gold_canyon(self): gc = 'Gold Canyon' with open(self.banklist_data, 'r') as f: @@ -855,7 +855,7 @@ def test_works_on_valid_markup(self): assert isinstance(dfs, list) assert isinstance(dfs[0], DataFrame) - @tm.slow + @pytest.mark.slow def test_fallback_success(self): _skip_if_none_of(('bs4', 
'html5lib')) banklist_data = os.path.join(DATA_PATH, 'banklist.html') @@ -898,7 +898,7 @@ def get_elements_from_file(url, element='table'): return soup.find_all(element) -@tm.slow +@pytest.mark.slow def test_bs4_finds_tables(): filepath = os.path.join(DATA_PATH, "spam.html") with warnings.catch_warnings(): @@ -913,13 +913,13 @@ def get_lxml_elements(url, element): return doc.xpath('.//{0}'.format(element)) -@tm.slow +@pytest.mark.slow def test_lxml_finds_tables(): filepath = os.path.join(DATA_PATH, "spam.html") assert get_lxml_elements(filepath, 'table') -@tm.slow +@pytest.mark.slow def test_lxml_finds_tbody(): filepath = os.path.join(DATA_PATH, "spam.html") assert get_lxml_elements(filepath, 'tbody') diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index ce8fb7a57c912..8fe119d28644c 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -8,7 +8,6 @@ from pandas import Series, DataFrame, MultiIndex from pandas.compat import range, lzip import pandas.util.testing as tm -from pandas.util.testing import slow import numpy as np from numpy import random @@ -35,7 +34,7 @@ def _skip_if_mpl_14_or_dev_boxplot(): class TestDataFramePlots(TestPlotBase): - @slow + @pytest.mark.slow def test_boxplot_legacy(self): df = DataFrame(randn(6, 4), index=list(string.ascii_letters[:6]), @@ -93,13 +92,13 @@ def test_boxplot_legacy(self): lines = list(itertools.chain.from_iterable(d.values())) assert len(ax.get_lines()) == len(lines) - @slow + @pytest.mark.slow def test_boxplot_return_type_none(self): # GH 12216; return_type=None & by=None -> axes result = self.hist_df.boxplot() assert isinstance(result, self.plt.Axes) - @slow + @pytest.mark.slow def test_boxplot_return_type_legacy(self): # API change in https://github.com/pandas-dev/pandas/pull/7096 import matplotlib as mpl # noqa @@ -125,7 +124,7 @@ def test_boxplot_return_type_legacy(self): result = df.boxplot(return_type='both') self._check_box_return_type(result, 'both') - @slow + @pytest.mark.slow def test_boxplot_axis_limits(self): def _check_ax_limits(col, ax): @@ -153,14 +152,14 @@ def _check_ax_limits(col, ax): assert age_ax._sharey == height_ax assert dummy_ax._sharey is None - @slow + @pytest.mark.slow def test_boxplot_empty_column(self): _skip_if_mpl_14_or_dev_boxplot() df = DataFrame(np.random.randn(20, 4)) df.loc[:, 0] = np.nan _check_plot_works(df.boxplot, return_type='axes') - @slow + @pytest.mark.slow def test_figsize(self): df = DataFrame(np.random.rand(10, 5), columns=['A', 'B', 'C', 'D', 'E']) @@ -176,7 +175,7 @@ def test_fontsize(self): class TestDataFrameGroupByPlots(TestPlotBase): - @slow + @pytest.mark.slow def test_boxplot_legacy(self): grouped = self.hist_df.groupby(by='gender') with tm.assert_produces_warning(UserWarning): @@ -206,7 +205,7 @@ def test_boxplot_legacy(self): return_type='axes') self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) - @slow + @pytest.mark.slow def test_grouped_plot_fignums(self): n = 10 weight = Series(np.random.normal(166, 20, size=n)) @@ -230,7 +229,7 @@ def test_grouped_plot_fignums(self): res = df.groupby('gender').hist() tm.close() - @slow + @pytest.mark.slow def test_grouped_box_return_type(self): df = self.hist_df @@ -267,7 +266,7 @@ def test_grouped_box_return_type(self): returned = df2.boxplot(by='category', return_type=t) self._check_box_return_type(returned, t, expected_keys=columns2) - @slow + @pytest.mark.slow def test_grouped_box_layout(self): df = self.hist_df @@ -341,7 
+340,7 @@ def test_grouped_box_layout(self): return_type='dict') self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(1, 3)) - @slow + @pytest.mark.slow def test_grouped_box_multiple_axes(self): # GH 6970, GH 7069 df = self.hist_df diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 0cff365be3ec8..e9c7d806fd65d 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -14,7 +14,7 @@ from pandas.core.indexes.period import period_range, Period, PeriodIndex from pandas.core.resample import DatetimeIndex -from pandas.util.testing import assert_series_equal, ensure_clean, slow +from pandas.util.testing import assert_series_equal, ensure_clean import pandas.util.testing as tm from pandas.tests.plotting.common import (TestPlotBase, @@ -45,7 +45,7 @@ def setup_method(self, method): def teardown_method(self, method): tm.close() - @slow + @pytest.mark.slow def test_ts_plot_with_tz(self): # GH2877 index = date_range('1/1/2011', periods=2, freq='H', @@ -61,7 +61,7 @@ def test_fontsize_set_correctly(self): for label in (ax.get_xticklabels() + ax.get_yticklabels()): assert label.get_fontsize() == 2 - @slow + @pytest.mark.slow def test_frame_inferred(self): # inferred freq idx = date_range('1/1/1987', freq='MS', periods=100) @@ -99,7 +99,7 @@ def test_nonnumeric_exclude(self): pytest.raises(TypeError, df['A'].plot) - @slow + @pytest.mark.slow def test_tsplot(self): from pandas.tseries.plotting import tsplot @@ -133,7 +133,7 @@ def test_both_style_and_color(self): s = ts.reset_index(drop=True) pytest.raises(ValueError, s.plot, style='b-', color='#000099') - @slow + @pytest.mark.slow def test_high_freq(self): freaks = ['ms', 'us'] for freq in freaks: @@ -151,7 +151,7 @@ def test_get_datevalue(self): assert (get_datevalue('1/1/1987', 'D') == Period('1987-1-1', 'D').ordinal) - @slow + @pytest.mark.slow def test_ts_plot_format_coord(self): def check_format_of_first_point(ax, expected_string): first_line = ax.get_lines()[0] @@ -185,28 +185,28 @@ def check_format_of_first_point(ax, expected_string): tsplot(daily, self.plt.Axes.plot, ax=ax) check_format_of_first_point(ax, 't = 2014-01-01 y = 1.000000') - @slow + @pytest.mark.slow def test_line_plot_period_series(self): for s in self.period_ser: _check_plot_works(s.plot, s.index.freq) - @slow + @pytest.mark.slow def test_line_plot_datetime_series(self): for s in self.datetime_ser: _check_plot_works(s.plot, s.index.freq.rule_code) - @slow + @pytest.mark.slow def test_line_plot_period_frame(self): for df in self.period_df: _check_plot_works(df.plot, df.index.freq) - @slow + @pytest.mark.slow def test_line_plot_datetime_frame(self): for df in self.datetime_df: freq = df.index.to_period(df.index.freq.rule_code).freq _check_plot_works(df.plot, freq) - @slow + @pytest.mark.slow def test_line_plot_inferred_freq(self): for ser in self.datetime_ser: ser = Series(ser.values, Index(np.asarray(ser.index))) @@ -223,7 +223,7 @@ def test_fake_inferred_business(self): ts.plot(ax=ax) assert not hasattr(ax, 'freq') - @slow + @pytest.mark.slow def test_plot_offset_freq(self): ser = tm.makeTimeSeries() _check_plot_works(ser.plot) @@ -232,14 +232,14 @@ def test_plot_offset_freq(self): ser = Series(np.random.randn(len(dr)), dr) _check_plot_works(ser.plot) - @slow + @pytest.mark.slow def test_plot_multiple_inferred_freq(self): dr = Index([datetime(2000, 1, 1), datetime(2000, 1, 6), datetime( 2000, 1, 11)]) ser = Series(np.random.randn(len(dr)), dr) 
_check_plot_works(ser.plot) - @slow + @pytest.mark.slow def test_uhf(self): import pandas.plotting._converter as conv idx = date_range('2012-6-22 21:59:51.960928', freq='L', periods=500) @@ -257,7 +257,7 @@ def test_uhf(self): if len(rs): assert xp == rs - @slow + @pytest.mark.slow def test_irreg_hf(self): idx = date_range('2012-6-22 21:59:51', freq='S', periods=100) df = DataFrame(np.random.randn(len(idx), 2), idx) @@ -297,7 +297,7 @@ def test_business_freq(self): idx = ax.get_lines()[0].get_xdata() assert PeriodIndex(data=idx).freqstr == 'B' - @slow + @pytest.mark.slow def test_business_freq_convert(self): n = tm.N tm.N = 300 @@ -327,7 +327,7 @@ def test_dataframe(self): idx = ax.get_lines()[0].get_xdata() tm.assert_index_equal(bts.index.to_period(), PeriodIndex(idx)) - @slow + @pytest.mark.slow def test_axis_limits(self): def _test(ax): @@ -384,7 +384,7 @@ def test_get_finder(self): assert conv.get_finder('A') == conv._annual_finder assert conv.get_finder('W') == conv._daily_finder - @slow + @pytest.mark.slow def test_finder_daily(self): xp = Period('1999-1-1', freq='B').ordinal day_lst = [10, 40, 252, 400, 950, 2750, 10000] @@ -402,7 +402,7 @@ def test_finder_daily(self): assert xp == rs self.plt.close(ax.get_figure()) - @slow + @pytest.mark.slow def test_finder_quarterly(self): xp = Period('1988Q1').ordinal yrs = [3.5, 11] @@ -420,7 +420,7 @@ def test_finder_quarterly(self): assert xp == rs self.plt.close(ax.get_figure()) - @slow + @pytest.mark.slow def test_finder_monthly(self): xp = Period('Jan 1988').ordinal yrs = [1.15, 2.5, 4, 11] @@ -448,7 +448,7 @@ def test_finder_monthly_long(self): xp = Period('1989Q1', 'M').ordinal assert rs == xp - @slow + @pytest.mark.slow def test_finder_annual(self): xp = [1987, 1988, 1990, 1990, 1995, 2020, 2070, 2170] for i, nyears in enumerate([5, 10, 19, 49, 99, 199, 599, 1001]): @@ -461,7 +461,7 @@ def test_finder_annual(self): assert rs == Period(xp[i], freq='A').ordinal self.plt.close(ax.get_figure()) - @slow + @pytest.mark.slow def test_finder_minutely(self): nminutes = 50 * 24 * 60 rng = date_range('1/1/1999', freq='Min', periods=nminutes) @@ -484,7 +484,7 @@ def test_finder_hourly(self): xp = Period('1/1/1999', freq='H').ordinal assert rs == xp - @slow + @pytest.mark.slow def test_gaps(self): ts = tm.makeTimeSeries() ts[5:25] = np.nan @@ -529,7 +529,7 @@ def test_gaps(self): mask = data.mask assert mask[2:5, 1].all() - @slow + @pytest.mark.slow def test_gap_upsample(self): low = tm.makeTimeSeries() low[5:25] = np.nan @@ -551,7 +551,7 @@ def test_gap_upsample(self): mask = data.mask assert mask[5:25, 1].all() - @slow + @pytest.mark.slow def test_secondary_y(self): ser = Series(np.random.randn(10)) ser2 = Series(np.random.randn(10)) @@ -581,7 +581,7 @@ def test_secondary_y(self): assert hasattr(ax2, 'left_ax') assert not hasattr(ax2, 'right_ax') - @slow + @pytest.mark.slow def test_secondary_y_ts(self): idx = date_range('1/1/2000', periods=10) ser = Series(np.random.randn(10), idx) @@ -608,7 +608,7 @@ def test_secondary_y_ts(self): ax2 = ser.plot(secondary_y=True) assert ax.get_yaxis().get_visible() - @slow + @pytest.mark.slow def test_secondary_kde(self): tm._skip_if_no_scipy() _skip_if_no_scipy_gaussian_kde() @@ -621,7 +621,7 @@ def test_secondary_kde(self): axes = fig.get_axes() assert axes[1].get_yaxis().get_ticks_position() == 'right' - @slow + @pytest.mark.slow def test_secondary_bar(self): ser = Series(np.random.randn(10)) fig, ax = self.plt.subplots() @@ -629,7 +629,7 @@ def test_secondary_bar(self): axes = fig.get_axes() assert 
axes[1].get_yaxis().get_ticks_position() == 'right' - @slow + @pytest.mark.slow def test_secondary_frame(self): df = DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c']) axes = df.plot(secondary_y=['a', 'c'], subplots=True) @@ -638,7 +638,7 @@ def test_secondary_frame(self): self.default_tick_position) assert axes[2].get_yaxis().get_ticks_position() == 'right' - @slow + @pytest.mark.slow def test_secondary_bar_frame(self): df = DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c']) axes = df.plot(kind='bar', secondary_y=['a', 'c'], subplots=True) @@ -666,7 +666,7 @@ def test_mixed_freq_regular_first(self): assert left == pidx[0].ordinal assert right == pidx[-1].ordinal - @slow + @pytest.mark.slow def test_mixed_freq_irregular_first(self): s1 = tm.makeTimeSeries() s2 = s1[[0, 5, 10, 11, 12, 13, 14, 15]] @@ -697,7 +697,7 @@ def test_mixed_freq_regular_first_df(self): assert left == pidx[0].ordinal assert right == pidx[-1].ordinal - @slow + @pytest.mark.slow def test_mixed_freq_irregular_first_df(self): # GH 9852 s1 = tm.makeTimeSeries().to_frame() @@ -723,7 +723,7 @@ def test_mixed_freq_hf_first(self): for l in ax.get_lines(): assert PeriodIndex(data=l.get_xdata()).freq == 'D' - @slow + @pytest.mark.slow def test_mixed_freq_alignment(self): ts_ind = date_range('2012-01-01 13:00', '2012-01-02', freq='H') ts_data = np.random.randn(12) @@ -737,7 +737,7 @@ def test_mixed_freq_alignment(self): assert ax.lines[0].get_xdata()[0] == ax.lines[1].get_xdata()[0] - @slow + @pytest.mark.slow def test_mixed_freq_lf_first(self): idxh = date_range('1/1/1999', periods=365, freq='D') @@ -819,7 +819,7 @@ def test_nat_handling(self): assert s.index.min() <= Series(xdata).min() assert Series(xdata).max() <= s.index.max() - @slow + @pytest.mark.slow def test_to_weekly_resampling(self): idxh = date_range('1/1/1999', periods=52, freq='W') idxl = date_range('1/1/1999', periods=12, freq='M') @@ -840,7 +840,7 @@ def test_to_weekly_resampling(self): for l in lines: assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq - @slow + @pytest.mark.slow def test_from_weekly_resampling(self): idxh = date_range('1/1/1999', periods=52, freq='W') idxl = date_range('1/1/1999', periods=12, freq='M') @@ -876,7 +876,7 @@ def test_from_weekly_resampling(self): else: tm.assert_numpy_array_equal(xdata, expected_h) - @slow + @pytest.mark.slow def test_from_resampling_area_line_mixed(self): idxh = date_range('1/1/1999', periods=52, freq='W') idxl = date_range('1/1/1999', periods=12, freq='M') @@ -950,7 +950,7 @@ def test_from_resampling_area_line_mixed(self): tm.assert_numpy_array_equal(l.get_ydata(orig=False), expected_y) - @slow + @pytest.mark.slow def test_mixed_freq_second_millisecond(self): # GH 7772, GH 7760 idxh = date_range('2014-07-01 09:00', freq='S', periods=50) @@ -974,7 +974,7 @@ def test_mixed_freq_second_millisecond(self): for l in ax.get_lines(): assert PeriodIndex(data=l.get_xdata()).freq == 'L' - @slow + @pytest.mark.slow def test_irreg_dtypes(self): # date idx = [date(2000, 1, 1), date(2000, 1, 5), date(2000, 1, 20)] @@ -988,7 +988,7 @@ def test_irreg_dtypes(self): _, ax = self.plt.subplots() _check_plot_works(df.plot, ax=ax) - @slow + @pytest.mark.slow def test_time(self): t = datetime(1, 1, 1, 3, 30, 0) deltas = np.random.randint(1, 20, 3).cumsum() @@ -1024,7 +1024,7 @@ def test_time(self): rs = time(h, m, s).strftime('%H:%M:%S') assert xp == rs - @slow + @pytest.mark.slow def test_time_musec(self): t = datetime(1, 1, 1, 3, 30, 0) deltas = np.random.randint(1, 20, 3).cumsum() @@ -1051,7 +1051,7 @@ def 
test_time_musec(self): rs = time(h, m, s).strftime('%H:%M:%S.%f') assert xp == rs - @slow + @pytest.mark.slow def test_secondary_upsample(self): idxh = date_range('1/1/1999', periods=365, freq='D') idxl = date_range('1/1/1999', periods=12, freq='M') @@ -1067,7 +1067,7 @@ def test_secondary_upsample(self): for l in ax.left_ax.get_lines(): assert PeriodIndex(l.get_xdata()).freq == 'D' - @slow + @pytest.mark.slow def test_secondary_legend(self): fig = self.plt.figure() ax = fig.add_subplot(211) @@ -1169,7 +1169,7 @@ def test_format_date_axis(self): if len(l.get_text()) > 0: assert l.get_rotation() == 30 - @slow + @pytest.mark.slow def test_ax_plot(self): x = DatetimeIndex(start='2012-01-02', periods=10, freq='D') y = lrange(len(x)) @@ -1177,7 +1177,7 @@ def test_ax_plot(self): lines = ax.plot(x, y, label='Y') tm.assert_index_equal(DatetimeIndex(lines[0].get_xdata()), x) - @slow + @pytest.mark.slow def test_mpl_nopandas(self): dates = [date(2008, 12, 31), date(2009, 1, 31)] values1 = np.arange(10.0, 11.0, 0.5) @@ -1196,7 +1196,7 @@ def test_mpl_nopandas(self): exp = np.array([x.toordinal() for x in dates], dtype=np.float64) tm.assert_numpy_array_equal(line2.get_xydata()[:, 0], exp) - @slow + @pytest.mark.slow def test_irregular_ts_shared_ax_xlim(self): # GH 2960 ts = tm.makeTimeSeries()[:20] @@ -1212,7 +1212,7 @@ def test_irregular_ts_shared_ax_xlim(self): assert left == ts_irregular.index.min().toordinal() assert right == ts_irregular.index.max().toordinal() - @slow + @pytest.mark.slow def test_secondary_y_non_ts_xlim(self): # GH 3490 - non-timeseries with secondary y index_1 = [1, 2, 3, 4] @@ -1229,7 +1229,7 @@ def test_secondary_y_non_ts_xlim(self): assert left_before == left_after assert right_before < right_after - @slow + @pytest.mark.slow def test_secondary_y_regular_ts_xlim(self): # GH 3490 - regular-timeseries with secondary y index_1 = date_range(start='2000-01-01', periods=4, freq='D') @@ -1246,7 +1246,7 @@ def test_secondary_y_regular_ts_xlim(self): assert left_before == left_after assert right_before < right_after - @slow + @pytest.mark.slow def test_secondary_y_mixed_freq_ts_xlim(self): # GH 3490 - mixed frequency timeseries with secondary y rng = date_range('2000-01-01', periods=10000, freq='min') @@ -1262,7 +1262,7 @@ def test_secondary_y_mixed_freq_ts_xlim(self): assert left_before == left_after assert right_before == right_after - @slow + @pytest.mark.slow def test_secondary_y_irregular_ts_xlim(self): # GH 3490 - irregular-timeseries with secondary y ts = tm.makeTimeSeries()[:20] @@ -1361,7 +1361,7 @@ def test_hist(self): _, ax = self.plt.subplots() ax.hist([x, x], weights=[w1, w2]) - @slow + @pytest.mark.slow def test_overlapping_datetime(self): # GB 6608 s1 = Series([1, 2, 3], index=[datetime(1995, 12, 31), diff --git a/pandas/tests/plotting/test_deprecated.py b/pandas/tests/plotting/test_deprecated.py index ca03bcb060e25..970de6ff881ab 100644 --- a/pandas/tests/plotting/test_deprecated.py +++ b/pandas/tests/plotting/test_deprecated.py @@ -4,7 +4,7 @@ import pandas as pd import pandas.util.testing as tm -from pandas.util.testing import slow +import pytest from numpy.random import randn @@ -23,7 +23,7 @@ class TestDeprecatedNameSpace(TestPlotBase): - @slow + @pytest.mark.slow def test_scatter_plot_legacy(self): tm._skip_if_no_scipy() @@ -35,7 +35,7 @@ def test_scatter_plot_legacy(self): with tm.assert_produces_warning(FutureWarning): pd.scatter_matrix(df) - @slow + @pytest.mark.slow def test_boxplot_deprecated(self): df = pd.DataFrame(randn(6, 4), 
index=list(string.ascii_letters[:6]), @@ -46,13 +46,13 @@ def test_boxplot_deprecated(self): plotting.boxplot(df, column=['one', 'two'], by='indic') - @slow + @pytest.mark.slow def test_radviz_deprecated(self): df = self.iris with tm.assert_produces_warning(FutureWarning): plotting.radviz(frame=df, class_column='Name') - @slow + @pytest.mark.slow def test_plot_params(self): with tm.assert_produces_warning(FutureWarning): diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 352c03582db93..7878740f64e55 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -15,7 +15,6 @@ from pandas.compat import range, lrange, lmap, lzip, u, zip, PY3 from pandas.io.formats.printing import pprint_thing import pandas.util.testing as tm -from pandas.util.testing import slow import numpy as np from numpy.random import rand, randn @@ -41,7 +40,7 @@ def setup_method(self, method): "C": np.arange(20) + np.random.uniform( size=20)}) - @slow + @pytest.mark.slow def test_plot(self): df = self.tdf _check_plot_works(df.plot, grid=False) @@ -188,13 +187,13 @@ def test_nonnumeric_exclude(self): ax = df.plot() assert len(ax.get_lines()) == 1 # B was plotted - @slow + @pytest.mark.slow def test_implicit_label(self): df = DataFrame(randn(10, 3), columns=['a', 'b', 'c']) ax = df.plot(x='a', y='b') self._check_text_labels(ax.xaxis.get_label(), 'a') - @slow + @pytest.mark.slow def test_donot_overwrite_index_name(self): # GH 8494 df = DataFrame(randn(2, 2), columns=['a', 'b']) @@ -202,7 +201,7 @@ def test_donot_overwrite_index_name(self): df.plot(y='b', label='LABEL') assert df.index.name == 'NAME' - @slow + @pytest.mark.slow def test_plot_xy(self): # columns.inferred_type == 'string' df = self.tdf @@ -228,7 +227,7 @@ def test_plot_xy(self): # columns.inferred_type == 'mixed' # TODO add MultiIndex test - @slow + @pytest.mark.slow def test_logscales(self): df = DataFrame({'a': np.arange(100)}, index=np.arange(100)) ax = df.plot(logy=True) @@ -240,7 +239,7 @@ def test_logscales(self): ax = df.plot(loglog=True) self._check_ax_scales(ax, xaxis='log', yaxis='log') - @slow + @pytest.mark.slow def test_xcompat(self): import pandas as pd @@ -305,7 +304,7 @@ def test_unsorted_index(self): rs = Series(rs[:, 1], rs[:, 0], dtype=np.int64, name='y') tm.assert_series_equal(rs, df.y) - @slow + @pytest.mark.slow def test_subplots(self): df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) @@ -345,7 +344,7 @@ def test_subplots(self): for ax in axes: assert ax.get_legend() is None - @slow + @pytest.mark.slow def test_subplots_timeseries(self): idx = date_range(start='2014-07-01', freq='M', periods=10) df = DataFrame(np.random.rand(10, 3), index=idx) @@ -381,7 +380,7 @@ def test_subplots_timeseries(self): self._check_ticks_props(ax, xlabelsize=7, xrot=45, ylabelsize=7) - @slow + @pytest.mark.slow def test_subplots_layout(self): # GH 6667 df = DataFrame(np.random.rand(10, 3), @@ -427,7 +426,7 @@ def test_subplots_layout(self): self._check_axes_shape(axes, axes_num=1, layout=(3, 3)) assert axes.shape == (3, 3) - @slow + @pytest.mark.slow def test_subplots_warnings(self): # GH 9464 warnings.simplefilter('error') @@ -442,7 +441,7 @@ def test_subplots_warnings(self): self.fail(w) warnings.simplefilter('default') - @slow + @pytest.mark.slow def test_subplots_multiple_axes(self): # GH 5353, 6970, GH 7069 fig, axes = self.plt.subplots(2, 3) @@ -543,7 +542,7 @@ def test_subplots_sharex_axes_existing_axes(self): for ax in axes.ravel(): 
self._check_visible(ax.get_yticklabels(), visible=True) - @slow + @pytest.mark.slow def test_subplots_dup_columns(self): # GH 10962 df = DataFrame(np.random.rand(5, 5), columns=list('aaaaa')) @@ -697,7 +696,7 @@ def test_area_lim(self): ymin, ymax = ax.get_ylim() assert ymax == 0 - @slow + @pytest.mark.slow def test_bar_colors(self): import matplotlib.pyplot as plt default_colors = self._maybe_unpack_cycler(plt.rcParams) @@ -733,7 +732,7 @@ def test_bar_colors(self): self._check_colors(ax.patches[::5], facecolors=['green'] * 5) tm.close() - @slow + @pytest.mark.slow def test_bar_linewidth(self): df = DataFrame(randn(5, 5)) @@ -754,7 +753,7 @@ def test_bar_linewidth(self): for r in ax.patches: assert r.get_linewidth() == 2 - @slow + @pytest.mark.slow def test_bar_barwidth(self): df = DataFrame(randn(5, 5)) @@ -792,7 +791,7 @@ def test_bar_barwidth(self): for r in ax.patches: assert r.get_height() == width - @slow + @pytest.mark.slow def test_bar_barwidth_position(self): df = DataFrame(randn(5, 5)) self._check_bar_alignment(df, kind='bar', stacked=False, width=0.9, @@ -808,7 +807,7 @@ def test_bar_barwidth_position(self): self._check_bar_alignment(df, kind='barh', subplots=True, width=0.9, position=0.2) - @slow + @pytest.mark.slow def test_bar_barwidth_position_int(self): # GH 12979 df = DataFrame(randn(5, 5)) @@ -828,7 +827,7 @@ def test_bar_barwidth_position_int(self): self._check_bar_alignment(df, kind='bar', subplots=True, width=1) self._check_bar_alignment(df, kind='barh', subplots=True, width=1) - @slow + @pytest.mark.slow def test_bar_bottom_left(self): df = DataFrame(rand(5, 5)) ax = df.plot.bar(stacked=False, bottom=1) @@ -857,7 +856,7 @@ def test_bar_bottom_left(self): result = [p.get_x() for p in ax.patches] assert result == [1] * 5 - @slow + @pytest.mark.slow def test_bar_nan(self): df = DataFrame({'A': [10, np.nan, 20], 'B': [5, 10, 20], @@ -875,7 +874,7 @@ def test_bar_nan(self): expected = [0.0, 0.0, 0.0, 10.0, 0.0, 20.0, 15.0, 10.0, 40.0] assert result == expected - @slow + @pytest.mark.slow def test_bar_categorical(self): # GH 13019 df1 = pd.DataFrame(np.random.randn(6, 5), @@ -901,7 +900,7 @@ def test_bar_categorical(self): assert ax.patches[0].get_x() == -0.25 assert ax.patches[-1].get_x() == 4.75 - @slow + @pytest.mark.slow def test_plot_scatter(self): df = DataFrame(randn(6, 4), index=list(string.ascii_letters[:6]), @@ -919,7 +918,7 @@ def test_plot_scatter(self): axes = df.plot(x='x', y='y', kind='scatter', subplots=True) self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) - @slow + @pytest.mark.slow def test_plot_scatter_with_categorical_data(self): # GH 16199 df = pd.DataFrame({'x': [1, 2, 3, 4], @@ -937,7 +936,7 @@ def test_plot_scatter_with_categorical_data(self): df.plot(x='y', y='y', kind='scatter') ve.match('requires x column to be numeric') - @slow + @pytest.mark.slow def test_plot_scatter_with_c(self): df = DataFrame(randn(6, 4), index=list(string.ascii_letters[:6]), @@ -1007,7 +1006,7 @@ def test_scatter_colors(self): tm.assert_numpy_array_equal(ax.collections[0].get_facecolor()[0], np.array([1, 1, 1, 1], dtype=np.float64)) - @slow + @pytest.mark.slow def test_plot_bar(self): df = DataFrame(randn(6, 4), index=list(string.ascii_letters[:6]), @@ -1098,7 +1097,7 @@ def _check_bar_alignment(self, df, kind='bar', stacked=False, return axes - @slow + @pytest.mark.slow def test_bar_stacked_center(self): # GH2157 df = DataFrame({'A': [3] * 5, 'B': lrange(5)}, index=lrange(5)) @@ -1107,7 +1106,7 @@ def test_bar_stacked_center(self): self._check_bar_alignment(df, 
kind='barh', stacked=True) self._check_bar_alignment(df, kind='barh', stacked=True, width=0.9) - @slow + @pytest.mark.slow def test_bar_center(self): df = DataFrame({'A': [3] * 5, 'B': lrange(5)}, index=lrange(5)) self._check_bar_alignment(df, kind='bar', stacked=False) @@ -1115,7 +1114,7 @@ def test_bar_center(self): self._check_bar_alignment(df, kind='barh', stacked=False) self._check_bar_alignment(df, kind='barh', stacked=False, width=0.9) - @slow + @pytest.mark.slow def test_bar_subplots_center(self): df = DataFrame({'A': [3] * 5, 'B': lrange(5)}, index=lrange(5)) self._check_bar_alignment(df, kind='bar', subplots=True) @@ -1123,7 +1122,7 @@ def test_bar_subplots_center(self): self._check_bar_alignment(df, kind='barh', subplots=True) self._check_bar_alignment(df, kind='barh', subplots=True, width=0.9) - @slow + @pytest.mark.slow def test_bar_align_single_column(self): df = DataFrame(randn(5)) self._check_bar_alignment(df, kind='bar', stacked=False) @@ -1133,7 +1132,7 @@ def test_bar_align_single_column(self): self._check_bar_alignment(df, kind='bar', subplots=True) self._check_bar_alignment(df, kind='barh', subplots=True) - @slow + @pytest.mark.slow def test_bar_edge(self): df = DataFrame({'A': [3] * 5, 'B': lrange(5)}, index=lrange(5)) @@ -1158,7 +1157,7 @@ def test_bar_edge(self): self._check_bar_alignment(df, kind='barh', subplots=True, width=0.9, align='edge') - @slow + @pytest.mark.slow def test_bar_log_no_subplots(self): # GH3254, GH3298 matplotlib/matplotlib#1882, #1892 # regressions in 1.2.1 @@ -1172,7 +1171,7 @@ def test_bar_log_no_subplots(self): ax = df.plot.bar(grid=True, log=True) tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) - @slow + @pytest.mark.slow def test_bar_log_subplots(self): expected = np.array([1., 10., 100., 1000.]) if not self.mpl_le_1_2_1: @@ -1184,7 +1183,7 @@ def test_bar_log_subplots(self): tm.assert_numpy_array_equal(ax[0].yaxis.get_ticklocs(), expected) tm.assert_numpy_array_equal(ax[1].yaxis.get_ticklocs(), expected) - @slow + @pytest.mark.slow def test_boxplot(self): df = self.hist_df series = df['height'] @@ -1222,7 +1221,7 @@ def test_boxplot(self): tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), positions) assert len(ax.lines) == self.bp_n_objects * len(numeric_cols) - @slow + @pytest.mark.slow def test_boxplot_vertical(self): df = self.hist_df numeric_cols = df._get_numeric_data().columns @@ -1250,7 +1249,7 @@ def test_boxplot_vertical(self): tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), positions) assert len(ax.lines) == self.bp_n_objects * len(numeric_cols) - @slow + @pytest.mark.slow def test_boxplot_return_type(self): df = DataFrame(randn(6, 4), index=list(string.ascii_letters[:6]), @@ -1270,7 +1269,7 @@ def test_boxplot_return_type(self): result = df.plot.box(return_type='both') self._check_box_return_type(result, 'both') - @slow + @pytest.mark.slow def test_boxplot_subplots_return_type(self): df = self.hist_df @@ -1287,7 +1286,7 @@ def test_boxplot_subplots_return_type(self): expected_keys=['height', 'weight', 'category'], check_ax_title=False) - @slow + @pytest.mark.slow def test_kde_df(self): tm._skip_if_no_scipy() _skip_if_no_scipy_gaussian_kde() @@ -1308,7 +1307,7 @@ def test_kde_df(self): axes = df.plot(kind='kde', logy=True, subplots=True) self._check_ax_scales(axes, yaxis='log') - @slow + @pytest.mark.slow def test_kde_missing_vals(self): tm._skip_if_no_scipy() _skip_if_no_scipy_gaussian_kde() @@ -1316,7 +1315,7 @@ def test_kde_missing_vals(self): df.loc[0, 0] = np.nan _check_plot_works(df.plot, 
kind='kde') - @slow + @pytest.mark.slow def test_hist_df(self): from matplotlib.patches import Rectangle if self.mpl_le_1_2_1: @@ -1376,7 +1375,7 @@ def _check_box_coord(self, patches, expected_y=None, expected_h=None, tm.assert_numpy_array_equal(result_width, expected_w, check_dtype=False) - @slow + @pytest.mark.slow def test_hist_df_coord(self): normal_df = DataFrame({'A': np.repeat(np.array([1, 2, 3, 4, 5]), np.array([10, 9, 8, 7, 6])), @@ -1467,12 +1466,12 @@ def test_hist_df_coord(self): expected_x=np.array([0, 0, 0, 0, 0]), expected_w=np.array([6, 7, 8, 9, 10])) - @slow + @pytest.mark.slow def test_plot_int_columns(self): df = DataFrame(randn(100, 4)).cumsum() _check_plot_works(df.plot, legend=True) - @slow + @pytest.mark.slow def test_df_legend_labels(self): kinds = ['line', 'bar', 'barh', 'kde', 'area', 'hist'] df = DataFrame(rand(3, 3), columns=['a', 'b', 'c']) @@ -1565,7 +1564,7 @@ def test_legend_name(self): leg_title = ax.legend_.get_title() self._check_text_labels(leg_title, 'new') - @slow + @pytest.mark.slow def test_no_legend(self): kinds = ['line', 'bar', 'barh', 'kde', 'area', 'hist'] df = DataFrame(rand(3, 3), columns=['a', 'b', 'c']) @@ -1577,7 +1576,7 @@ def test_no_legend(self): ax = df.plot(kind=kind, legend=False) self._check_legend_labels(ax, visible=False) - @slow + @pytest.mark.slow def test_style_by_column(self): import matplotlib.pyplot as plt fig = plt.gcf() @@ -1593,7 +1592,7 @@ def test_style_by_column(self): for i, l in enumerate(ax.get_lines()[:len(markers)]): assert l.get_marker() == markers[i] - @slow + @pytest.mark.slow def test_line_label_none(self): s = Series([1, 2]) ax = s.plot() @@ -1602,7 +1601,7 @@ def test_line_label_none(self): ax = s.plot(legend=True) assert ax.get_legend().get_texts()[0].get_text() == 'None' - @slow + @pytest.mark.slow @tm.capture_stdout def test_line_colors(self): from matplotlib import cm @@ -1654,13 +1653,13 @@ def test_line_colors(self): # Forced show plot _check_plot_works(df.plot, color=custom_colors) - @slow + @pytest.mark.slow def test_dont_modify_colors(self): colors = ['r', 'g', 'b'] pd.DataFrame(np.random.rand(10, 2)).plot(color=colors) assert len(colors) == 3 - @slow + @pytest.mark.slow def test_line_colors_and_styles_subplots(self): # GH 9894 from matplotlib import cm @@ -1738,7 +1737,7 @@ def test_line_colors_and_styles_subplots(self): self._check_colors(ax.get_lines(), linecolors=[c]) tm.close() - @slow + @pytest.mark.slow def test_area_colors(self): from matplotlib import cm from matplotlib.collections import PolyCollection @@ -1798,7 +1797,7 @@ def test_area_colors(self): for h in handles: assert h.get_alpha() == 0.5 - @slow + @pytest.mark.slow def test_hist_colors(self): default_colors = self._maybe_unpack_cycler(self.plt.rcParams) @@ -1832,7 +1831,7 @@ def test_hist_colors(self): self._check_colors(ax.patches[::10], facecolors=['green'] * 5) tm.close() - @slow + @pytest.mark.slow def test_kde_colors(self): tm._skip_if_no_scipy() _skip_if_no_scipy_gaussian_kde() @@ -1855,7 +1854,7 @@ def test_kde_colors(self): rgba_colors = lmap(cm.jet, np.linspace(0, 1, len(df))) self._check_colors(ax.get_lines(), linecolors=rgba_colors) - @slow + @pytest.mark.slow def test_kde_colors_and_styles_subplots(self): tm._skip_if_no_scipy() _skip_if_no_scipy_gaussian_kde() @@ -1914,7 +1913,7 @@ def test_kde_colors_and_styles_subplots(self): self._check_colors(ax.get_lines(), linecolors=[c]) tm.close() - @slow + @pytest.mark.slow def test_boxplot_colors(self): def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c='k', 
fliers_c=None): @@ -2025,7 +2024,7 @@ def test_all_invalid_plot_data(self): with pytest.raises(TypeError): df.plot(kind=kind) - @slow + @pytest.mark.slow def test_partially_invalid_plot_data(self): with tm.RNGContext(42): df = DataFrame(randn(10, 2), dtype=object) @@ -2050,7 +2049,7 @@ def test_invalid_kind(self): with pytest.raises(ValueError): df.plot(kind='aasdf') - @slow + @pytest.mark.slow def test_hexbin_basic(self): df = self.hexbin_df @@ -2066,7 +2065,7 @@ def test_hexbin_basic(self): # return value is single axes self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) - @slow + @pytest.mark.slow def test_hexbin_with_c(self): df = self.hexbin_df @@ -2076,7 +2075,7 @@ def test_hexbin_with_c(self): ax = df.plot.hexbin(x='A', y='B', C='C', reduce_C_function=np.std) assert len(ax.collections) == 1 - @slow + @pytest.mark.slow def test_hexbin_cmap(self): df = self.hexbin_df @@ -2088,14 +2087,14 @@ def test_hexbin_cmap(self): ax = df.plot.hexbin(x='A', y='B', colormap=cm) assert ax.collections[0].cmap.name == cm - @slow + @pytest.mark.slow def test_no_color_bar(self): df = self.hexbin_df ax = df.plot.hexbin(x='A', y='B', colorbar=None) assert ax.collections[0].colorbar is None - @slow + @pytest.mark.slow def test_allow_cmap(self): df = self.hexbin_df @@ -2105,7 +2104,7 @@ def test_allow_cmap(self): with pytest.raises(TypeError): df.plot.hexbin(x='A', y='B', cmap='YlGn', colormap='BuGn') - @slow + @pytest.mark.slow def test_pie_df(self): df = DataFrame(np.random.rand(5, 3), columns=['X', 'Y', 'Z'], index=['a', 'b', 'c', 'd', 'e']) @@ -2159,7 +2158,7 @@ def test_pie_df_nan(self): assert ([x.get_text() for x in ax.get_legend().get_texts()] == base_expected[:i] + base_expected[i + 1:]) - @slow + @pytest.mark.slow def test_errorbar_plot(self): d = {'x': np.arange(12), 'y': np.arange(12, 0, -1)} df = DataFrame(d) @@ -2227,7 +2226,7 @@ def test_errorbar_plot(self): with pytest.raises((ValueError, TypeError)): df.plot(yerr=df_err) - @slow + @pytest.mark.slow def test_errorbar_with_integer_column_names(self): # test with integer column names df = DataFrame(np.random.randn(10, 2)) @@ -2237,7 +2236,7 @@ def test_errorbar_with_integer_column_names(self): ax = _check_plot_works(df.plot, y=0, yerr=1) self._check_has_errorbars(ax, xerr=0, yerr=1) - @slow + @pytest.mark.slow def test_errorbar_with_partial_columns(self): df = DataFrame(np.random.randn(10, 3)) df_err = DataFrame(np.random.randn(10, 2), columns=[0, 2]) @@ -2260,7 +2259,7 @@ def test_errorbar_with_partial_columns(self): ax = _check_plot_works(df.plot, yerr=err) self._check_has_errorbars(ax, xerr=0, yerr=1) - @slow + @pytest.mark.slow def test_errorbar_timeseries(self): d = {'x': np.arange(12), 'y': np.arange(12, 0, -1)} @@ -2370,7 +2369,7 @@ def _check_errorbar_color(containers, expected, has_err='has_xerr'): self._check_has_errorbars(ax, xerr=0, yerr=1) _check_errorbar_color(ax.containers, 'green', has_err='has_yerr') - @slow + @pytest.mark.slow def test_sharex_and_ax(self): # https://github.com/pandas-dev/pandas/issues/9737 using gridspec, # the axis in fig.get_axis() are sorted differently than pandas @@ -2422,7 +2421,7 @@ def _check(axes): self._check_visible(ax.get_xticklabels(minor=True), visible=True) tm.close() - @slow + @pytest.mark.slow def test_sharey_and_ax(self): # https://github.com/pandas-dev/pandas/issues/9737 using gridspec, # the axis in fig.get_axis() are sorted differently than pandas @@ -2505,7 +2504,7 @@ def test_memory_leak(self): # need to actually access something to get an error results[key].lines - 
@slow + @pytest.mark.slow def test_df_subplots_patterns_minorticks(self): # GH 10657 import matplotlib.pyplot as plt @@ -2550,7 +2549,7 @@ def test_df_subplots_patterns_minorticks(self): self._check_visible(ax.get_xticklabels(minor=True), visible=True) tm.close() - @slow + @pytest.mark.slow def test_df_gridspec_patterns(self): # GH 10819 import matplotlib.pyplot as plt @@ -2673,7 +2672,7 @@ def _get_boxed_grid(): self._check_visible(ax.get_xticklabels(minor=True), visible=True) tm.close() - @slow + @pytest.mark.slow def test_df_grid_settings(self): # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 self._check_grid_settings( diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index 17a75e5cb287c..5f7b2dd2d6ca9 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -6,7 +6,6 @@ from pandas import Series, DataFrame import pandas.util.testing as tm -from pandas.util.testing import slow import numpy as np from numpy.random import randn @@ -28,7 +27,7 @@ def setup_method(self, method): self.ts = tm.makeTimeSeries() self.ts.name = 'ts' - @slow + @pytest.mark.slow def test_hist_legacy(self): _check_plot_works(self.ts.hist) _check_plot_works(self.ts.hist, grid=False) @@ -52,13 +51,13 @@ def test_hist_legacy(self): with pytest.raises(ValueError): self.ts.hist(by=self.ts.index, figure=fig) - @slow + @pytest.mark.slow def test_hist_bins_legacy(self): df = DataFrame(np.random.randn(10, 2)) ax = df.hist(bins=2)[0][0] assert len(ax.patches) == 2 - @slow + @pytest.mark.slow def test_hist_layout(self): df = self.hist_df with pytest.raises(ValueError): @@ -67,7 +66,7 @@ def test_hist_layout(self): with pytest.raises(ValueError): df.height.hist(layout=[1, 1]) - @slow + @pytest.mark.slow def test_hist_layout_with_by(self): df = self.hist_df @@ -113,7 +112,7 @@ def test_hist_layout_with_by(self): self._check_axes_shape( axes, axes_num=4, layout=(4, 2), figsize=(12, 7)) - @slow + @pytest.mark.slow def test_hist_no_overlap(self): from matplotlib.pyplot import subplot, gcf x = Series(randn(2)) @@ -126,13 +125,13 @@ def test_hist_no_overlap(self): axes = fig.axes if self.mpl_ge_1_5_0 else fig.get_axes() assert len(axes) == 2 - @slow + @pytest.mark.slow def test_hist_by_no_extra_plots(self): df = self.hist_df axes = df.height.hist(by=df.gender) # noqa assert len(self.plt.get_fignums()) == 1 - @slow + @pytest.mark.slow def test_plot_fails_when_ax_differs_from_figure(self): from pylab import figure fig1 = figure() @@ -144,7 +143,7 @@ def test_plot_fails_when_ax_differs_from_figure(self): class TestDataFramePlots(TestPlotBase): - @slow + @pytest.mark.slow def test_hist_df_legacy(self): from matplotlib.patches import Rectangle with tm.assert_produces_warning(UserWarning): @@ -210,7 +209,7 @@ def test_hist_df_legacy(self): with pytest.raises(AttributeError): ser.hist(foo='bar') - @slow + @pytest.mark.slow def test_hist_layout(self): df = DataFrame(randn(100, 3)) @@ -241,7 +240,7 @@ def test_hist_layout(self): with pytest.raises(ValueError): df.hist(layout=(-1, -1)) - @slow + @pytest.mark.slow # GH 9351 def test_tight_layout(self): if self.mpl_ge_2_0_1: @@ -254,7 +253,7 @@ def test_tight_layout(self): class TestDataFrameGroupByPlots(TestPlotBase): - @slow + @pytest.mark.slow def test_grouped_hist_legacy(self): from matplotlib.patches import Rectangle @@ -303,7 +302,7 @@ def test_grouped_hist_legacy(self): with tm.assert_produces_warning(FutureWarning): df.hist(by='C', figsize='default') - @slow + 
@pytest.mark.slow def test_grouped_hist_legacy2(self): n = 10 weight = Series(np.random.normal(166, 20, size=n)) @@ -318,7 +317,7 @@ def test_grouped_hist_legacy2(self): assert len(self.plt.get_fignums()) == 2 tm.close() - @slow + @pytest.mark.slow def test_grouped_hist_layout(self): df = self.hist_df pytest.raises(ValueError, df.hist, column='weight', by=df.gender, @@ -367,7 +366,7 @@ def test_grouped_hist_layout(self): axes = df.hist(column=['height', 'weight', 'category']) self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) - @slow + @pytest.mark.slow def test_grouped_hist_multiple_axes(self): # GH 6970, GH 7069 df = self.hist_df @@ -387,7 +386,7 @@ def test_grouped_hist_multiple_axes(self): # pass different number of axes from required axes = df.hist(column='height', ax=axes) - @slow + @pytest.mark.slow def test_axis_share_x(self): df = self.hist_df # GH4089 @@ -401,7 +400,7 @@ def test_axis_share_x(self): assert not ax1._shared_y_axes.joined(ax1, ax2) assert not ax2._shared_y_axes.joined(ax1, ax2) - @slow + @pytest.mark.slow def test_axis_share_y(self): df = self.hist_df ax1, ax2 = df.hist(column='height', by=df.gender, sharey=True) @@ -414,7 +413,7 @@ def test_axis_share_y(self): assert not ax1._shared_x_axes.joined(ax1, ax2) assert not ax2._shared_x_axes.joined(ax1, ax2) - @slow + @pytest.mark.slow def test_axis_share_xy(self): df = self.hist_df ax1, ax2 = df.hist(column='height', by=df.gender, sharex=True, diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index d93ad90a36a9c..684a943fb5a69 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -7,7 +7,6 @@ from pandas import Series, DataFrame from pandas.compat import lmap import pandas.util.testing as tm -from pandas.util.testing import slow import numpy as np from numpy import random @@ -30,7 +29,7 @@ def setup_method(self, method): self.ts = tm.makeTimeSeries() self.ts.name = 'ts' - @slow + @pytest.mark.slow def test_autocorrelation_plot(self): from pandas.plotting import autocorrelation_plot _check_plot_works(autocorrelation_plot, series=self.ts) @@ -39,13 +38,13 @@ def test_autocorrelation_plot(self): ax = autocorrelation_plot(self.ts, label='Test') self._check_legend_labels(ax, labels=['Test']) - @slow + @pytest.mark.slow def test_lag_plot(self): from pandas.plotting import lag_plot _check_plot_works(lag_plot, series=self.ts) _check_plot_works(lag_plot, series=self.ts, lag=5) - @slow + @pytest.mark.slow def test_bootstrap_plot(self): from pandas.plotting import bootstrap_plot _check_plot_works(bootstrap_plot, series=self.ts, size=10) @@ -53,7 +52,7 @@ def test_bootstrap_plot(self): class TestDataFramePlots(TestPlotBase): - @slow + @pytest.mark.slow def test_scatter_plot_legacy(self): tm._skip_if_no_scipy() @@ -130,7 +129,7 @@ def test_scatter_matrix_axis(self): self._check_ticks_props( axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) - @slow + @pytest.mark.slow def test_andrews_curves(self): from pandas.plotting import andrews_curves from matplotlib import cm @@ -195,7 +194,7 @@ def test_andrews_curves(self): with tm.assert_produces_warning(FutureWarning): andrews_curves(data=df, class_column='Name') - @slow + @pytest.mark.slow def test_parallel_coordinates(self): from pandas.plotting import parallel_coordinates from matplotlib import cm @@ -263,7 +262,7 @@ def test_parallel_coordinates_with_sorted_labels(self): # lables and colors are ordered strictly increasing assert prev[1] < nxt[1] and prev[0] < nxt[0] - @slow + @pytest.mark.slow def 
test_radviz(self): from pandas.plotting import radviz from matplotlib import cm @@ -301,7 +300,7 @@ def test_radviz(self): handles, labels = ax.get_legend_handles_labels() self._check_colors(handles, facecolors=colors) - @slow + @pytest.mark.slow def test_subplot_titles(self): df = self.iris.drop('Name', axis=1).head() # Use the column names as the subplot titles diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 7c66b5dafb9c7..9c9011ba1ca7b 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -12,7 +12,6 @@ from pandas import Series, DataFrame, date_range from pandas.compat import range, lrange import pandas.util.testing as tm -from pandas.util.testing import slow import numpy as np from numpy.random import randn @@ -41,7 +40,7 @@ def setup_method(self, method): self.iseries = tm.makePeriodSeries() self.iseries.name = 'iseries' - @slow + @pytest.mark.slow def test_plot(self): _check_plot_works(self.ts.plot, label='foo') _check_plot_works(self.ts.plot, use_index=False) @@ -79,7 +78,7 @@ def test_plot(self): ax = _check_plot_works(self.ts.plot, subplots=True, layout=(1, -1)) self._check_axes_shape(ax, axes_num=1, layout=(1, 1)) - @slow + @pytest.mark.slow def test_plot_figsize_and_title(self): # figsize and title _, ax = self.plt.subplots() @@ -210,7 +209,7 @@ def test_line_use_index_false(self): label2 = ax2.get_xlabel() assert label2 == '' - @slow + @pytest.mark.slow def test_bar_log(self): expected = np.array([1., 10., 100., 1000.]) @@ -252,7 +251,7 @@ def test_bar_log(self): tm.assert_almost_equal(res[1], ymax) tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), expected) - @slow + @pytest.mark.slow def test_bar_ignore_index(self): df = Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) _, ax = self.plt.subplots() @@ -280,7 +279,7 @@ def test_irregular_datetime(self): ax.set_xlim('1/1/1999', '1/1/2001') assert xp == ax.get_xlim()[0] - @slow + @pytest.mark.slow def test_pie_series(self): # if sum of values is less than 1.0, pie handle them as rate and draw # semicircle. 
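# --- illustrative aside, not part of the patch above ---------------------
# The context line above notes that Series.plot.pie treats values summing to
# less than 1.0 as fractions and draws only part of the circle. A minimal
# sketch of that behavior (assumes matplotlib is installed; the data and
# names are illustrative):
import pandas as pd
rates = pd.Series([0.2, 0.3], index=['a', 'b'], name='rates')
ax = rates.plot.pie()  # values sum to 0.5, so half the pie is left empty
# --------------------------------------------------------------------------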
@@ -339,14 +338,14 @@ def test_pie_nan(self): result = [x.get_text() for x in ax.texts] assert result == expected - @slow + @pytest.mark.slow def test_hist_df_kwargs(self): df = DataFrame(np.random.randn(10, 2)) _, ax = self.plt.subplots() ax = df.plot.hist(bins=5, ax=ax) assert len(ax.patches) == 10 - @slow + @pytest.mark.slow def test_hist_df_with_nonnumerics(self): # GH 9853 with tm.RNGContext(1): @@ -361,7 +360,7 @@ def test_hist_df_with_nonnumerics(self): ax = df.plot.hist(ax=ax) # bins=10 assert len(ax.patches) == 40 - @slow + @pytest.mark.slow def test_hist_legacy(self): _check_plot_works(self.ts.hist) _check_plot_works(self.ts.hist, grid=False) @@ -387,13 +386,13 @@ def test_hist_legacy(self): with pytest.raises(ValueError): self.ts.hist(by=self.ts.index, figure=fig) - @slow + @pytest.mark.slow def test_hist_bins_legacy(self): df = DataFrame(np.random.randn(10, 2)) ax = df.hist(bins=2)[0][0] assert len(ax.patches) == 2 - @slow + @pytest.mark.slow def test_hist_layout(self): df = self.hist_df with pytest.raises(ValueError): @@ -402,7 +401,7 @@ def test_hist_layout(self): with pytest.raises(ValueError): df.height.hist(layout=[1, 1]) - @slow + @pytest.mark.slow def test_hist_layout_with_by(self): df = self.hist_df @@ -446,7 +445,7 @@ def test_hist_layout_with_by(self): self._check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 7)) - @slow + @pytest.mark.slow def test_hist_no_overlap(self): from matplotlib.pyplot import subplot, gcf x = Series(randn(2)) @@ -459,7 +458,7 @@ def test_hist_no_overlap(self): axes = fig.axes if self.mpl_ge_1_5_0 else fig.get_axes() assert len(axes) == 2 - @slow + @pytest.mark.slow def test_hist_secondary_legend(self): # GH 9610 df = DataFrame(np.random.randn(30, 4), columns=list('abcd')) @@ -499,7 +498,7 @@ def test_hist_secondary_legend(self): assert ax.get_yaxis().get_visible() tm.close() - @slow + @pytest.mark.slow def test_df_series_secondary_legend(self): # GH 9779 df = DataFrame(np.random.randn(30, 3), columns=list('abc')) @@ -563,14 +562,14 @@ def test_df_series_secondary_legend(self): assert ax.get_yaxis().get_visible() tm.close() - @slow + @pytest.mark.slow def test_plot_fails_with_dupe_color_and_style(self): x = Series(randn(2)) with pytest.raises(ValueError): _, ax = self.plt.subplots() x.plot(style='k--', color='k', ax=ax) - @slow + @pytest.mark.slow def test_hist_kde(self): _, ax = self.plt.subplots() ax = self.ts.plot.hist(logy=True, ax=ax) @@ -593,7 +592,7 @@ def test_hist_kde(self): ylabels = ax.get_yticklabels() self._check_text_labels(ylabels, [''] * len(ylabels)) - @slow + @pytest.mark.slow def test_kde_kwargs(self): tm._skip_if_no_scipy() _skip_if_no_scipy_gaussian_kde() @@ -608,7 +607,7 @@ def test_kde_kwargs(self): self._check_ax_scales(ax, yaxis='log') self._check_text_labels(ax.yaxis.get_label(), 'Density') - @slow + @pytest.mark.slow def test_kde_missing_vals(self): tm._skip_if_no_scipy() _skip_if_no_scipy_gaussian_kde() @@ -619,7 +618,7 @@ def test_kde_missing_vals(self): # gh-14821: check if the values have any missing values assert any(~np.isnan(axes.lines[0].get_xdata())) - @slow + @pytest.mark.slow def test_hist_kwargs(self): _, ax = self.plt.subplots() ax = self.ts.plot.hist(bins=5, ax=ax) @@ -637,7 +636,7 @@ def test_hist_kwargs(self): ax = self.ts.plot.hist(align='left', stacked=True, ax=ax) tm.close() - @slow + @pytest.mark.slow def test_hist_kde_color(self): _, ax = self.plt.subplots() ax = self.ts.plot.hist(logy=True, bins=10, color='b', ax=ax) @@ -654,7 +653,7 @@ def test_hist_kde_color(self): assert len(lines) 
== 1 self._check_colors(lines, ['r']) - @slow + @pytest.mark.slow def test_boxplot_series(self): _, ax = self.plt.subplots() ax = self.ts.plot.box(logy=True, ax=ax) @@ -664,7 +663,7 @@ def test_boxplot_series(self): ylabels = ax.get_yticklabels() self._check_text_labels(ylabels, [''] * len(ylabels)) - @slow + @pytest.mark.slow def test_kind_both_ways(self): s = Series(range(3)) kinds = (plotting._core._common_kinds + @@ -676,7 +675,7 @@ def test_kind_both_ways(self): s.plot(kind=kind, ax=ax) getattr(s.plot, kind)() - @slow + @pytest.mark.slow def test_invalid_plot_data(self): s = Series(list('abcd')) _, ax = self.plt.subplots() @@ -686,7 +685,7 @@ def test_invalid_plot_data(self): with pytest.raises(TypeError): s.plot(kind=kind, ax=ax) - @slow + @pytest.mark.slow def test_valid_object_plot(self): s = Series(lrange(10), dtype=object) for kind in plotting._core._common_kinds: @@ -708,7 +707,7 @@ def test_invalid_kind(self): with pytest.raises(ValueError): s.plot(kind='aasdf') - @slow + @pytest.mark.slow def test_dup_datetime_index_plot(self): dr1 = date_range('1/1/2009', periods=4) dr2 = date_range('1/2/2009', periods=4) @@ -717,7 +716,7 @@ def test_dup_datetime_index_plot(self): s = Series(values, index=index) _check_plot_works(s.plot) - @slow + @pytest.mark.slow def test_errorbar_plot(self): s = Series(np.arange(10), name='x') @@ -764,14 +763,14 @@ def test_table(self): _check_plot_works(self.series.plot, table=True) _check_plot_works(self.series.plot, table=self.series) - @slow + @pytest.mark.slow def test_series_grid_settings(self): # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 self._check_grid_settings(Series([1, 2, 3]), plotting._core._series_kinds + plotting._core._common_kinds) - @slow + @pytest.mark.slow def test_standard_colors(self): from pandas.plotting._style import _get_standard_colors @@ -788,7 +787,7 @@ def test_standard_colors(self): result = _get_standard_colors(3, color=[c]) assert result == [c] * 3 - @slow + @pytest.mark.slow def test_standard_colors_all(self): import matplotlib.colors as colors from pandas.plotting._style import _get_standard_colors diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index 7774d10c5eaf8..6d8a54b538237 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -20,8 +20,7 @@ from pandas.compat import lrange, range from pandas import compat -from pandas.util.testing import (slow, - assert_series_equal, +from pandas.util.testing import (assert_series_equal, assert_almost_equal, assert_frame_equal) import pandas.util.testing as tm @@ -2592,7 +2591,7 @@ def test_series_set_value(self): # s2 = s.set_value(dates[1], index[1]) # assert s2.values.dtype == 'M8[ns]' - @slow + @pytest.mark.slow def test_slice_locs_indexerror(self): times = [datetime(2000, 1, 1) + timedelta(minutes=i * 10) for i in range(100000)] diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index 08c3a25e66b0e..2b972477ae999 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -16,7 +16,7 @@ from pandas import compat, _np_version_under1p11, _np_version_under1p13 from pandas.util.testing import (assert_almost_equal, assert_series_equal, assert_frame_equal, assert_panel_equal, - assert_panel4d_equal, slow) + assert_panel4d_equal) from pandas.io.formats.printing import pprint_thing import pandas.util.testing as tm @@ -196,7 +196,7 @@ def test_integer_arithmetic_frame(self): def test_integer_arithmetic_series(self): 
self.run_series(self.integer.iloc[:, 0], self.integer.iloc[:, 0]) - @slow + @pytest.mark.slow def test_integer_panel(self): self.run_panel(_integer2_panel, np.random.randint(1, 100)) @@ -206,11 +206,11 @@ def test_float_arithemtic_frame(self): def test_float_arithmetic_series(self): self.run_series(self.frame2.iloc[:, 0], self.frame2.iloc[:, 0]) - @slow + @pytest.mark.slow def test_float_panel(self): self.run_panel(_frame2_panel, np.random.randn() + 0.1, binary_comp=0.8) - @slow + @pytest.mark.slow def test_panel4d(self): with catch_warnings(record=True): self.run_panel(tm.makePanel4D(), np.random.randn() + 0.5, @@ -226,7 +226,7 @@ def test_mixed_arithmetic_series(self): for col in self.mixed2.columns: self.run_series(self.mixed2[col], self.mixed2[col], binary_comp=4) - @slow + @pytest.mark.slow def test_mixed_panel(self): self.run_panel(_mixed2_panel, np.random.randint(1, 100), binary_comp=-2) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 3ba5d2065cddf..dd35e4375841e 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -2155,7 +2155,7 @@ def _non_null_values(x): assert_equal(cov_x_y, mean_x_times_y - (mean_x * mean_y)) - @tm.slow + @pytest.mark.slow def test_ewm_consistency(self): def _weights(s, com, adjust, ignore_na): if isinstance(s, DataFrame): @@ -2254,7 +2254,7 @@ def _ewma(s, com, min_periods, adjust, ignore_na): _variance_debiasing_factors(x, com=com, adjust=adjust, ignore_na=ignore_na))) - @tm.slow + @pytest.mark.slow def test_expanding_consistency(self): # suppress warnings about empty slices, as we are deliberately testing @@ -2328,7 +2328,7 @@ def test_expanding_consistency(self): assert_equal(expanding_f_result, expanding_apply_f_result) - @tm.slow + @pytest.mark.slow def test_rolling_consistency(self): # suppress warnings about empty slices, as we are deliberately testing diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 17e09b38b20e0..d6ba9561340cc 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -50,13 +50,6 @@ from pandas._libs import testing as _testing from pandas.io.common import urlopen -try: - import pytest - slow = pytest.mark.slow -except ImportError: - # Should be ok to just ignore. If you actually need - # slow then you'll hit an import error long before getting here. - pass N = 30 From 63536f4a80a1f1f03732411d015910c55a1f9290 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 13 Jul 2017 12:15:26 -0700 Subject: [PATCH 12/54] MAINT: Remove unused mock import (#16908) We import it, set it as an attribute, and then don't use it. 
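The ``@slow`` -> ``@pytest.mark.slow`` sweep above removes the old import shim from ``pandas.util.testing`` entirely, so tests now opt in to the marker through pytest itself. A minimal sketch of the new pattern (the module and test names here are hypothetical, not taken from the pandas suite):

import pytest

@pytest.mark.slow
def test_expensive_roundtrip():
    # stands in for a genuinely long-running test body
    assert sum(range(10 ** 6)) == (10 ** 6 - 1) * 10 ** 6 // 2

Such tests can then be deselected wholesale with ``pytest -m "not slow"``, and registering the marker (for example under ``markers`` in a ``setup.cfg`` ``[tool:pytest]`` section) keeps pytest from warning about an unknown mark.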
--- pandas/tests/io/formats/test_printing.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pandas/tests/io/formats/test_printing.py b/pandas/tests/io/formats/test_printing.py index aae3ba31648ff..ec34e7656e01f 100644 --- a/pandas/tests/io/formats/test_printing.py +++ b/pandas/tests/io/formats/test_printing.py @@ -127,14 +127,7 @@ class TestTableSchemaRepr(object): @classmethod def setup_class(cls): pytest.importorskip('IPython') - try: - import mock - except ImportError: - try: - from unittest import mock - except ImportError: - pytest.skip("Mock is not installed") - cls.mock = mock + from IPython.core.interactiveshell import InteractiveShell cls.display_formatter = InteractiveShell.instance().display_formatter From 25384ba459ba7de9fb9d36821f0a4ae239cc40b2 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Thu, 13 Jul 2017 21:35:48 +0100 Subject: [PATCH 13/54] Let _get_dtype accept Categoricals and CategoricalIndex (#16887) --- doc/source/whatsnew/v0.21.0.txt | 1 - pandas/core/dtypes/common.py | 4 +++- pandas/tests/dtypes/test_common.py | 6 +++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index a5ee0e0ce2653..8ba57c0fa50be 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -149,7 +149,6 @@ Conversion ^^^^^^^^^^ - Indexing ^^^^^^^^ diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 2eebf3704253e..a386c04cc4fdd 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -11,7 +11,7 @@ ExtensionDtype) from .generic import (ABCCategorical, ABCPeriodIndex, ABCDatetimeIndex, ABCSeries, - ABCSparseArray, ABCSparseSeries) + ABCSparseArray, ABCSparseSeries, ABCCategoricalIndex) from .inference import is_string_like from .inference import * # noqa @@ -1713,6 +1713,8 @@ def _get_dtype(arr_or_dtype): return PeriodDtype.construct_from_string(arr_or_dtype) elif is_interval_dtype(arr_or_dtype): return IntervalDtype.construct_from_string(arr_or_dtype) + elif isinstance(arr_or_dtype, (ABCCategorical, ABCCategoricalIndex)): + return arr_or_dtype.dtype if hasattr(arr_or_dtype, 'dtype'): arr_or_dtype = arr_or_dtype.dtype diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index c32e8590c5675..7188e397c0617 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -532,16 +532,16 @@ def test_is_complex_dtype(): (float, np.dtype(float)), ('float64', np.dtype('float64')), (np.dtype('float64'), np.dtype('float64')), - pytest.mark.xfail((str, np.dtype(' Date: Thu, 13 Jul 2017 19:04:29 -0400 Subject: [PATCH 14/54] Fixes for #16896(TimedeltaIndex indexing regression for strings) (#16907) --- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/core/dtypes/common.py | 4 +++- pandas/tests/dtypes/test_common.py | 9 +++++++-- pandas/tests/indexes/timedeltas/test_timedelta.py | 3 +++ pandas/tests/indexing/test_timedelta.py | 9 ++++++++- 5 files changed, 22 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 8ba57c0fa50be..039b24cc63217 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -154,7 +154,7 @@ Indexing - When called with a null slice (e.g. ``df.iloc[:]``), the ``.iloc`` and ``.loc`` indexers return a shallow copy of the original object. Previously they returned the original object. (:issue:`13873`). 
- When called on an unsorted ``MultiIndex``, the ``loc`` indexer now will raise ``UnsortedIndexError`` only if proper slicing is used on non-sorted levels (:issue:`16734`). - +- Fixes regression in 0.20.3 when indexing with a string on a ``TimedeltaIndex`` (:issue:`16896`). I/O ^^^ diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index a386c04cc4fdd..114900ce802be 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -392,13 +392,15 @@ def is_timedelta64_dtype(arr_or_dtype): False >>> is_timedelta64_dtype(pd.Series([], dtype="timedelta64[ns]")) True + >>> is_timedelta64_dtype('0 days') + False """ if arr_or_dtype is None: return False try: tipo = _get_dtype_type(arr_or_dtype) - except ValueError: + except: return False return issubclass(tipo, np.timedelta64) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 7188e397c0617..290cdd732b6d6 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -199,12 +199,17 @@ def test_is_datetime64tz_dtype(): def test_is_timedelta64_dtype(): assert not com.is_timedelta64_dtype(object) + assert not com.is_timedelta64_dtype(None) assert not com.is_timedelta64_dtype([1, 2, 3]) assert not com.is_timedelta64_dtype(np.array([], dtype=np.datetime64)) + assert not com.is_timedelta64_dtype('0 days') + assert not com.is_timedelta64_dtype("0 days 00:00:00") + assert not com.is_timedelta64_dtype(["0 days 00:00:00"]) + assert not com.is_timedelta64_dtype("NO DATE") + assert com.is_timedelta64_dtype(np.timedelta64) assert com.is_timedelta64_dtype(pd.Series([], dtype="timedelta64[ns]")) - - assert not com.is_timedelta64_dtype("0 days 00:00:00") + assert com.is_timedelta64_dtype(pd.to_timedelta(['0 days', '1 days'])) def test_is_period_dtype(): diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 08cf5108ffdb1..a4fc26382fb9b 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -66,6 +66,9 @@ def test_get_loc(self): for method, loc in [('pad', 1), ('backfill', 2), ('nearest', 1)]: assert idx.get_loc('1 day 1 hour', method) == loc + # GH 16896 + assert idx.get_loc('0 days') == 0 + def test_get_loc_nat(self): tidx = TimedeltaIndex(['1 days 01:00:00', 'NaT', '2 days 01:00:00']) diff --git a/pandas/tests/indexing/test_timedelta.py b/pandas/tests/indexing/test_timedelta.py index be3ea8f0c371d..32609362e49af 100644 --- a/pandas/tests/indexing/test_timedelta.py +++ b/pandas/tests/indexing/test_timedelta.py @@ -5,7 +5,6 @@ class TestTimedeltaIndexing(object): - def test_boolean_indexing(self): # GH 14946 df = pd.DataFrame({'x': range(10)}) @@ -40,3 +39,11 @@ def test_list_like_indexing(self, indexer, expected): dtype="int64") tm.assert_frame_equal(expected, df) + + def test_string_indexing(self): + # GH 16896 + df = pd.DataFrame({'x': range(3)}, + index=pd.to_timedelta(range(3), unit='days')) + expected = df.iloc[0] + sliced = df.loc['0 days'] + tm.assert_series_equal(sliced, expected) From 6000c5b9624fdd8925099f215eba282bfbef87ce Mon Sep 17 00:00:00 2001 From: jdeschenes Date: Fri, 14 Jul 2017 10:13:53 -0400 Subject: [PATCH 15/54] Fix for #16909 (TimedeltaIndex.get_loc is not working on np.timedelta64 data type) (#16912) --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/indexes/timedeltas.py | 4 ++-- pandas/tests/indexes/timedeltas/test_timedelta.py | 3 +++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git
a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 039b24cc63217..2716d9b09eaa9 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -155,6 +155,7 @@ Indexing - When called with a null slice (e.g. ``df.iloc[:]``), the ``.iloc`` and ``.loc`` indexers return a shallow copy of the original object. Previously they returned the original object. (:issue:`13873`). - When called on an unsorted ``MultiIndex``, the ``loc`` indexer now will raise ``UnsortedIndexError`` only if proper slicing is used on non-sorted levels (:issue:`16734`). - Fixes regression in 0.20.3 when indexing with a string on a ``TimedeltaIndex`` (:issue:`16896`). +- Fixed ``TimedeltaIndex.get_loc`` handling of ``np.timedelta64`` inputs (:issue:`16909`). I/O ^^^ diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index faec813df3993..68713743d72ed 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -15,7 +15,7 @@ _ensure_int64) from pandas.core.dtypes.missing import isnull from pandas.core.dtypes.generic import ABCSeries -from pandas.core.common import _maybe_box, _values_from_object, is_bool_indexer +from pandas.core.common import _maybe_box, _values_from_object from pandas.core.indexes.base import Index from pandas.core.indexes.numeric import Int64Index @@ -682,7 +682,7 @@ def get_loc(self, key, method=None, tolerance=None): ------- loc : int """ - if is_bool_indexer(key) or is_timedelta64_dtype(key): + if is_list_like(key): raise TypeError if isnull(key): diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index a4fc26382fb9b..59e4b1432b8bc 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -66,6 +66,9 @@ def test_get_loc(self): for method, loc in [('pad', 1), ('backfill', 2), ('nearest', 1)]: assert idx.get_loc('1 day 1 hour', method) == loc + # GH 16909 + assert idx.get_loc(idx[1].to_timedelta64()) == 1 + # GH 16896 assert idx.get_loc('0 days') == 0 From a587d568d213c62307a72d98d6913239f55844e8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 14 Jul 2017 14:46:41 -0500 Subject: [PATCH 16/54] DOC: Recommend sphinx 1.5 for now (#16929) For the SciPy sprint tomorrow, until the cause of the doc-building slowdown is fully identified. 
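The two ``TimedeltaIndex`` fixes above (#16896 and #16909) both widen what ``get_loc`` accepts; a small demonstration of the repaired behavior, assuming a pandas build that includes both patches:

import numpy as np
import pandas as pd

idx = pd.to_timedelta(range(3), unit='days')
df = pd.DataFrame({'x': range(3)}, index=idx)

# GH 16896: string keys resolve again instead of raising TypeError
assert idx.get_loc('0 days') == 0
assert df.loc['0 days', 'x'] == 0

# GH 16909: np.timedelta64 scalars are accepted as well
assert idx.get_loc(idx[1].to_timedelta64()) == 1
assert idx.get_loc(np.timedelta64(2, 'D')) == 2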
--- ci/requirements_all.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements_all.txt b/ci/requirements_all.txt index e9f49ed879c86..de37ec4d20be4 100644 --- a/ci/requirements_all.txt +++ b/ci/requirements_all.txt @@ -2,7 +2,7 @@ pytest pytest-cov pytest-xdist flake8 -sphinx +sphinx=1.5* nbsphinx ipython python-dateutil From 6858d0f6caa60c98acc4b6c3eaa6cd0309aedca6 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Fri, 14 Jul 2017 22:20:28 +0100 Subject: [PATCH 17/54] BUG: Allow value labels to be read with iterator (#16926) Allow value labels to be read before the iterator has been used Fix issue where categorical data was incorrectly reformatted when write_index was False closes #16923 --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/io/stata.py | 36 ++++++++++++++++++--------------- pandas/tests/io/test_stata.py | 18 ++++++++++++++--- 3 files changed, 36 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 2716d9b09eaa9..bd19d71182762 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -162,6 +162,7 @@ I/O - Bug in :func:`read_csv` in which non integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`) +- Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`) Plotting ^^^^^^^^ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 107dccfc8175c..30991d8a24c63 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -997,6 +997,7 @@ def __init__(self, path_or_buf, convert_dates=True, self.path_or_buf = BytesIO(contents) self._read_header() + self._setup_dtype() def __enter__(self): """ enter context manager """ @@ -1299,6 +1300,23 @@ def _read_old_header(self, first_char): # necessary data to continue parsing self.data_location = self.path_or_buf.tell() + def _setup_dtype(self): + """Map between numpy and Stata dtypes""" + if self._dtype is not None: + return self._dtype + + dtype = [] # Convert struct data types to numpy data type + for i, typ in enumerate(self.typlist): + if typ in self.NUMPY_TYPE_MAP: + dtype.append(('s' + str(i), self.byteorder + + self.NUMPY_TYPE_MAP[typ])) + else: + dtype.append(('s' + str(i), 'S' + str(typ))) + dtype = np.dtype(dtype) + self._dtype = dtype + + return self._dtype + def _calcsize(self, fmt): return (type(fmt) is int and fmt or struct.calcsize(self.byteorder + fmt)) @@ -1472,22 +1490,10 @@ def read(self, nrows=None, convert_dates=None, if nrows is None: nrows = self.nobs - if (self.format_version >= 117) and (self._dtype is None): + if (self.format_version >= 117) and (not self._value_labels_read): self._can_read_value_labels = True self._read_strls() - # Setup the dtype.
- if self._dtype is None: - dtype = [] # Convert struct data types to numpy data type - for i, typ in enumerate(self.typlist): - if typ in self.NUMPY_TYPE_MAP: - dtype.append(('s' + str(i), self.byteorder + - self.NUMPY_TYPE_MAP[typ])) - else: - dtype.append(('s' + str(i), 'S' + str(typ))) - dtype = np.dtype(dtype) - self._dtype = dtype - # Read data dtype = self._dtype max_read_len = (self.nobs - self._lines_read) * dtype.itemsize @@ -1958,7 +1964,6 @@ def _prepare_categoricals(self, data): return data get_base_missing_value = StataMissingValue.get_base_missing_value - index = data.index data_formatted = [] for col, col_is_cat in zip(data, is_cat): if col_is_cat: @@ -1981,8 +1986,7 @@ def _prepare_categoricals(self, data): # Replace missing values with Stata missing value for type values[values == -1] = get_base_missing_value(dtype) - data_formatted.append((col, values, index)) - + data_formatted.append((col, values)) else: data_formatted.append((col, data[col])) return DataFrame.from_items(data_formatted) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index b9c6736563160..a414928d318c4 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -9,18 +9,18 @@ from datetime import datetime from distutils.version import LooseVersion -import pytest import numpy as np import pandas as pd import pandas.util.testing as tm +import pytest from pandas import compat +from pandas._libs.tslib import NaT from pandas.compat import iterkeys +from pandas.core.dtypes.common import is_categorical_dtype from pandas.core.frame import DataFrame, Series from pandas.io.parsers import read_csv from pandas.io.stata import (read_stata, StataReader, InvalidColumnName, PossiblePrecisionLoss, StataMissingValue) -from pandas._libs.tslib import NaT -from pandas.core.dtypes.common import is_categorical_dtype class TestStata(object): @@ -1297,3 +1297,15 @@ def test_pickle_path_localpath(self): reader = lambda x: read_stata(x).set_index('index') result = tm.round_trip_localpath(df.to_stata, reader) tm.assert_frame_equal(df, result) + + @pytest.mark.parametrize('write_index', [True, False]) + def test_value_labels_iterator(self, write_index): + # GH 16923 + d = {'A': ['B', 'E', 'C', 'A', 'E']} + df = pd.DataFrame(data=d) + df['A'] = df['A'].astype('category') + with tm.ensure_clean() as path: + df.to_stata(path, write_index=write_index) + dta_iter = pd.read_stata(path, iterator=True) + value_labels = dta_iter.value_labels() + assert value_labels == {'A': {0: 'A', 1: 'B', 2: 'C', 3: 'E'}} From ad24759871ea43131711cfce1e5fc69c06d82956 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 14 Jul 2017 21:16:00 -0700 Subject: [PATCH 18/54] DOC: Update flake8 command instructions (#16919) --- .github/PULL_REQUEST_TEMPLATE.md | 2 +- doc/source/contributing.rst | 24 +++++++++++++++++------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 959858fb50f89..e8b6ee21ad104 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,4 +1,4 @@ - [ ] closes #xxxx - [ ] tests added / passed - - [ ] passes ``git diff upstream/master --name-only -- '*.py' | flake8 --diff`` (On Windows, ``git diff upstream/master -u -- "*.py" | flake8 --diff`` might work as an alternative.) 
+ - [ ] passes ``git diff upstream/master -u -- "*.py" | flake8 --diff`` - [ ] whatsnew entry diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index cd444f796fabb..bfcf560565977 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -509,7 +509,7 @@ the `flake8 `_ tool and report any stylistic errors in your code. Therefore, it is helpful before submitting code to run the check yourself on the diff:: - git diff master --name-only -- '*.py' | flake8 --diff + git diff master -u -- "*.py" | flake8 --diff This command will catch any stylistic errors in your changes specifically, but be beware it may not catch all of them. For example, if you delete the only @@ -518,18 +518,28 @@ unused function. However, style-checking the diff will not catch this because the actual import is not part of the diff. Thus, for completeness, you should run this command, though it will take longer:: - git diff master --name-only -- '*.py' | grep 'pandas/' | xargs -r flake8 + git diff master --name-only -- "*.py" | grep "pandas/" | xargs -r flake8 Note that on OSX, the ``-r`` flag is not available, so you have to omit it and run this slightly modified command:: - git diff master --name-only -- '*.py' | grep 'pandas/' | xargs flake8 + git diff master --name-only -- "*.py" | grep "pandas/" | xargs flake8 -Note that on Windows, ``grep``, ``xargs``, and other tools are likely -unavailable. However, this has been shown to work on smaller commits in the -standard Windows command line:: +Note that on Windows, these commands are unfortunately not possible because +commands like ``grep`` and ``xargs`` are not available natively. To imitate the +behavior with the commands above, you should run:: - git diff master -u -- "*.py" | flake8 --diff + git diff master --name-only -- "*.py" + +This will list all of the Python files that have been modified. The only ones +that matter during linting are any whose directory filepath begins with "pandas." +For each filepath, copy and paste it after the ``flake8`` command as shown below:: + + flake8 <filepath> + +Alternatively, you can install the ``grep`` and ``xargs`` commands via the +`MinGW `__ toolchain, and it will allow you to run the commands above.
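For contributors who cannot use ``grep`` and ``xargs`` at all, the same changed-files lint can also be scripted in Python; the helper below is a rough sketch only (the script is hypothetical, not part of the pandas tooling):

import subprocess
import sys

def lint_changed_files(base='master'):
    # git diff <base> --name-only -- "*.py"
    out = subprocess.check_output(
        ['git', 'diff', base, '--name-only', '--', '*.py'],
        universal_newlines=True)
    # keep only the paths the grep command above would select
    targets = [f for f in out.splitlines() if f.startswith('pandas/')]
    if not targets:
        return 0
    # flake8 exits with a non-zero status when it finds style errors
    return subprocess.call([sys.executable, '-m', 'flake8'] + targets)

if __name__ == '__main__':
    sys.exit(lint_changed_files())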
Backwards Compatibility ~~~~~~~~~~~~~~~~~~~~~~~ From 5f2b96bb637f6ddeec169c5ef8ad20013a03c853 Mon Sep 17 00:00:00 2001 From: Eric Wieser Date: Sat, 15 Jul 2017 13:30:03 +0100 Subject: [PATCH 19/54] TST: Don't assert that a bug exists in numpy (#16940) Better to ignore the warning from the bug, rather than assert the bug is still there After this change, numpy/numpy#9412 _could_ be backported to fix the bug --- pandas/tests/test_algos.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 9504d2a9426f0..993dcc4f527b2 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2,6 +2,7 @@ import numpy as np import pytest +import warnings from numpy.random import RandomState from numpy import nan @@ -127,7 +128,7 @@ def test_unsortable(self): arr = np.array([1, 2, datetime.now(), 0, 3], dtype=object) if compat.PY2 and not pd._np_version_under1p10: # RuntimeWarning: tp_compare didn't return -1 or -2 for exception - with tm.assert_produces_warning(RuntimeWarning): + with warnings.catch_warnings(): pytest.raises(TypeError, algos.safe_sort, arr) else: pytest.raises(TypeError, algos.safe_sort, arr) From 6cee09ebfd2e8fb15f3e225bd9770852a6a533d1 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 15 Jul 2017 08:11:34 -0500 Subject: [PATCH 20/54] CI: add .pep8speakes.yml --- .pep8speakes.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 .pep8speakes.yml diff --git a/.pep8speakes.yml b/.pep8speakes.yml new file mode 100644 index 0000000000000..299b76c8922cc --- /dev/null +++ b/.pep8speakes.yml @@ -0,0 +1,10 @@ +# File : .pep8speaks.yml + +scanner: + diff_only: True # If True, errors caused by only the patch are shown + +pycodestyle: + max-line-length: 79 + ignore: # Errors and warnings to ignore + - E731 + - E402 From 80e40f81d78ade9921607a092a00b83f9d34cfd3 Mon Sep 17 00:00:00 2001 From: faic Date: Sat, 15 Jul 2017 16:58:24 +0300 Subject: [PATCH 21/54] CLN16668: remove OrderedDefaultDict (#16939) --- pandas/compat/__init__.py | 25 ------------------------- pandas/core/panel.py | 6 ++++-- 2 files changed, 4 insertions(+), 27 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 9eacb9acef2c9..33b41d61aa978 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -21,7 +21,6 @@ given metaclass instead (and avoids intermediary class creation) Other items: -* OrderedDefaultDict * platform checker """ # pylint disable=W0611 @@ -373,30 +372,6 @@ def parse_date(timestr, *args, **kwargs): parse_date = _date_parser.parse -class OrderedDefaultdict(OrderedDict): - - def __init__(self, *args, **kwargs): - newdefault = None - newargs = () - if args: - newdefault = args[0] - if not (newdefault is None or callable(newdefault)): - raise TypeError('first argument must be callable or None') - newargs = args[1:] - self.default_factory = newdefault - super(self.__class__, self).__init__(*newargs, **kwargs) - - def __missing__(self, key): - if self.default_factory is None: - raise KeyError(key) - self[key] = value = self.default_factory() - return value - - def __reduce__(self): # optional, for pickle support - args = self.default_factory if self.default_factory else tuple() - return type(self), args, None, None, list(self.items()) - - # https://github.com/pandas-dev/pandas/pull/9123 def is_platform_little_endian(): """ am I little endian """ diff --git a/pandas/core/panel.py b/pandas/core/panel.py index d1f5b4587059c..69a8468552f54 100644 --- 
a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -19,7 +19,7 @@ import pandas.core.ops as ops import pandas.core.missing as missing from pandas import compat -from pandas.compat import (map, zip, range, u, OrderedDict, OrderedDefaultdict) +from pandas.compat import (map, zip, range, u, OrderedDict) from pandas.compat.numpy import function as nv from pandas.core.common import _try_sort, _default_index from pandas.core.frame import DataFrame @@ -260,9 +260,11 @@ def from_dict(cls, data, intersect=False, orient='items', dtype=None): ------- Panel """ + from collections import defaultdict + orient = orient.lower() if orient == 'minor': - new_data = OrderedDefaultdict(dict) + new_data = defaultdict(OrderedDict) for col, df in compat.iteritems(data): for item, s in compat.iteritems(df): new_data[item][col] = s From 61f0c5ce2eae8a548e4729ee5cc8a8633faa8316 Mon Sep 17 00:00:00 2001 From: Alan Velasco Date: Sat, 15 Jul 2017 10:34:04 -0500 Subject: [PATCH 22/54] Change "pls" to "please" in error message (#16947) --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6559fc4c24ce2..4d8b831b7d63f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3359,7 +3359,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, inplace = validate_bool_kwarg(inplace, 'inplace') # 10726 if by is not None: - warnings.warn("by argument to sort_index is deprecated, pls use " + warnings.warn("by argument to sort_index is deprecated, please use " ".sort_values(by=...)", FutureWarning, stacklevel=2) if level is not None: raise ValueError("unable to simultaneously sort by and level") From 0e47b280ae6159dbc8817f3c7bd3e296af480c5d Mon Sep 17 00:00:00 2001 From: Alex Lubbock Date: Sat, 15 Jul 2017 10:34:31 -0500 Subject: [PATCH 23/54] BUG: MultiIndex sort with ascending as list (#16937) --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/indexes/multi.py | 3 ++- pandas/tests/test_multilevel.py | 23 +++++++++++++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index bd19d71182762..6ddf6029b99bb 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -156,6 +156,7 @@ Indexing - When called on an unsorted ``MultiIndex``, the ``loc`` indexer now will raise ``UnsortedIndexError`` only if proper slicing is used on non-sorted levels (:issue:`16734`). - Fixes regression in 0.20.3 when indexing with a string on a ``TimedeltaIndex`` (:issue:`16896`). - Fixed ``TimedeltaIndex.get_loc`` handling of ``np.timedelta64`` inputs (:issue:`16909`). +- Fix :meth:`MultiIndex.sort_index` ordering when ``ascending`` argument is a list, but not all levels are specified, or are in a different order (:issue:`16934`). 
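For illustration, a minimal sketch of the fixed behavior (mirroring the test added in this patch; the data are illustrative)::

    import pandas as pd

    arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
              ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
              [4, 3, 2, 1, 4, 3, 2, 1]]
    index = pd.MultiIndex.from_arrays(arrays,
                                      names=['first', 'second', 'third'])
    s = pd.Series(range(8), index=index)

    # ``ascending`` now pairs positionally with the levels named in
    # ``level``: sort 'third' descending, then 'first' ascending.
    s.sort_index(level=['third', 'first'], ascending=[False, True])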
I/O ^^^ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 81eac0ac0684f..ed7ca079a07b5 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1697,7 +1697,8 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): raise ValueError("level must have same length as ascending") from pandas.core.sorting import lexsort_indexer - indexer = lexsort_indexer(self.labels, orders=ascending) + indexer = lexsort_indexer([self.labels[lev] for lev in level], + orders=ascending) # level ordering else: diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index c8c210c42eac2..a56ff0fc2d158 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2781,3 +2781,26 @@ def test_sort_index_nan(self): result = s.sort_index(na_position='first') expected = s.iloc[[1, 2, 3, 0]] tm.assert_series_equal(result, expected) + + def test_sort_ascending_list(self): + # GH: 16934 + + # Set up a Series with a three level MultiIndex + arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'], + [4, 3, 2, 1, 4, 3, 2, 1]] + tuples = list(zip(*arrays)) + index = pd.MultiIndex.from_tuples(tuples, + names=['first', 'second', 'third']) + s = pd.Series(range(8), index=index) + + # Sort with boolean ascending + result = s.sort_index(level=['third', 'first'], ascending=False) + expected = s.iloc[[4, 0, 5, 1, 6, 2, 7, 3]] + tm.assert_series_equal(result, expected) + + # Sort with list of boolean ascending + result = s.sort_index(level=['third', 'first'], + ascending=[False, True]) + expected = s.iloc[[0, 4, 1, 5, 2, 6, 3, 7]] + tm.assert_series_equal(result, expected) From d7bf220c2daeaf86ba2e2026b4fe900d441720d8 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Sat, 15 Jul 2017 17:38:22 +0200 Subject: [PATCH 24/54] DOC: Improving docstring of pop method (#16416) (#16520) --- pandas/core/generic.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5722539b87aec..a4bb746722c1e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -547,6 +547,43 @@ def swapaxes(self, axis1, axis2, copy=True): def pop(self, item): """ Return item and drop from frame. Raise KeyError if not found. + + Parameters + ---------- + item : str + Column label to be popped + + Returns + ------- + popped : Series + + Examples + -------- + >>> df = pd.DataFrame([('falcon', 'bird', 389.0), + ... ('parrot', 'bird', 24.0), + ... ('lion', 'mammal', 80.5), + ... ('monkey', 'mammal', np.nan)], + ... 
columns=('name', 'class', 'max_speed')) + >>> df + name class max_speed + 0 falcon bird 389.0 + 1 parrot bird 24.0 + 2 lion mammal 80.5 + 3 monkey mammal NaN + + >>> df.pop('class') + 0 bird + 1 bird + 2 mammal + 3 mammal + Name: class, dtype: object + + >>> df + name max_speed + 0 falcon 389.0 + 1 parrot 24.0 + 2 lion 80.5 + 3 monkey NaN """ result = self[item] del self[item] From 794fd789603e06e86456375f92489ae4de92a99a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 15 Jul 2017 10:57:41 -0500 Subject: [PATCH 25/54] PEP8 --- pandas/core/frame.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4d8b831b7d63f..b5462bbe67647 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3359,8 +3359,9 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, inplace = validate_bool_kwarg(inplace, 'inplace') # 10726 if by is not None: - warnings.warn("by argument to sort_index is deprecated, please use " - ".sort_values(by=...)", FutureWarning, stacklevel=2) + warnings.warn("by argument to sort_index is deprecated, " + "please use .sort_values(by=...)", + FutureWarning, stacklevel=2) if level is not None: raise ValueError("unable to simultaneously sort by and level") return self.sort_values(by, axis=axis, ascending=ascending, From daf07a64d681e70eda6211b739919cae6345e86f Mon Sep 17 00:00:00 2001 From: Alan Velasco Date: Sat, 15 Jul 2017 11:01:38 -0500 Subject: [PATCH 26/54] WARN: add stacklevel to to_dict() UserWarning (#16927) (#16936) * ERR: add stacklevel to to_dict() UserWarning (#16927) * TST: Add warning testing to to_dict() * Fix warning assertion on to_dict() test * Add github issue to documentation on to_dict() warning test --- pandas/core/frame.py | 3 ++- pandas/tests/frame/test_convert_to.py | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b5462bbe67647..9920ddf854850 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -972,7 +972,8 @@ def to_dict(self, orient='dict', into=dict): """ if not self.columns.is_unique: warnings.warn("DataFrame columns are not unique, some " - "columns will be omitted.", UserWarning) + "columns will be omitted.", UserWarning, + stacklevel=2) # GH16122 into_c = standardize_mapping(into) if orient.lower().startswith('d'): diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index 34dd138ee1c80..629c695b702fe 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -216,6 +216,13 @@ def test_to_dict_errors(self, mapping): with pytest.raises(TypeError): df.to_dict(into=mapping) + def test_to_dict_not_unique_warning(self): + # GH16927: When converting to a dict, if a column has a non-unique name + # it will be dropped, throwing a warning. 
+ df = DataFrame([[1, 2, 3]], columns=['a', 'a', 'b']) + with tm.assert_produces_warning(UserWarning): + df.to_dict() + @pytest.mark.parametrize('tz', ['UTC', 'GMT', 'US/Eastern']) def test_to_records_datetimeindex_with_tz(self, tz): # GH13937 From 4c498f8451fe4c491a6f38ed9e35da3d3ab6b9b8 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 15 Jul 2017 11:26:10 -0500 Subject: [PATCH 27/54] CI: fix pep8speaks .yml file --- .pep8speakes.yml => .pep8speaks.yml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .pep8speakes.yml => .pep8speaks.yml (100%) diff --git a/.pep8speakes.yml b/.pep8speaks.yml similarity index 100% rename from .pep8speakes.yml rename to .pep8speaks.yml From 7500218947bffd4915832e9037d9f48991e53ca3 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 15 Jul 2017 12:33:04 -0500 Subject: [PATCH 28/54] DOC: whatsnew 0.21.0 edits --- doc/source/whatsnew/v0.21.0.txt | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 6ddf6029b99bb..34095d55b8cc9 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -31,15 +31,15 @@ Other Enhancements ^^^^^^^^^^^^^^^^^^ - The ``validate`` argument for :func:`merge` function now checks whether a merge is one-to-one, one-to-many, many-to-one, or many-to-many. If a merge is found to not be an example of specified merge type, an exception of type ``MergeError`` will be raised. For more, see :ref:`here ` (:issue:`16270`) -- ``Series.to_dict()`` and ``DataFrame.to_dict()`` now support an ``into`` keyword which allows you to specify the ``collections.Mapping`` subclass that you would like returned. The default is ``dict``, which is backwards compatible. (:issue:`16122`) -- ``RangeIndex.append`` now returns a ``RangeIndex`` object when possible (:issue:`16212`) -- ``Series.rename_axis()`` and ``DataFrame.rename_axis()`` with ``inplace=True`` now return ``None`` while renaming the axis inplace. (:issue:`15704`) -- :func:`to_pickle` has gained a ``protocol`` parameter (:issue:`16252`). By default, this parameter is set to `HIGHEST_PROTOCOL `__ +- :func:`Series.to_dict` and :func:`DataFrame.to_dict` now support an ``into`` keyword which allows you to specify the ``collections.Mapping`` subclass that you would like returned. The default is ``dict``, which is backwards compatible. (:issue:`16122`) +- :func:`RangeIndex.append` now returns a ``RangeIndex`` object when possible (:issue:`16212`) +- :func:`Series.rename_axis` and :func:`DataFrame.rename_axis` with ``inplace=True`` now return ``None`` while renaming the axis inplace. (:issue:`15704`) +- :func:`Series.to_pickle` and :func:`DataFrame.to_pickle` have gained a ``protocol`` parameter (:issue:`16252`). By default, this parameter is set to `HIGHEST_PROTOCOL `__ - :func:`api.types.infer_dtype` now infers decimals. (:issue:`15690`) - :func:`read_feather` has gained the ``nthreads`` parameter for multi-threaded operations (:issue:`16359`) - :func:`DataFrame.clip()` and :func:`Series.clip()` have gained an ``inplace`` argument. (:issue:`15388`) - :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when ``margins=True``. (:issue:`15972`) -- :func:`Dataframe.select_dtypes` now accepts scalar values for include/exclude as well as list-like. (:issue:`16855`) +- :func:`DataFrame.select_dtypes` now accepts scalar values for include/exclude as well as list-like. (:issue:`16855`) .. 
_whatsnew_0210.api_breaking: @@ -92,9 +92,14 @@ the target. Now, a ``ValueError`` will be raised when such an input is passed in ... ValueError: Cannot operate inplace if there is no assignment +.. _whatsnew_0210.api: + +Other API Changes +^^^^^^^^^^^^^^^^^ + - Support has been dropped for Python 3.4 (:issue:`15251`) - The Categorical constructor no longer accepts a scalar for the ``categories`` keyword. (:issue:`16022`) -- Accessing a non-existent attribute on a closed :class:`HDFStore` will now +- Accessing a non-existent attribute on a closed :class:`~pandas.HDFStore` will now raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`) - :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`) - :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`) @@ -102,12 +107,6 @@ the target. Now, a ``ValueError`` will be raised when such an input is passed in - Compression defaults in HDF stores now follow pytable standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`) - ``Index.get_indexer_non_unique()`` now returns a ndarray indexer rather than an ``Index``; this is consistent with ``Index.get_indexer()`` (:issue:`16819`) - Removed the ``@slow`` decorator from ``pandas.util.testing``, which caused issues for some downstream packages' test suites. Use ``@pytest.mark.slow`` instead, which achieves the same thing (:issue:`16850`) - -.. _whatsnew_0210.api: - -Other API Changes -^^^^^^^^^^^^^^^^^ - - Moved definition of ``MergeError`` to the ``pandas.errors`` module. @@ -127,7 +126,7 @@ Removal of prior version deprecations/changes - The ``pd.options.display.mpl_style`` configuration has been dropped (:issue:`12190`) - ``Index`` has dropped the ``.sym_diff()`` method in favor of ``.symmetric_difference()`` (:issue:`12591`) - ``Categorical`` has dropped the ``.order()`` and ``.sort()`` methods in favor of ``.sort_values()`` (:issue:`12882`) -- :func:`eval` and :method:`DataFrame.eval` have changed the default of ``inplace`` from ``None`` to ``False`` (:issue:`11149`) +- :func:`eval` and :func:`DataFrame.eval` have changed the default of ``inplace`` from ``None`` to ``False`` (:issue:`11149`) - The function ``get_offset_name`` has been dropped in favor of the ``.freqstr`` attribute for an offset (:issue:`11834`) From 3955261c04d5b838488a45fe7b186399bcdca137 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 15 Jul 2017 13:07:35 -0500 Subject: [PATCH 29/54] CI: disable codecov reporting --- codecov.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/codecov.yml b/codecov.yml index b4552563deeaa..512bc2e82a736 100644 --- a/codecov.yml +++ b/codecov.yml @@ -5,7 +5,9 @@ coverage: status: project: default: + enabled: no target: '82' patch: default: + enabled: no target: '50' From 96168ef698ac8bbccba251258ee66958359b11bf Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 15 Jul 2017 12:14:34 -0700 Subject: [PATCH 30/54] MAINT: Move series.remove_na to core.dtypes.missing.remove_na_arraylike Closes gh-16935 --- pandas/core/dtypes/missing.py | 7 +++++++ pandas/core/series.py | 11 ++--------- pandas/plotting/_core.py | 12 ++++++------ pandas/tests/test_panel.py | 4 ++-- pandas/tests/test_panel4d.py | 4 ++-- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index af3a873bc2866..9913923cb7807 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py 
@@ -394,3 +394,10 @@ def na_value_for_dtype(dtype): elif is_bool_dtype(dtype): return False return np.nan + + +def remove_na_arraylike(arr): + """ + Return array-like containing only true/non-NaN values, possibly empty. + """ + return arr[notnull(lib.values_from_object(arr))] diff --git a/pandas/core/series.py b/pandas/core/series.py index e1f668dd3afda..98b548f8ab3b5 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -36,7 +36,7 @@ maybe_upcast, infer_dtype_from_scalar, maybe_convert_platform, maybe_cast_to_datetime, maybe_castable) -from pandas.core.dtypes.missing import isnull, notnull +from pandas.core.dtypes.missing import isnull, notnull, remove_na_arraylike from pandas.core.common import (is_bool_indexer, _default_index, @@ -2749,7 +2749,7 @@ def dropna(self, axis=0, inplace=False, **kwargs): axis = self._get_axis_number(axis or 0) if self._can_hold_na: - result = remove_na(self) + result = remove_na_arraylike(self) if inplace: self._update_inplace(result) else: @@ -2888,13 +2888,6 @@ def _dir_additions(self): # Supplementary functions -def remove_na(series): - """ - Return series containing only true/non-NaN values, possibly empty. - """ - return series[notnull(_values_from_object(series))] - - def _sanitize_index(data, index, copy=False): """ sanitize an index type to return an ndarray of the underlying, pass thru a non-Index diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index f8e83aea03594..9cceebb5c4cdb 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -11,7 +11,7 @@ from pandas.util._decorators import cache_readonly from pandas.core.base import PandasObject -from pandas.core.dtypes.missing import notnull +from pandas.core.dtypes.missing import notnull, remove_na_arraylike from pandas.core.dtypes.common import ( is_list_like, is_integer, @@ -21,7 +21,7 @@ from pandas.core.common import AbstractMethodError, isnull, _try_sort from pandas.core.generic import _shared_docs, _shared_doc_kwargs from pandas.core.index import Index, MultiIndex -from pandas.core.series import Series, remove_na +from pandas.core.series import Series from pandas.core.indexes.period import PeriodIndex from pandas.compat import range, lrange, map, zip, string_types import pandas.compat as compat @@ -1376,7 +1376,7 @@ def _plot(cls, ax, y, style=None, bw_method=None, ind=None, from scipy.stats import gaussian_kde from scipy import __version__ as spv - y = remove_na(y) + y = remove_na_arraylike(y) if LooseVersion(spv) >= '0.11.0': gkde = gaussian_kde(y, bw_method=bw_method) @@ -1495,13 +1495,13 @@ def _args_adjust(self): @classmethod def _plot(cls, ax, y, column_num=None, return_type='axes', **kwds): if y.ndim == 2: - y = [remove_na(v) for v in y] + y = [remove_na_arraylike(v) for v in y] # Boxplot fails with empty arrays, so need to add a NaN # if any cols are empty # GH 8181 y = [v if v.size > 0 else np.array([np.nan]) for v in y] else: - y = remove_na(y) + y = remove_na_arraylike(y) bp = ax.boxplot(y, **kwds) if return_type == 'dict': @@ -1969,7 +1969,7 @@ def maybe_color_bp(bp): def plot_group(keys, values, ax): keys = [pprint_thing(x) for x in keys] - values = [remove_na(v) for v in values] + values = [remove_na_arraylike(v) for v in values] bp = ax.boxplot(values, **kwds) if fontsize is not None: ax.tick_params(axis='both', labelsize=fontsize) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index e19e42e062932..445611c1696f5 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -10,11 +10,11 @@ import pandas 
as pd from pandas.core.dtypes.common import is_float_dtype +from pandas.core.dtypes.missing import remove_na_arraylike from pandas import (Series, DataFrame, Index, date_range, isnull, notnull, pivot, MultiIndex) from pandas.core.nanops import nanall, nanany from pandas.core.panel import Panel -from pandas.core.series import remove_na from pandas.io.formats.printing import pprint_thing from pandas import compat @@ -155,7 +155,7 @@ def _check_stat_op(self, name, alternative, obj=None, has_skipna=True): if has_skipna: def skipna_wrapper(x): - nona = remove_na(x) + nona = remove_na_arraylike(x) if len(nona) == 0: return np.nan return alternative(nona) diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index e1995316e7b7c..18643aff15e9b 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -7,10 +7,10 @@ import numpy as np from pandas.core.dtypes.common import is_float_dtype +from pandas.core.dtypes.missing import remove_na_arraylike from pandas import Series, Index, isnull, notnull from pandas.core.panel import Panel from pandas.core.panel4d import Panel4D -from pandas.core.series import remove_na from pandas.tseries.offsets import BDay from pandas.util.testing import (assert_frame_equal, assert_series_equal, @@ -118,7 +118,7 @@ def _check_stat_op(self, name, alternative, obj=None, has_skipna=True): if has_skipna: def skipna_wrapper(x): - nona = remove_na(x) + nona = remove_na_arraylike(x) if len(nona) == 0: return np.nan return alternative(nona) From 2cd85ca748f62d7430b30e2d9ddd036e972cc64e Mon Sep 17 00:00:00 2001 From: Alan Velasco Date: Sat, 15 Jul 2017 16:28:23 -0500 Subject: [PATCH 31/54] Support non unique period indexes on join and merge operations (#16949) * Support non unique period indexes on join and merge operations * Add frame assertion on tests and release notes * Explicitly use dtype int64 on arange --- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/core/indexes/base.py | 4 ++-- pandas/tests/reshape/test_join.py | 12 ++++++++++++ pandas/tests/reshape/test_merge.py | 12 ++++++++++++ 4 files changed, 27 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 34095d55b8cc9..11d3e4cf964aa 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -184,7 +184,7 @@ Sparse Reshaping ^^^^^^^^^ - +- Joining/Merging with a non unique ``PeriodIndex`` raised a TypeError (:issue:`16871`) Numeric diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e1053c1610175..bbbc19b36964d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3119,14 +3119,14 @@ def _join_multi(self, other, how, return_indexers=True): def _join_non_unique(self, other, how='left', return_indexers=False): from pandas.core.reshape.merge import _get_join_indexers - left_idx, right_idx = _get_join_indexers([self.values], + left_idx, right_idx = _get_join_indexers([self._values], [other._values], how=how, sort=True) left_idx = _ensure_platform_int(left_idx) right_idx = _ensure_platform_int(right_idx) - join_index = np.asarray(self.values.take(left_idx)) + join_index = np.asarray(self._values.take(left_idx)) mask = left_idx == -1 np.putmask(join_index, mask, other._values.take(right_idx)) diff --git a/pandas/tests/reshape/test_join.py b/pandas/tests/reshape/test_join.py index e25661fb65271..e4894307918c6 100644 --- a/pandas/tests/reshape/test_join.py +++ b/pandas/tests/reshape/test_join.py @@ -550,6 +550,18 @@ def 
test_join_mixed_non_unique_index(self): index=[1, 2, 2, 'a']) tm.assert_frame_equal(result, expected) + def test_join_non_unique_period_index(self): + # GH #16871 + index = pd.period_range('2016-01-01', periods=16, freq='M') + df = DataFrame([i for i in range(len(index))], + index=index, columns=['pnum']) + df2 = concat([df, df]) + result = df.join(df2, how='inner', rsuffix='_df2') + expected = DataFrame( + np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2), + columns=['pnum', 'pnum_df2'], index=df2.sort_index().index) + tm.assert_frame_equal(result, expected) + def test_mixed_type_join_with_suffix(self): # GH #916 df = DataFrame(np.random.randn(20, 6), diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index 4ac376a9752cb..919675188576e 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -585,6 +585,18 @@ def test_merge_on_datetime64tz(self): assert result['value_x'].dtype == 'datetime64[ns, US/Eastern]' assert result['value_y'].dtype == 'datetime64[ns, US/Eastern]' + def test_merge_non_unique_period_index(self): + # GH #16871 + index = pd.period_range('2016-01-01', periods=16, freq='M') + df = DataFrame([i for i in range(len(index))], + index=index, columns=['pnum']) + df2 = concat([df, df]) + result = df.merge(df2, left_index=True, right_index=True, how='inner') + expected = DataFrame( + np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2), + columns=['pnum_x', 'pnum_y'], index=df2.sort_index().index) + tm.assert_frame_equal(result, expected) + def test_merge_on_periods(self): left = pd.DataFrame({'key': pd.period_range('20151010', periods=2, freq='D'), From 8e3d8315d63f61c1cc7a0ea9ad24cdd63b63f6b8 Mon Sep 17 00:00:00 2001 From: Morgan243 Date: Sat, 15 Jul 2017 19:13:49 -0400 Subject: [PATCH 32/54] BUG: Set secondary axis font size for `secondary_y` during plotting The parameter was not being respected for `secondary_y`. 
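A sketch of the user-visible effect (assuming a working matplotlib backend; the frame here is illustrative, not taken from the test suite):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(15, 2), columns=['A', 'B'])
    # Before this fix, ``fontsize`` was applied to the left axis only;
    # tick labels on the right (secondary) axis kept the default size.
    ax = df.plot(secondary_y=['B'], fontsize=20)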
Closes gh-12565 --- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/plotting/_core.py | 9 +++++++++ pandas/tests/plotting/test_frame.py | 17 +++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 11d3e4cf964aa..df53c4a3d6caf 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -166,7 +166,7 @@ I/O Plotting ^^^^^^^^ - +- Bug in plotting methods using ``secondary_y`` and ``fontsize`` not setting secondary axis font size (:issue:`12565`) Groupby/Resample/Rolling diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 9cceebb5c4cdb..a623288efc1ae 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -379,6 +379,11 @@ def _post_plot_logic_common(self, ax, data): self._apply_axis_properties(ax.xaxis, rot=self.rot, fontsize=self.fontsize) self._apply_axis_properties(ax.yaxis, fontsize=self.fontsize) + + if hasattr(ax, 'right_ax'): + self._apply_axis_properties(ax.right_ax.yaxis, + fontsize=self.fontsize) + elif self.orientation == 'horizontal': if self._need_to_set_index: yticklabels = [labels.get(y, '') for y in ax.get_yticks()] @@ -386,6 +391,10 @@ def _post_plot_logic_common(self, ax, data): self._apply_axis_properties(ax.yaxis, rot=self.rot, fontsize=self.fontsize) self._apply_axis_properties(ax.xaxis, fontsize=self.fontsize) + + if hasattr(ax, 'right_ax'): + self._apply_axis_properties(ax.right_ax.yaxis, + fontsize=self.fontsize) else: # pragma no cover raise ValueError diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 7878740f64e55..6d813ac76cc4e 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -2733,6 +2733,23 @@ def test_rcParams_bar_colors(self): barplot = pd.DataFrame([[1, 2, 3]]).plot(kind="bar") assert color_tuples == [c.get_facecolor() for c in barplot.patches] + @pytest.mark.parametrize('method', ['line', 'barh', 'bar']) + def test_secondary_axis_font_size(self, method): + # GH: 12565 + df = (pd.DataFrame(np.random.randn(15, 2), + columns=list('AB')) + .assign(C=lambda df: df.B.cumsum()) + .assign(D=lambda df: df.C * 1.1)) + + fontsize = 20 + sy = ['C', 'D'] + + kwargs = dict(secondary_y=sy, fontsize=fontsize, + mark_right=True) + ax = getattr(df.plot, method)(**kwargs) + self._check_ticks_props(axes=ax.right_ax, + ylabelsize=fontsize) + def _generate_4_axes_via_gridspec(): import matplotlib.pyplot as plt From 4f04d0be1fe22dabaff6c0eeb6162bffb763af46 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 15 Jul 2017 16:04:03 -0500 Subject: [PATCH 33/54] DOC: more whatsnew fixes --- doc/source/whatsnew/v0.21.0.txt | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index df53c4a3d6caf..a5d4259480ba8 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -46,11 +46,11 @@ Other Enhancements Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. _whatsnew_0210.api_breaking.pandas_eval: + Improved error handling during item assignment in pd.eval ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. _whatsnew_0210.api_breaking.pandas_eval: - :func:`eval` will now raise a ``ValueError`` when item assignment malfunctions, or inplace operations are specified, but there is no item assignment in the expression (:issue:`16732`) @@ -154,8 +154,8 @@ Indexing - When called with a null slice (e.g. 
``df.iloc[:]``), the ``.iloc`` and ``.loc`` indexers return a shallow copy of the original object. Previously they returned the original object. (:issue:`13873`). - When called on an unsorted ``MultiIndex``, the ``loc`` indexer now will raise ``UnsortedIndexError`` only if proper slicing is used on non-sorted levels (:issue:`16734`). - Fixes regression in 0.20.3 when indexing with a string on a ``TimedeltaIndex`` (:issue:`16896`). -- Fixed ``TimedeltaIndex.get_loc`` handling of ``np.timedelta64`` inputs (:issue:`16909`). -- Fix :meth:`MultiIndex.sort_index` ordering when ``ascending`` argument is a list, but not all levels are specified, or are in a different order (:issue:`16934`). +- Fixed :func:`TimedeltaIndex.get_loc` handling of ``np.timedelta64`` inputs (:issue:`16909`). +- Fix :func:`MultiIndex.sort_index` ordering when ``ascending`` argument is a list, but not all levels are specified, or are in a different order (:issue:`16934`). I/O ^^^ @@ -172,9 +172,9 @@ Plotting Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug in ``DataFrame.resample().size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`) -- Bug in ``infer_freq`` causing indices with 2-day gaps during the working week to be wrongly inferred as business daily (:issue:`16624`) -- Bug in ``.rolling.quantile()`` which incorrectly used different defaults than :func:`Series.quantile()` and :func:`DataFrame.quantile()` (:issue:`9413`, :issue:`16211`) +- Bug in ``DataFrame.resample(...).size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`) +- Bug in :func:`infer_freq` causing indices with 2-day gaps during the working week to be wrongly inferred as business daily (:issue:`16624`) +- Bug in ``.rolling(...).quantile()`` which incorrectly used different defaults than :func:`Series.quantile()` and :func:`DataFrame.quantile()` (:issue:`9413`, :issue:`16211`) Sparse @@ -194,7 +194,7 @@ Numeric Categorical ^^^^^^^^^^^ -- Bug in ``:func:Series.isin()`` when called with a categorical (:issue`16639`) +- Bug in :func:`Series.isin` when called with a categorical (:issue`16639`) Other From 1212fe034b7302f40bf253aedd9e3989514eeb52 Mon Sep 17 00:00:00 2001 From: aernlund Date: Sat, 15 Jul 2017 18:43:02 -0500 Subject: [PATCH 34/54] DOC: Reset index examples closes #16416 Author: aernlund Closes #16967 from aernlund/reset_index_docs and squashes the following commits: 3c6a4b6 [aernlund] DOC: added examples to reset_index 4838155 [aernlund] DOC: added examples to reset_index 2a51e2b [aernlund] DOC: added examples to reset_index --- pandas/core/frame.py | 32 ++++++++++++++++++++++++++++++++ pandas/core/series.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9920ddf854850..9a79ca1d4eab1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3020,6 +3020,38 @@ def reset_index(self, level=None, drop=False, inplace=False, col_level=0, Returns ------- resetted : DataFrame + + Examples + -------- + >>> df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]}, + ... index=pd.Index(['a', 'b', 'c', 'd'], + ... name='idx')) + >>> df.reset_index() + idx a b + 0 a 1 5 + 1 b 2 6 + 2 c 3 7 + 3 d 4 8 + + >>> arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', + ... 'foo', 'qux', 'qux']), + ... np.array(['one', 'two', 'one', 'two', 'one', 'two', + ... 'one', 'two'])] + >>> df2 = pd.DataFrame( + ... np.random.randn(8, 4), + ... index=pd.MultiIndex.from_arrays(arrays, + ... 
names=['a', 'b'])) + >>> df2.reset_index(level='a') + a 0 1 2 3 + b + one bar -1.099413 0.291838 0.598198 0.162181 + two bar -0.312184 -0.119904 0.250360 0.364378 + one baz 0.713596 -0.490636 0.074967 -0.297857 + two baz 0.998397 0.524499 -2.228976 0.901155 + one foo 0.923204 0.920695 1.264488 1.476921 + two foo -1.566922 0.783278 -0.073656 0.266027 + one qux -0.230470 0.109800 -1.383409 0.048421 + two qux -0.865993 -0.865984 0.705367 -0.170446 """ inplace = validate_bool_kwarg(inplace, 'inplace') if inplace: diff --git a/pandas/core/series.py b/pandas/core/series.py index 98b548f8ab3b5..4d5b718ce0ae9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -948,6 +948,37 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): Returns ---------- resetted : DataFrame, or Series if drop == True + + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4], index=pd.Index(['a', 'b', 'c', 'd'], + ... name='idx')) + >>> s.reset_index() + idx 0 + 0 a 1 + 1 b 2 + 2 c 3 + 3 d 4 + + >>> arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', + ... 'foo', 'qux', 'qux']), + ... np.array(['one', 'two', 'one', 'two', 'one', 'two', + ... 'one', 'two'])] + >>> s2 = pd.Series( + ... np.random.randn(8), + ... index=pd.MultiIndex.from_arrays(arrays, + ... names=['a', 'b'])) + >>> s2.reset_index(level='a') + a 0 + b + one bar -0.286320 + two bar -0.587934 + one baz 0.710491 + two baz -1.429006 + one foo 0.790700 + two foo 0.824863 + one qux -0.718963 + two qux -0.055028 """ inplace = validate_bool_kwarg(inplace, 'inplace') if drop: From 3524edb82e7945998876591813b7e77fe620ce36 Mon Sep 17 00:00:00 2001 From: Daniel Chen Date: Sat, 15 Jul 2017 18:49:08 -0500 Subject: [PATCH 35/54] channel from pandas to conda-forge (#16966) --- doc/source/contributing.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index bfcf560565977..b44d0f36b86a1 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -171,7 +171,7 @@ other dependencies, you can install them as follows:: To install *all* pandas dependencies you can do the following:: - conda install -n pandas_dev -c pandas --file ci/requirements_all.txt + conda install -n pandas_dev -c conda-forge --file ci/requirements_all.txt To work in this environment, Windows users should ``activate`` it as follows:: From 53ae390f442e745503745e5fa8ed7b06b72fd102 Mon Sep 17 00:00:00 2001 From: Prasanjit Prakash Date: Sun, 16 Jul 2017 06:42:57 +0530 Subject: [PATCH 36/54] BUG: coercing of bools in groupby transform (#16895) --- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/core/dtypes/cast.py | 7 ++++--- pandas/tests/dtypes/test_cast.py | 8 +++++++- pandas/tests/groupby/test_transform.py | 13 +++++++++++++ 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index a5d4259480ba8..762107a261090 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -175,7 +175,7 @@ Groupby/Resample/Rolling - Bug in ``DataFrame.resample(...).size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`) - Bug in :func:`infer_freq` causing indices with 2-day gaps during the working week to be wrongly inferred as business daily (:issue:`16624`) - Bug in ``.rolling(...).quantile()`` which incorrectly used different defaults than :func:`Series.quantile()` and :func:`DataFrame.quantile()` (:issue:`9413`, :issue:`16211`) - +- Bug in ``groupby.transform()`` that
would coerce boolean dtypes back to float (:issue:`16875`) Sparse ^^^^^^ diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 16b0a5c8a74ca..6532e17695c86 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -110,9 +110,7 @@ def trans(x): # noqa np.prod(result.shape)): return result - if issubclass(dtype.type, np.floating): - return result.astype(dtype) - elif is_bool_dtype(dtype) or is_integer_dtype(dtype): + if is_bool_dtype(dtype) or is_integer_dtype(dtype): # if we don't have any elements, just astype it if not np.prod(result.shape): @@ -144,6 +142,9 @@ def trans(x): # noqa # hit here if (new_result == result).all(): return new_result + elif (issubclass(dtype.type, np.floating) and + not is_bool_dtype(result.dtype)): + return result.astype(dtype) # a datetimelike # GH12821, iNaT is casted to float diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index 767e99d98cf29..6e07487b3e04f 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -9,7 +9,7 @@ from datetime import datetime, timedelta, date import numpy as np -from pandas import Timedelta, Timestamp, DatetimeIndex, DataFrame, NaT +from pandas import Timedelta, Timestamp, DatetimeIndex, DataFrame, NaT, Series from pandas.core.dtypes.cast import ( maybe_downcast_to_dtype, @@ -45,6 +45,12 @@ def test_downcast_conv(self): expected = np.array([8, 8, 8, 8, 9]) assert (np.array_equal(result, expected)) + # GH16875 coercing of bools + ser = Series([True, True, False]) + result = maybe_downcast_to_dtype(ser, np.dtype(np.float64)) + expected = ser + tm.assert_series_equal(result, expected) + # conversions expected = np.array([1, 2]) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 40434ff510421..98839a17d6e0c 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -195,6 +195,19 @@ def test_transform_bug(self): expected = Series(np.arange(5, 0, step=-1), name='B') assert_series_equal(result, expected) + def test_transform_numeric_to_boolean(self): + # GH 16875 + # inconsistency in transforming boolean values + expected = pd.Series([True, True], name='A') + + df = pd.DataFrame({'A': [1.1, 2.2], 'B': [1, 2]}) + result = df.groupby('B').A.transform(lambda x: True) + assert_series_equal(result, expected) + + df = pd.DataFrame({'A': [1, 2], 'B': [1, 2]}) + result = df.groupby('B').A.transform(lambda x: True) + assert_series_equal(result, expected) + def test_transform_datetime_to_timedelta(self): # GH 15429 # transforming a datetime to timedelta From 01a8be3578e9d0b2a66b8318c5477e3e6cfb75f2 Mon Sep 17 00:00:00 2001 From: Andrew Date: Sat, 15 Jul 2017 21:20:55 -0400 Subject: [PATCH 37/54] DOC: misspelling in DatetimeIndex.indexer_between_time [CI skip] (#16963) --- pandas/core/indexes/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index d8aae2367976b..e6bc1790f2992 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1882,7 +1882,7 @@ def indexer_between_time(self, start_time, end_time, include_start=True, Select values between particular times of day (e.g., 9:00-9:30AM). Return values of the index between two times. If start_time or - end_time are strings then tseres.tools.to_time is used to convert to + end_time are strings then tseries.tools.to_time is used to convert to a time object. 
Parameters From 148e038bfaf2a3893b52e28b6469cf5984eec794 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 15 Jul 2017 20:42:56 -0500 Subject: [PATCH 38/54] CLN: some residual code removed, xref to #16761 (#16974) --- pandas/core/config_init.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index e70db1d13e376..04563907582ee 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -255,18 +255,6 @@ def use_numexpr_cb(key): df.info() (the behaviour in earlier versions of pandas). """ -pc_mpl_style_doc = """ -: bool - Setting this to 'default' will modify the rcParams used by matplotlib - to give plots a more pleasing visual style by default. - Setting this to None/False restores the values to their initial value. -""" - -pc_mpl_style_deprecation_warning = """ -mpl_style had been deprecated and will be removed in a future version. -Use `matplotlib.pyplot.style.use` instead. -""" - pc_memory_usage_doc = """ : bool, string or None This specifies if the memory usage of a DataFrame should be displayed when From 9c096d29a1e9a68b8151de4896b0d9684383821a Mon Sep 17 00:00:00 2001 From: Iva Miholic Date: Sun, 16 Jul 2017 09:04:35 +0100 Subject: [PATCH 39/54] ENH: Create a 'Y' alias for date_range yearly frequency Closes gh-9313 --- pandas/tests/tseries/test_frequencies.py | 41 ++++++++++++++++-------- pandas/tseries/frequencies.py | 22 +++++++++++++ 2 files changed, 50 insertions(+), 13 deletions(-) diff --git a/pandas/tests/tseries/test_frequencies.py b/pandas/tests/tseries/test_frequencies.py index 54d12317b0bf8..4bcd0b49db7e0 100644 --- a/pandas/tests/tseries/test_frequencies.py +++ b/pandas/tests/tseries/test_frequencies.py @@ -248,9 +248,10 @@ def test_anchored_shortcuts(self): # ensure invalid cases fail as expected invalid_anchors = ['SM-0', 'SM-28', 'SM-29', - 'SM-FOO', 'BSM', 'SM--1' + 'SM-FOO', 'BSM', 'SM--1', 'SMS-1', 'SMS-28', 'SMS-30', - 'SMS-BAR', 'BSMS', 'SMS--2'] + 'SMS-BAR', 'SMS-BYR' 'BSMS', + 'SMS--2'] for invalid_anchor in invalid_anchors: with tm.assert_raises_regex(ValueError, 'Invalid frequency: '): @@ -292,11 +293,15 @@ def test_get_rule_month(): result = frequencies._get_rule_month('A-DEC') assert (result == 'DEC') + result = frequencies._get_rule_month('Y-DEC') + assert (result == 'DEC') result = frequencies._get_rule_month(offsets.YearEnd()) assert (result == 'DEC') result = frequencies._get_rule_month('A-MAY') assert (result == 'MAY') + result = frequencies._get_rule_month('Y-MAY') + assert (result == 'MAY') result = frequencies._get_rule_month(offsets.YearEnd(month=5)) assert (result == 'MAY') @@ -305,6 +310,10 @@ def test_period_str_to_code(): assert (frequencies._period_str_to_code('A') == 1000) assert (frequencies._period_str_to_code('A-DEC') == 1000) assert (frequencies._period_str_to_code('A-JAN') == 1001) + assert (frequencies._period_str_to_code('Y') == 1000) + assert (frequencies._period_str_to_code('Y-DEC') == 1000) + assert (frequencies._period_str_to_code('Y-JAN') == 1001) + assert (frequencies._period_str_to_code('Q') == 2000) assert (frequencies._period_str_to_code('Q-DEC') == 2000) assert (frequencies._period_str_to_code('Q-FEB') == 2002) @@ -349,6 +358,10 @@ def test_freq_code(self): assert frequencies.get_freq('3A') == 1000 assert frequencies.get_freq('-1A') == 1000 + assert frequencies.get_freq('Y') == 1000 + assert frequencies.get_freq('3Y') == 1000 + assert frequencies.get_freq('-1Y') == 1000 + assert frequencies.get_freq('W') == 4000 assert 
frequencies.get_freq('W-MON') == 4001 assert frequencies.get_freq('W-FRI') == 4005 @@ -369,6 +382,13 @@ def test_freq_group(self): assert frequencies.get_freq_group('-1A') == 1000 assert frequencies.get_freq_group('A-JAN') == 1000 assert frequencies.get_freq_group('A-MAY') == 1000 + + assert frequencies.get_freq_group('Y') == 1000 + assert frequencies.get_freq_group('3Y') == 1000 + assert frequencies.get_freq_group('-1Y') == 1000 + assert frequencies.get_freq_group('Y-JAN') == 1000 + assert frequencies.get_freq_group('Y-MAY') == 1000 + assert frequencies.get_freq_group(offsets.YearEnd()) == 1000 assert frequencies.get_freq_group(offsets.YearEnd(month=1)) == 1000 assert frequencies.get_freq_group(offsets.YearEnd(month=5)) == 1000 @@ -790,12 +810,6 @@ def test_series(self): for freq in [None, 'L']: s = Series(period_range('2013', periods=10, freq=freq)) pytest.raises(TypeError, lambda: frequencies.infer_freq(s)) - for freq in ['Y']: - - msg = frequencies._INVALID_FREQ_ERROR - with tm.assert_raises_regex(ValueError, msg): - s = Series(period_range('2013', periods=10, freq=freq)) - pytest.raises(TypeError, lambda: frequencies.infer_freq(s)) # DateTimeIndex for freq in ['M', 'L', 'S']: @@ -812,11 +826,12 @@ def test_legacy_offset_warnings(self): 'W@FRI', 'W@SAT', 'W@SUN', 'Q@JAN', 'Q@FEB', 'Q@MAR', 'A@JAN', 'A@FEB', 'A@MAR', 'A@APR', 'A@MAY', 'A@JUN', 'A@JUL', 'A@AUG', 'A@SEP', 'A@OCT', 'A@NOV', 'A@DEC', - 'WOM@1MON', 'WOM@2MON', 'WOM@3MON', 'WOM@4MON', - 'WOM@1TUE', 'WOM@2TUE', 'WOM@3TUE', 'WOM@4TUE', - 'WOM@1WED', 'WOM@2WED', 'WOM@3WED', 'WOM@4WED', - 'WOM@1THU', 'WOM@2THU', 'WOM@3THU', 'WOM@4THU' - 'WOM@1FRI', 'WOM@2FRI', 'WOM@3FRI', 'WOM@4FRI'] + 'Y@JAN', 'WOM@1MON', 'WOM@2MON', 'WOM@3MON', + 'WOM@4MON', 'WOM@1TUE', 'WOM@2TUE', 'WOM@3TUE', + 'WOM@4TUE', 'WOM@1WED', 'WOM@2WED', 'WOM@3WED', + 'WOM@4WED', 'WOM@1THU', 'WOM@2THU', 'WOM@3THU', + 'WOM@4THU', 'WOM@1FRI', 'WOM@2FRI', 'WOM@3FRI', + 'WOM@4FRI'] msg = frequencies._INVALID_FREQ_ERROR for freq in freqs: diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index c5f6c00a4005a..5c3c90520d1c3 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -422,6 +422,27 @@ def get_period_alias(offset_str): return _offset_to_period_map.get(offset_str, None) +_pure_alias = { + # 'A' is equivalent to 'Y'. + 'Y': 'A', + 'YS': 'AS', + 'BY': 'BA', + 'BYS': 'BAS', + 'Y-DEC': 'A-DEC', + 'Y-JAN': 'A-JAN', + 'Y-FEB': 'A-FEB', + 'Y-MAR': 'A-MAR', + 'Y-APR': 'A-APR', + 'Y-MAY': 'A-MAY', + 'Y-JUN': 'A-JUN', + 'Y-JUL': 'A-JUL', + 'Y-AUG': 'A-AUG', + 'Y-SEP': 'A-SEP', + 'Y-OCT': 'A-OCT', + 'Y-NOV': 'A-NOV', +} + + _lite_rule_alias = { 'W': 'W-SUN', 'Q': 'Q-DEC', @@ -718,6 +739,7 @@ def get_standard_freq(freq): def _period_str_to_code(freqstr): + freqstr = _pure_alias.get(freqstr, freqstr) freqstr = _lite_rule_alias.get(freqstr, freqstr) if freqstr not in _dont_uppercase: From 7ffe7fc21f3dc4ca444de9c83dbf61313b6986e2 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 16 Jul 2017 02:57:14 -0700 Subject: [PATCH 40/54] Revert "ENH: Create a 'Y' alias for date_range yearly frequency" (#16976) This reverts commit 9c096d29a1e9a68b8151de4896b0d9684383821a, as it was prematurely made. 
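For context, a sketch of the state restored by this revert (pandas as of this commit; illustrative only): annual frequencies are still spelled with 'A', and 'Y' remains rejected:

    import pandas as pd

    pd.period_range('2013', periods=3, freq='A')   # annual alias still works
    # pd.period_range('2013', periods=3, freq='Y') # raises ValueError: Invalid frequency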
--- pandas/tests/tseries/test_frequencies.py | 41 ++++++++---------------- pandas/tseries/frequencies.py | 22 ------------- 2 files changed, 13 insertions(+), 50 deletions(-) diff --git a/pandas/tests/tseries/test_frequencies.py b/pandas/tests/tseries/test_frequencies.py index 4bcd0b49db7e0..54d12317b0bf8 100644 --- a/pandas/tests/tseries/test_frequencies.py +++ b/pandas/tests/tseries/test_frequencies.py @@ -248,10 +248,9 @@ def test_anchored_shortcuts(self): # ensure invalid cases fail as expected invalid_anchors = ['SM-0', 'SM-28', 'SM-29', - 'SM-FOO', 'BSM', 'SM--1', + 'SM-FOO', 'BSM', 'SM--1' 'SMS-1', 'SMS-28', 'SMS-30', - 'SMS-BAR', 'SMS-BYR' 'BSMS', - 'SMS--2'] + 'SMS-BAR', 'BSMS', 'SMS--2'] for invalid_anchor in invalid_anchors: with tm.assert_raises_regex(ValueError, 'Invalid frequency: '): @@ -293,15 +292,11 @@ def test_get_rule_month(): result = frequencies._get_rule_month('A-DEC') assert (result == 'DEC') - result = frequencies._get_rule_month('Y-DEC') - assert (result == 'DEC') result = frequencies._get_rule_month(offsets.YearEnd()) assert (result == 'DEC') result = frequencies._get_rule_month('A-MAY') assert (result == 'MAY') - result = frequencies._get_rule_month('Y-MAY') - assert (result == 'MAY') result = frequencies._get_rule_month(offsets.YearEnd(month=5)) assert (result == 'MAY') @@ -310,10 +305,6 @@ def test_period_str_to_code(): assert (frequencies._period_str_to_code('A') == 1000) assert (frequencies._period_str_to_code('A-DEC') == 1000) assert (frequencies._period_str_to_code('A-JAN') == 1001) - assert (frequencies._period_str_to_code('Y') == 1000) - assert (frequencies._period_str_to_code('Y-DEC') == 1000) - assert (frequencies._period_str_to_code('Y-JAN') == 1001) - assert (frequencies._period_str_to_code('Q') == 2000) assert (frequencies._period_str_to_code('Q-DEC') == 2000) assert (frequencies._period_str_to_code('Q-FEB') == 2002) @@ -358,10 +349,6 @@ def test_freq_code(self): assert frequencies.get_freq('3A') == 1000 assert frequencies.get_freq('-1A') == 1000 - assert frequencies.get_freq('Y') == 1000 - assert frequencies.get_freq('3Y') == 1000 - assert frequencies.get_freq('-1Y') == 1000 - assert frequencies.get_freq('W') == 4000 assert frequencies.get_freq('W-MON') == 4001 assert frequencies.get_freq('W-FRI') == 4005 @@ -382,13 +369,6 @@ def test_freq_group(self): assert frequencies.get_freq_group('-1A') == 1000 assert frequencies.get_freq_group('A-JAN') == 1000 assert frequencies.get_freq_group('A-MAY') == 1000 - - assert frequencies.get_freq_group('Y') == 1000 - assert frequencies.get_freq_group('3Y') == 1000 - assert frequencies.get_freq_group('-1Y') == 1000 - assert frequencies.get_freq_group('Y-JAN') == 1000 - assert frequencies.get_freq_group('Y-MAY') == 1000 - assert frequencies.get_freq_group(offsets.YearEnd()) == 1000 assert frequencies.get_freq_group(offsets.YearEnd(month=1)) == 1000 assert frequencies.get_freq_group(offsets.YearEnd(month=5)) == 1000 @@ -810,6 +790,12 @@ def test_series(self): for freq in [None, 'L']: s = Series(period_range('2013', periods=10, freq=freq)) pytest.raises(TypeError, lambda: frequencies.infer_freq(s)) + for freq in ['Y']: + + msg = frequencies._INVALID_FREQ_ERROR + with tm.assert_raises_regex(ValueError, msg): + s = Series(period_range('2013', periods=10, freq=freq)) + pytest.raises(TypeError, lambda: frequencies.infer_freq(s)) # DateTimeIndex for freq in ['M', 'L', 'S']: @@ -826,12 +812,11 @@ def test_legacy_offset_warnings(self): 'W@FRI', 'W@SAT', 'W@SUN', 'Q@JAN', 'Q@FEB', 'Q@MAR', 'A@JAN', 'A@FEB', 'A@MAR', 'A@APR', 
'A@MAY', 'A@JUN', 'A@JUL', 'A@AUG', 'A@SEP', 'A@OCT', 'A@NOV', 'A@DEC', - 'Y@JAN', 'WOM@1MON', 'WOM@2MON', 'WOM@3MON', - 'WOM@4MON', 'WOM@1TUE', 'WOM@2TUE', 'WOM@3TUE', - 'WOM@4TUE', 'WOM@1WED', 'WOM@2WED', 'WOM@3WED', - 'WOM@4WED', 'WOM@1THU', 'WOM@2THU', 'WOM@3THU', - 'WOM@4THU', 'WOM@1FRI', 'WOM@2FRI', 'WOM@3FRI', - 'WOM@4FRI'] + 'WOM@1MON', 'WOM@2MON', 'WOM@3MON', 'WOM@4MON', + 'WOM@1TUE', 'WOM@2TUE', 'WOM@3TUE', 'WOM@4TUE', + 'WOM@1WED', 'WOM@2WED', 'WOM@3WED', 'WOM@4WED', + 'WOM@1THU', 'WOM@2THU', 'WOM@3THU', 'WOM@4THU' 'WOM@1FRI', 'WOM@2FRI', 'WOM@3FRI', 'WOM@4FRI'] msg = frequencies._INVALID_FREQ_ERROR for freq in freqs: diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 5c3c90520d1c3..c5f6c00a4005a 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -422,27 +422,6 @@ def get_period_alias(offset_str): return _offset_to_period_map.get(offset_str, None) -_pure_alias = { - # 'A' is equivalent to 'Y'. - 'Y': 'A', - 'YS': 'AS', - 'BY': 'BA', - 'BYS': 'BAS', - 'Y-DEC': 'A-DEC', - 'Y-JAN': 'A-JAN', - 'Y-FEB': 'A-FEB', - 'Y-MAR': 'A-MAR', - 'Y-APR': 'A-APR', - 'Y-MAY': 'A-MAY', - 'Y-JUN': 'A-JUN', - 'Y-JUL': 'A-JUL', - 'Y-AUG': 'A-AUG', - 'Y-SEP': 'A-SEP', - 'Y-OCT': 'A-OCT', - 'Y-NOV': 'A-NOV', -} - - _lite_rule_alias = { 'W': 'W-SUN', 'Q': 'Q-DEC', @@ -739,7 +718,6 @@ def get_standard_freq(freq): def _period_str_to_code(freqstr): - freqstr = _pure_alias.get(freqstr, freqstr) freqstr = _lite_rule_alias.get(freqstr, freqstr) if freqstr not in _dont_uppercase: From 1d1c03ef807b5ea3cd589b60ea578c88a0c1227c Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Sun, 16 Jul 2017 17:23:30 +0200 Subject: [PATCH 41/54] DOC: behavior when slicing with missing bounds (#16932) closes #16917 --- doc/source/indexing.rst | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index f988fb7cd6806..1659d57b33b84 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -78,8 +78,10 @@ of multi-axis indexing. *label* of the index. This use is **not** an integer position along the index) - A list or array of labels ``['a', 'b', 'c']`` - - A slice object with labels ``'a':'f'``, (note that contrary to usual python - slices, **both** the start and the stop are included!) + - A slice object with labels ``'a':'f'`` (note that contrary to usual python + slices, **both** the start and the stop are included, when present in the + index! - also see :ref:`Slicing with labels + <indexing.slicing_with_labels>`) - A boolean array - A ``callable`` function with one argument (the calling Series, DataFrame or Panel) and that returns valid output for indexing (one of the above) @@ -330,13 +332,16 @@ Selection By Label dfl.loc['20130102':'20130104'] pandas provides a suite of methods in order to have **purely label based indexing**. This is a strict inclusion based protocol. -**At least 1** of the labels for which you ask, must be in the index or a ``KeyError`` will be raised! When slicing, the start bound is *included*, **AND** the stop bound is *included*. Integers are valid labels, but they refer to the label **and not the position**. +**At least 1** of the labels for which you ask, must be in the index or a ``KeyError`` will be raised! When slicing, both the start bound **AND** the stop bound are *included*, if present in the index. Integers are valid labels, but they refer to the label **and not the position**. The ``.loc`` attribute is the primary access method.
The following are valid inputs: - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is interpreted as a *label* of the index. This use is **not** an integer position along the index) - A list or array of labels ``['a', 'b', 'c']`` -- A slice object with labels ``'a':'f'`` (note that contrary to usual python slices, **both** the start and the stop are included!) +- A slice object with labels ``'a':'f'`` (note that contrary to usual python + slices, **both** the start and the stop are included, when present in the + index! - also see :ref:`Slicing with labels + <indexing.slicing_with_labels>`) - A boolean array - A ``callable``, see :ref:`Selection By Callable <indexing.callable>` @@ -390,6 +395,34 @@ For getting a value explicitly (equiv to deprecated ``df.get_value('a','A')``) # this is also equivalent to ``df1.at['a','A']`` df1.loc['a', 'A'] +.. _indexing.slicing_with_labels: + +Slicing with labels +~~~~~~~~~~~~~~~~~~~ + +When using ``.loc`` with slices, if both the start and the stop labels are +present in the index, then elements *located* between the two (including them) +are returned: + +.. ipython:: python + + s = pd.Series(list('abcde'), index=[0,3,2,5,4]) + s.loc[3:5] + +If at least one of the two is absent, but the index is sorted, and can be +compared against start and stop labels, then slicing will still work as +expected, by selecting labels which *rank* between the two: + +.. ipython:: python + + s.sort_index() + s.sort_index().loc[1:6] + +However, if at least one of the two is absent *and* the index is not sorted, an +error will be raised (since doing otherwise would be computationally expensive, +as well as potentially ambiguous for mixed type indexes). For instance, in the +above example, ``s.loc[1:6]`` would raise ``KeyError``. + .. _indexing.integer: Selection By Position From 745c01265e31afb9048fe461dfd8c88ad2606702 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 16 Jul 2017 08:31:12 -0700 Subject: [PATCH 42/54] TST: Add test for sub-char in read_csv (#16977) Closes gh-16893.
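A sketch of the behavior under test, using an in-memory buffer instead of the new data file (assuming Python 3's io.StringIO):

    import pandas as pd
    from io import StringIO

    # "\x1a" is the ASCII SUB (Ctrl-Z) character that gh-16893 concerns;
    # it must not be treated as an end-of-file marker mid-stream.
    data = 'a,"\x1ab",c\n1,2,3'
    df = pd.read_csv(StringIO(data))
    print(df.columns.tolist())  # ['a', '\x1ab', 'c']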
--- pandas/tests/io/parser/common.py | 10 ++++++++++ pandas/tests/io/parser/data/sub_char.csv | 2 ++ 2 files changed, 12 insertions(+) create mode 100644 pandas/tests/io/parser/data/sub_char.csv diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index 584a6561b505b..4d1f9936af983 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -1677,6 +1677,16 @@ def test_internal_eof_byte_to_file(self): result = self.read_csv(path) tm.assert_frame_equal(result, expected) + def test_sub_character(self): + # see gh-16893 + dirpath = tm.get_data_path() + filename = os.path.join(dirpath, "sub_char.csv") + + expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"]) + result = self.read_csv(filename) + + tm.assert_frame_equal(result, expected) + def test_file_handles(self): # GH 14418 - don't close user provided file handles diff --git a/pandas/tests/io/parser/data/sub_char.csv b/pandas/tests/io/parser/data/sub_char.csv new file mode 100644 index 0000000000000..ff1fa777832c7 --- /dev/null +++ b/pandas/tests/io/parser/data/sub_char.csv @@ -0,0 +1,2 @@ +a,"b",c +1,2,3 \ No newline at end of file From cbd0354d024d6d45c67fceab69f908eb51339f70 Mon Sep 17 00:00:00 2001 From: rdk1024 Date: Sun, 16 Jul 2017 05:32:45 -1000 Subject: [PATCH 43/54] DEPR: deprecate html.border option (#16970) --- doc/source/options.rst | 2 +- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/config_init.py | 22 ++++++++++++++------ pandas/io/formats/format.py | 2 +- pandas/tests/io/formats/test_to_html.py | 7 ++++++- 5 files changed, 25 insertions(+), 9 deletions(-) diff --git a/doc/source/options.rst b/doc/source/options.rst index 6ff5b76014c95..f373705a96f48 100644 --- a/doc/source/options.rst +++ b/doc/source/options.rst @@ -400,7 +400,7 @@ display.width 80 Width of the display in charact display.html.table_schema False Whether to publish a Table Schema representation for frontends that support it. -html.border 1 A ``border=value`` attribute is +display.html.border 1 A ``border=value`` attribute is inserted in the ``<table>`` tag for the DataFrame HTML repr. io.excel.xls.writer xlwt The default Excel writer engine for diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 762107a261090..7c52cf6f450b2 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -116,6 +116,7 @@ Deprecations ~~~~~~~~~~~~ - :func:`read_excel()` has deprecated ``sheetname`` in favor of ``sheet_name`` for consistency with ``.to_excel()`` (:issue:`10559`). +- ``pd.options.html.border`` has been deprecated in favor of ``pd.options.display.html.border`` (:issue:`15793`). .. _whatsnew_0210.prior_deprecations: diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 04563907582ee..ae3001564a62f 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -202,6 +202,17 @@ def use_numexpr_cb(key): (default: False) """ +pc_html_border_doc = """ +: int + A ``border=value`` attribute is inserted in the ``
<table>`` tag + for the DataFrame HTML repr. +""" + +pc_html_border_deprecation_warning = """\ +html.border has been deprecated, use display.html.border instead +(currently both are identical) +""" + pc_line_width_deprecation_warning = """\ line_width has been deprecated, use display.width instead (currently both are identical) @@ -369,6 +380,8 @@ def table_schema_cb(key): validator=is_bool) cf.register_option('html.table_schema', False, pc_table_schema_doc, validator=is_bool, cb=table_schema_cb) + cf.register_option('html.border', 1, pc_html_border_doc, + validator=is_int) cf.deprecate_option('display.line_width', @@ -378,16 +391,13 @@ def table_schema_cb(key): cf.deprecate_option('display.height', msg=pc_height_deprecation_warning, rkey='display.max_rows') -pc_html_border_doc = """ -: int - A ``border=value`` attribute is inserted in the ``<table>
`` tag - for the DataFrame HTML repr. -""" - with cf.config_prefix('html'): cf.register_option('border', 1, pc_html_border_doc, validator=is_int) +cf.deprecate_option('html.border', msg=pc_html_border_deprecation_warning, + rkey='display.html.border') + tc_sim_interactive_doc = """ : boolean diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 0627ca9179509..23eb3bb05fd0a 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1064,7 +1064,7 @@ def __init__(self, formatter, classes=None, max_rows=None, max_cols=None, self.max_cols < len(self.fmt.columns)) self.notebook = notebook if border is None: - border = get_option('html.border') + border = get_option('display.html.border') self.border = border def write(self, s, indent=0): diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 9f4e532ec2287..1e174c34221d5 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -1401,7 +1401,7 @@ def test_to_html_border(self): def test_to_html_border_option(self): df = DataFrame({'A': [1, 2]}) - with pd.option_context('html.border', 0): + with pd.option_context('display.html.border', 0): result = df.to_html() assert 'border="0"' in result assert 'border="0"' in df._repr_html_() @@ -1411,6 +1411,11 @@ def test_to_html_border_zero(self): result = df.to_html(border=0) assert 'border="0"' in result + def test_display_option_warning(self): + with tm.assert_produces_warning(DeprecationWarning, + check_stacklevel=False): + pd.options.html.border + def test_to_html(self): # big mixed biggie = DataFrame({'A': np.random.randn(200), From 692b5eeeff9b8e8c750f3e64db0c39dc149a73e8 Mon Sep 17 00:00:00 2001 From: fding253 Date: Sun, 16 Jul 2017 10:55:33 -0500 Subject: [PATCH 44/54] DOC: document convention argument for resample() (#16965) * DOC: document convention argument for resample() --- pandas/core/generic.py | 43 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a4bb746722c1e..e4e2e0093b1a6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4826,6 +4826,8 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, label : {'right', 'left'} Which bin edge label to label bucket with convention : {'start', 'end', 's', 'e'} + For PeriodIndex only, controls whether to use the start or end of + `rule` loffset : timedelta Adjust the resampled time labels base : int, default 0 @@ -4946,6 +4948,47 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, 2000-01-01 00:06:00 26 Freq: 3T, dtype: int64 + For a Series with a PeriodIndex, the keyword `convention` can be + used to control whether to use the start or end of `rule`. + + >>> s = pd.Series([1, 2], index=pd.period_range('2012-01-01', + ... freq='A', + ... periods=2)) + >>> s + 2012 1 + 2013 2 + Freq: A-DEC, dtype: int64 + + Resample by month using 'start' `convention`. Values are assigned to + the first month of the period. + + >>> s.resample('M', convention='start').asfreq().head() + 2012-01 1.0 + 2012-02 NaN + 2012-03 NaN + 2012-04 NaN + 2012-05 NaN + Freq: M, dtype: float64 + + Resample by month using 'end' `convention`. Values are assigned to + the last month of the period.
+ + >>> s.resample('M', convention='end').asfreq() + 2012-12 1.0 + 2013-01 NaN + 2013-02 NaN + 2013-03 NaN + 2013-04 NaN + 2013-05 NaN + 2013-06 NaN + 2013-07 NaN + 2013-08 NaN + 2013-09 NaN + 2013-10 NaN + 2013-11 NaN + 2013-12 2.0 + Freq: M, dtype: float64 + For DataFrame objects, the keyword ``on`` can be used to specify the column instead of the index for resampling. From ea487fc9b197285f25b066450c46fc456db09e2a Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 16 Jul 2017 23:19:27 -0700 Subject: [PATCH 45/54] DOC: Clarify 'it' in aggregate doc (#16989) Closes gh-16988. --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e4e2e0093b1a6..f12592feaa4c3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3152,7 +3152,7 @@ def pipe(self, func, *args, **kwargs): (e.g., np.mean(arr_2d, axis=0)) as opposed to mimicking the default Numpy behavior (e.g., np.mean(arr_2d)). - agg is an alias for aggregate. Use it. + `agg` is an alias for `aggregate`. Use the alias. Returns ------- From ec927a47e472eebb5ba7086dcc15f3dda1c832cd Mon Sep 17 00:00:00 2001 From: cclauss Date: Mon, 17 Jul 2017 14:59:14 +0200 Subject: [PATCH 46/54] CLN/COMPAT: for various py2/py3 in doc/bench scripts (#16984) --- asv_bench/vbench_to_asv.py | 6 +- bench/alignment.py | 22 -- bench/bench_dense_to_sparse.py | 14 - bench/bench_get_put_value.py | 56 ---- bench/bench_groupby.py | 66 ----- bench/bench_join_panel.py | 85 ------ bench/bench_khash_dict.py | 89 ------ bench/bench_merge.R | 161 ---------- bench/bench_merge.py | 105 ------- bench/bench_merge_sqlite.py | 87 ------ bench/bench_pivot.R | 27 -- bench/bench_pivot.py | 16 - bench/bench_take_indexing.py | 55 ---- bench/bench_unique.py | 278 ------------------ bench/bench_with_subset.R | 53 ---- bench/bench_with_subset.py | 116 -------- bench/better_unique.py | 80 ----- bench/duplicated.R | 22 -- bench/io_roundtrip.py | 116 -------- bench/larry.py | 0 bench/serialize.py | 89 ------ bench/test.py | 70 ----- bench/zoo_bench.R | 71 ----- bench/zoo_bench.py | 36 --- doc/source/conf.py | 5 + .../ipython_sphinxext/ipython_directive.py | 4 +- scripts/find_commits_touching_func.py | 10 +- scripts/windows_builder/build_27-32.bat | 25 -- scripts/windows_builder/build_27-64.bat | 25 -- scripts/windows_builder/build_34-32.bat | 27 -- scripts/windows_builder/build_34-64.bat | 27 -- scripts/windows_builder/check_and_build.bat | 2 - scripts/windows_builder/check_and_build.py | 194 ------------ scripts/windows_builder/readme.txt | 17 -- 34 files changed, 14 insertions(+), 2042 deletions(-) delete mode 100644 bench/alignment.py delete mode 100644 bench/bench_dense_to_sparse.py delete mode 100644 bench/bench_get_put_value.py delete mode 100644 bench/bench_groupby.py delete mode 100644 bench/bench_join_panel.py delete mode 100644 bench/bench_khash_dict.py delete mode 100644 bench/bench_merge.R delete mode 100644 bench/bench_merge.py delete mode 100644 bench/bench_merge_sqlite.py delete mode 100644 bench/bench_pivot.R delete mode 100644 bench/bench_pivot.py delete mode 100644 bench/bench_take_indexing.py delete mode 100644 bench/bench_unique.py delete mode 100644 bench/bench_with_subset.R delete mode 100644 bench/bench_with_subset.py delete mode 100644 bench/better_unique.py delete mode 100644 bench/duplicated.R delete mode 100644 bench/io_roundtrip.py delete mode 100644 bench/larry.py delete mode 100644 bench/serialize.py delete mode 100644 bench/test.py delete mode 100644 
bench/zoo_bench.R delete mode 100644 bench/zoo_bench.py delete mode 100644 scripts/windows_builder/build_27-32.bat delete mode 100644 scripts/windows_builder/build_27-64.bat delete mode 100644 scripts/windows_builder/build_34-32.bat delete mode 100644 scripts/windows_builder/build_34-64.bat delete mode 100644 scripts/windows_builder/check_and_build.bat delete mode 100644 scripts/windows_builder/check_and_build.py delete mode 100644 scripts/windows_builder/readme.txt diff --git a/asv_bench/vbench_to_asv.py b/asv_bench/vbench_to_asv.py index c3041ec2b1ba1..2a4ce5d183ea2 100644 --- a/asv_bench/vbench_to_asv.py +++ b/asv_bench/vbench_to_asv.py @@ -114,7 +114,7 @@ def translate_module(target_module): l_vars = {} exec('import ' + target_module) in g_vars - print target_module + print(target_module) module = eval(target_module, g_vars) benchmarks = [] @@ -157,7 +157,7 @@ def translate_module(target_module): mod = os.path.basename(module) if mod in ['make.py', 'measure_memory_consumption.py', 'perf_HEAD.py', 'run_suite.py', 'test_perf.py', 'generate_rst_files.py', 'test.py', 'suite.py']: continue - print - print mod + print('') + print(mod) translate_module(mod.replace('.py', '')) diff --git a/bench/alignment.py b/bench/alignment.py deleted file mode 100644 index bc3134f597ee0..0000000000000 --- a/bench/alignment.py +++ /dev/null @@ -1,22 +0,0 @@ -# Setup -from pandas.compat import range, lrange -import numpy as np -import pandas -import la -N = 1000 -K = 50 -arr1 = np.random.randn(N, K) -arr2 = np.random.randn(N, K) -idx1 = lrange(N) -idx2 = lrange(K) - -# pandas -dma1 = pandas.DataFrame(arr1, idx1, idx2) -dma2 = pandas.DataFrame(arr2, idx1[::-1], idx2[::-1]) - -# larry -lar1 = la.larry(arr1, [idx1, idx2]) -lar2 = la.larry(arr2, [idx1[::-1], idx2[::-1]]) - -for i in range(100): - result = lar1 + lar2 diff --git a/bench/bench_dense_to_sparse.py b/bench/bench_dense_to_sparse.py deleted file mode 100644 index e1dcd3456e88d..0000000000000 --- a/bench/bench_dense_to_sparse.py +++ /dev/null @@ -1,14 +0,0 @@ -from pandas import * - -K = 100 -N = 100000 -rng = DatetimeIndex('1/1/2000', periods=N, offset=datetools.Minute()) - -rng2 = np.asarray(rng).astype('M8[us]').astype('i8') - -series = {} -for i in range(1, K + 1): - data = np.random.randn(N)[:-i] - this_rng = rng2[:-i] - data[100:] = np.nan - series[i] = SparseSeries(data, index=this_rng) diff --git a/bench/bench_get_put_value.py b/bench/bench_get_put_value.py deleted file mode 100644 index 427e0b1b10a22..0000000000000 --- a/bench/bench_get_put_value.py +++ /dev/null @@ -1,56 +0,0 @@ -from pandas import * -from pandas.util.testing import rands -from pandas.compat import range - -N = 1000 -K = 50 - - -def _random_index(howmany): - return Index([rands(10) for _ in range(howmany)]) - -df = DataFrame(np.random.randn(N, K), index=_random_index(N), - columns=_random_index(K)) - - -def get1(): - for col in df.columns: - for row in df.index: - _ = df[col][row] - - -def get2(): - for col in df.columns: - for row in df.index: - _ = df.get_value(row, col) - - -def put1(): - for col in df.columns: - for row in df.index: - df[col][row] = 0 - - -def put2(): - for col in df.columns: - for row in df.index: - df.set_value(row, col, 0) - - -def resize1(): - buf = DataFrame() - for col in df.columns: - for row in df.index: - buf = buf.set_value(row, col, 5.) - return buf - - -def resize2(): - from collections import defaultdict - - buf = defaultdict(dict) - for col in df.columns: - for row in df.index: - buf[col][row] = 5. 
- - return DataFrame(buf) diff --git a/bench/bench_groupby.py b/bench/bench_groupby.py deleted file mode 100644 index d7a2853e1e7b2..0000000000000 --- a/bench/bench_groupby.py +++ /dev/null @@ -1,66 +0,0 @@ -from pandas import * -from pandas.util.testing import rands -from pandas.compat import range - -import string -import random - -k = 20000 -n = 10 - -foo = np.tile(np.array([rands(10) for _ in range(k)], dtype='O'), n) -foo2 = list(foo) -random.shuffle(foo) -random.shuffle(foo2) - -df = DataFrame({'A': foo, - 'B': foo2, - 'C': np.random.randn(n * k)}) - -import pandas._sandbox as sbx - - -def f(): - table = sbx.StringHashTable(len(df)) - ret = table.factorize(df['A']) - return ret - - -def g(): - table = sbx.PyObjectHashTable(len(df)) - ret = table.factorize(df['A']) - return ret - -ret = f() - -""" -import pandas._tseries as lib - -f = np.std - - -grouped = df.groupby(['A', 'B']) - -label_list = [ping.labels for ping in grouped.groupings] -shape = [len(ping.ids) for ping in grouped.groupings] - -from pandas.core.groupby import get_group_index - - -group_index = get_group_index(label_list, shape, - sort=True, xnull=True).astype('i4') - -ngroups = np.prod(shape) - -indexer = lib.groupsort_indexer(group_index, ngroups) - -values = df['C'].values.take(indexer) -group_index = group_index.take(indexer) - -f = lambda x: x.std(ddof=1) - -grouper = lib.Grouper(df['C'], np.ndarray.std, group_index, ngroups) -result = grouper.get_result() - -expected = grouped.std() -""" diff --git a/bench/bench_join_panel.py b/bench/bench_join_panel.py deleted file mode 100644 index f3c3f8ba15f70..0000000000000 --- a/bench/bench_join_panel.py +++ /dev/null @@ -1,85 +0,0 @@ -# reasonably efficient - - -def create_panels_append(cls, panels): - """ return an append list of panels """ - panels = [a for a in panels if a is not None] - # corner cases - if len(panels) == 0: - return None - elif len(panels) == 1: - return panels[0] - elif len(panels) == 2 and panels[0] == panels[1]: - return panels[0] - # import pdb; pdb.set_trace() - # create a joint index for the axis - - def joint_index_for_axis(panels, axis): - s = set() - for p in panels: - s.update(list(getattr(p, axis))) - return sorted(list(s)) - - def reindex_on_axis(panels, axis, axis_reindex): - new_axis = joint_index_for_axis(panels, axis) - new_panels = [p.reindex(**{axis_reindex: new_axis, - 'copy': False}) for p in panels] - return new_panels, new_axis - # create the joint major index, dont' reindex the sub-panels - we are - # appending - major = joint_index_for_axis(panels, 'major_axis') - # reindex on minor axis - panels, minor = reindex_on_axis(panels, 'minor_axis', 'minor') - # reindex on items - panels, items = reindex_on_axis(panels, 'items', 'items') - # concatenate values - try: - values = np.concatenate([p.values for p in panels], axis=1) - except Exception as detail: - raise Exception("cannot append values that dont' match dimensions! 
-> [%s] %s" - % (','.join(["%s" % p for p in panels]), str(detail))) - # pm('append - create_panel') - p = Panel(values, items=items, major_axis=major, - minor_axis=minor) - # pm('append - done') - return p - - -# does the job but inefficient (better to handle like you read a table in -# pytables...e.g create a LongPanel then convert to Wide) -def create_panels_join(cls, panels): - """ given an array of panels's, create a single panel """ - panels = [a for a in panels if a is not None] - # corner cases - if len(panels) == 0: - return None - elif len(panels) == 1: - return panels[0] - elif len(panels) == 2 and panels[0] == panels[1]: - return panels[0] - d = dict() - minor, major, items = set(), set(), set() - for panel in panels: - items.update(panel.items) - major.update(panel.major_axis) - minor.update(panel.minor_axis) - values = panel.values - for item, item_index in panel.items.indexMap.items(): - for minor_i, minor_index in panel.minor_axis.indexMap.items(): - for major_i, major_index in panel.major_axis.indexMap.items(): - try: - d[(minor_i, major_i, item)] = values[item_index, major_index, minor_index] - except: - pass - # stack the values - minor = sorted(list(minor)) - major = sorted(list(major)) - items = sorted(list(items)) - # create the 3d stack (items x columns x indicies) - data = np.dstack([np.asarray([np.asarray([d.get((minor_i, major_i, item), np.nan) - for item in items]) - for major_i in major]).transpose() - for minor_i in minor]) - # construct the panel - return Panel(data, items, major, minor) -add_class_method(Panel, create_panels_join, 'join_many') diff --git a/bench/bench_khash_dict.py b/bench/bench_khash_dict.py deleted file mode 100644 index 054fc36131b65..0000000000000 --- a/bench/bench_khash_dict.py +++ /dev/null @@ -1,89 +0,0 @@ -""" -Some comparisons of khash.h to Python dict -""" -from __future__ import print_function - -import numpy as np -import os - -from vbench.api import Benchmark -from pandas.util.testing import rands -from pandas.compat import range -import pandas._tseries as lib -import pandas._sandbox as sbx -import time - -import psutil - -pid = os.getpid() -proc = psutil.Process(pid) - - -def object_test_data(n): - pass - - -def string_test_data(n): - return np.array([rands(10) for _ in range(n)], dtype='O') - - -def int_test_data(n): - return np.arange(n, dtype='i8') - -N = 1000000 - -#---------------------------------------------------------------------- -# Benchmark 1: map_locations - - -def map_locations_python_object(): - arr = string_test_data(N) - return _timeit(lambda: lib.map_indices_object(arr)) - - -def map_locations_khash_object(): - arr = string_test_data(N) - - def f(): - table = sbx.PyObjectHashTable(len(arr)) - table.map_locations(arr) - return _timeit(f) - - -def _timeit(f, iterations=10): - start = time.time() - for _ in range(iterations): - foo = f() - elapsed = time.time() - start - return elapsed - -#---------------------------------------------------------------------- -# Benchmark 2: lookup_locations - - -def lookup_python(values): - table = lib.map_indices_object(values) - return _timeit(lambda: lib.merge_indexer_object(values, table)) - - -def lookup_khash(values): - table = sbx.PyObjectHashTable(len(values)) - table.map_locations(values) - locs = table.lookup_locations(values) - # elapsed = _timeit(lambda: table.lookup_locations2(values)) - return table - - -def leak(values): - for _ in range(100): - print(proc.get_memory_info()) - table = lookup_khash(values) - # table.destroy() - -arr = string_test_data(N) - 
-#---------------------------------------------------------------------- -# Benchmark 3: unique - -#---------------------------------------------------------------------- -# Benchmark 4: factorize diff --git a/bench/bench_merge.R b/bench/bench_merge.R deleted file mode 100644 index 3ed4618494857..0000000000000 --- a/bench/bench_merge.R +++ /dev/null @@ -1,161 +0,0 @@ -library(plyr) -library(data.table) -N <- 10000 -indices = rep(NA, N) -indices2 = rep(NA, N) -for (i in 1:N) { - indices[i] <- paste(sample(letters, 10), collapse="") - indices2[i] <- paste(sample(letters, 10), collapse="") -} -left <- data.frame(key=rep(indices[1:8000], 10), - key2=rep(indices2[1:8000], 10), - value=rnorm(80000)) -right <- data.frame(key=indices[2001:10000], - key2=indices2[2001:10000], - value2=rnorm(8000)) - -right2 <- data.frame(key=rep(right$key, 2), - key2=rep(right$key2, 2), - value2=rnorm(16000)) - -left.dt <- data.table(left, key=c("key", "key2")) -right.dt <- data.table(right, key=c("key", "key2")) -right2.dt <- data.table(right2, key=c("key", "key2")) - -# left.dt2 <- data.table(left) -# right.dt2 <- data.table(right) - -## left <- data.frame(key=rep(indices[1:1000], 10), -## key2=rep(indices2[1:1000], 10), -## value=rnorm(100000)) -## right <- data.frame(key=indices[1:1000], -## key2=indices2[1:1000], -## value2=rnorm(10000)) - -timeit <- function(func, niter=10) { - timing = rep(NA, niter) - for (i in 1:niter) { - gc() - timing[i] <- system.time(func())[3] - } - mean(timing) -} - -left.join <- function(sort=FALSE) { - result <- base::merge(left, right, all.x=TRUE, sort=sort) -} - -right.join <- function(sort=FALSE) { - result <- base::merge(left, right, all.y=TRUE, sort=sort) -} - -outer.join <- function(sort=FALSE) { - result <- base::merge(left, right, all=TRUE, sort=sort) -} - -inner.join <- function(sort=FALSE) { - result <- base::merge(left, right, all=FALSE, sort=sort) -} - -left.join.dt <- function(sort=FALSE) { - result <- right.dt[left.dt] -} - -right.join.dt <- function(sort=FALSE) { - result <- left.dt[right.dt] -} - -outer.join.dt <- function(sort=FALSE) { - result <- merge(left.dt, right.dt, all=TRUE, sort=sort) -} - -inner.join.dt <- function(sort=FALSE) { - result <- merge(left.dt, right.dt, all=FALSE, sort=sort) -} - -plyr.join <- function(type) { - result <- plyr::join(left, right, by=c("key", "key2"), - type=type, match="first") -} - -sort.options <- c(FALSE, TRUE) - -# many-to-one - -results <- matrix(nrow=4, ncol=3) -colnames(results) <- c("base::merge", "plyr", "data.table") -rownames(results) <- c("inner", "outer", "left", "right") - -base.functions <- c(inner.join, outer.join, left.join, right.join) -plyr.functions <- c(function() plyr.join("inner"), - function() plyr.join("full"), - function() plyr.join("left"), - function() plyr.join("right")) -dt.functions <- c(inner.join.dt, outer.join.dt, left.join.dt, right.join.dt) -for (i in 1:4) { - base.func <- base.functions[[i]] - plyr.func <- plyr.functions[[i]] - dt.func <- dt.functions[[i]] - results[i, 1] <- timeit(base.func) - results[i, 2] <- timeit(plyr.func) - results[i, 3] <- timeit(dt.func) -} - - -# many-to-many - -left.join <- function(sort=FALSE) { - result <- base::merge(left, right2, all.x=TRUE, sort=sort) -} - -right.join <- function(sort=FALSE) { - result <- base::merge(left, right2, all.y=TRUE, sort=sort) -} - -outer.join <- function(sort=FALSE) { - result <- base::merge(left, right2, all=TRUE, sort=sort) -} - -inner.join <- function(sort=FALSE) { - result <- base::merge(left, right2, all=FALSE, sort=sort) -} - 
-left.join.dt <- function(sort=FALSE) { - result <- right2.dt[left.dt] -} - -right.join.dt <- function(sort=FALSE) { - result <- left.dt[right2.dt] -} - -outer.join.dt <- function(sort=FALSE) { - result <- merge(left.dt, right2.dt, all=TRUE, sort=sort) -} - -inner.join.dt <- function(sort=FALSE) { - result <- merge(left.dt, right2.dt, all=FALSE, sort=sort) -} - -sort.options <- c(FALSE, TRUE) - -# many-to-one - -results <- matrix(nrow=4, ncol=3) -colnames(results) <- c("base::merge", "plyr", "data.table") -rownames(results) <- c("inner", "outer", "left", "right") - -base.functions <- c(inner.join, outer.join, left.join, right.join) -plyr.functions <- c(function() plyr.join("inner"), - function() plyr.join("full"), - function() plyr.join("left"), - function() plyr.join("right")) -dt.functions <- c(inner.join.dt, outer.join.dt, left.join.dt, right.join.dt) -for (i in 1:4) { - base.func <- base.functions[[i]] - plyr.func <- plyr.functions[[i]] - dt.func <- dt.functions[[i]] - results[i, 1] <- timeit(base.func) - results[i, 2] <- timeit(plyr.func) - results[i, 3] <- timeit(dt.func) -} - diff --git a/bench/bench_merge.py b/bench/bench_merge.py deleted file mode 100644 index 330dba7b9af69..0000000000000 --- a/bench/bench_merge.py +++ /dev/null @@ -1,105 +0,0 @@ -import random -import gc -import time -from pandas import * -from pandas.compat import range, lrange, StringIO -from pandas.util.testing import rands - -N = 10000 -ngroups = 10 - - -def get_test_data(ngroups=100, n=N): - unique_groups = lrange(ngroups) - arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object) - - if len(arr) < n: - arr = np.asarray(list(arr) + unique_groups[:n - len(arr)], - dtype=object) - - random.shuffle(arr) - return arr - -# aggregate multiple columns -# df = DataFrame({'key1' : get_test_data(ngroups=ngroups), -# 'key2' : get_test_data(ngroups=ngroups), -# 'data1' : np.random.randn(N), -# 'data2' : np.random.randn(N)}) - -# df2 = DataFrame({'key1' : get_test_data(ngroups=ngroups, n=N//10), -# 'key2' : get_test_data(ngroups=ngroups//2, n=N//10), -# 'value' : np.random.randn(N // 10)}) -# result = merge.merge(df, df2, on='key2') - -N = 10000 - -indices = np.array([rands(10) for _ in range(N)], dtype='O') -indices2 = np.array([rands(10) for _ in range(N)], dtype='O') -key = np.tile(indices[:8000], 10) -key2 = np.tile(indices2[:8000], 10) - -left = DataFrame({'key': key, 'key2': key2, - 'value': np.random.randn(80000)}) -right = DataFrame({'key': indices[2000:], 'key2': indices2[2000:], - 'value2': np.random.randn(8000)}) - -right2 = right.append(right, ignore_index=True) - - -join_methods = ['inner', 'outer', 'left', 'right'] -results = DataFrame(index=join_methods, columns=[False, True]) -niter = 10 -for sort in [False, True]: - for join_method in join_methods: - f = lambda: merge(left, right, how=join_method, sort=sort) - gc.disable() - start = time.time() - for _ in range(niter): - f() - elapsed = (time.time() - start) / niter - gc.enable() - results[sort][join_method] = elapsed -# results.columns = ['pandas'] -results.columns = ['dont_sort', 'sort'] - - -# R results -# many to one -r_results = read_table(StringIO(""" base::merge plyr data.table -inner 0.2475 0.1183 0.1100 -outer 0.4213 0.1916 0.2090 -left 0.2998 0.1188 0.0572 -right 0.3102 0.0536 0.0376 -"""), sep='\s+') - -presults = results[['dont_sort']].rename(columns={'dont_sort': 'pandas'}) -all_results = presults.join(r_results) - -all_results = all_results.div(all_results['pandas'], axis=0) - -all_results = all_results.ix[:, ['pandas', 
'data.table', 'plyr', - 'base::merge']] - -sort_results = DataFrame.from_items([('pandas', results['sort']), - ('R', r_results['base::merge'])]) -sort_results['Ratio'] = sort_results['R'] / sort_results['pandas'] - - -nosort_results = DataFrame.from_items([('pandas', results['dont_sort']), - ('R', r_results['base::merge'])]) -nosort_results['Ratio'] = nosort_results['R'] / nosort_results['pandas'] - -# many to many - -# many to one -r_results = read_table(StringIO("""base::merge plyr data.table -inner 0.4610 0.1276 0.1269 -outer 0.9195 0.1881 0.2725 -left 0.6559 0.1257 0.0678 -right 0.6425 0.0522 0.0428 -"""), sep='\s+') - -all_results = presults.join(r_results) -all_results = all_results.div(all_results['pandas'], axis=0) -all_results = all_results.ix[:, ['pandas', 'data.table', 'plyr', - 'base::merge']] diff --git a/bench/bench_merge_sqlite.py b/bench/bench_merge_sqlite.py deleted file mode 100644 index 3ad4b810119c3..0000000000000 --- a/bench/bench_merge_sqlite.py +++ /dev/null @@ -1,87 +0,0 @@ -import numpy as np -from collections import defaultdict -import gc -import time -from pandas import DataFrame -from pandas.util.testing import rands -from pandas.compat import range, zip -import random - -N = 10000 - -indices = np.array([rands(10) for _ in range(N)], dtype='O') -indices2 = np.array([rands(10) for _ in range(N)], dtype='O') -key = np.tile(indices[:8000], 10) -key2 = np.tile(indices2[:8000], 10) - -left = DataFrame({'key': key, 'key2': key2, - 'value': np.random.randn(80000)}) -right = DataFrame({'key': indices[2000:], 'key2': indices2[2000:], - 'value2': np.random.randn(8000)}) - -# right2 = right.append(right, ignore_index=True) -# right = right2 - -# random.shuffle(key2) -# indices2 = indices.copy() -# random.shuffle(indices2) - -# Prepare Database -import sqlite3 -create_sql_indexes = True - -conn = sqlite3.connect(':memory:') -conn.execute( - 'create table left( key varchar(10), key2 varchar(10), value int);') -conn.execute( - 'create table right( key varchar(10), key2 varchar(10), value2 int);') -conn.executemany('insert into left values (?, ?, ?)', - zip(key, key2, left['value'])) -conn.executemany('insert into right values (?, ?, ?)', - zip(right['key'], right['key2'], right['value2'])) - -# Create Indices -if create_sql_indexes: - conn.execute('create index left_ix on left(key, key2)') - conn.execute('create index right_ix on right(key, key2)') - - -join_methods = ['inner', 'left outer', 'left'] # others not supported -sql_results = DataFrame(index=join_methods, columns=[False]) -niter = 5 -for sort in [False]: - for join_method in join_methods: - sql = """CREATE TABLE test as select * - from left - %s join right - on left.key=right.key - and left.key2 = right.key2;""" % join_method - sql = """select * - from left - %s join right - on left.key=right.key - and left.key2 = right.key2;""" % join_method - - if sort: - sql = '%s order by key, key2' % sql - f = lambda: list(conn.execute(sql)) # list fetches results - g = lambda: conn.execute(sql) # list fetches results - gc.disable() - start = time.time() - # for _ in range(niter): - g() - elapsed = (time.time() - start) / niter - gc.enable() - - cur = conn.execute("DROP TABLE test") - conn.commit() - - sql_results[sort][join_method] = elapsed - sql_results.columns = ['sqlite3'] # ['dont_sort', 'sort'] - sql_results.index = ['inner', 'outer', 'left'] - - sql = """select * - from left - inner join right - on left.key=right.key - and left.key2 = right.key2;""" diff --git a/bench/bench_pivot.R b/bench/bench_pivot.R deleted file 
mode 100644 index 06dc6a105bc43..0000000000000 --- a/bench/bench_pivot.R +++ /dev/null @@ -1,27 +0,0 @@ -library(reshape2) - - -n <- 100000 -a.size <- 5 -b.size <- 5 - -data <- data.frame(a=sample(letters[1:a.size], n, replace=T), - b=sample(letters[1:b.size], n, replace=T), - c=rnorm(n), - d=rnorm(n)) - -timings <- numeric() - -# acast(melt(data, id=c("a", "b")), a ~ b, mean) -# acast(melt(data, id=c("a", "b")), a + b ~ variable, mean) - -for (i in 1:10) { - gc() - tim <- system.time(acast(melt(data, id=c("a", "b")), a ~ b, mean, - subset=.(variable=="c"))) - timings[i] = tim[3] -} - -mean(timings) - -acast(melt(data, id=c("a", "b")), a ~ b, mean, subset=.(variable="c")) diff --git a/bench/bench_pivot.py b/bench/bench_pivot.py deleted file mode 100644 index 007bd0aaebc2f..0000000000000 --- a/bench/bench_pivot.py +++ /dev/null @@ -1,16 +0,0 @@ -from pandas import * -import string - - -n = 100000 -asize = 5 -bsize = 5 - -letters = np.asarray(list(string.letters), dtype=object) - -data = DataFrame(dict(foo=letters[:asize][np.random.randint(0, asize, n)], - bar=letters[:bsize][np.random.randint(0, bsize, n)], - baz=np.random.randn(n), - qux=np.random.randn(n))) - -table = pivot_table(data, xby=['foo', 'bar']) diff --git a/bench/bench_take_indexing.py b/bench/bench_take_indexing.py deleted file mode 100644 index 5fb584bcfe45f..0000000000000 --- a/bench/bench_take_indexing.py +++ /dev/null @@ -1,55 +0,0 @@ -from __future__ import print_function -import numpy as np - -from pandas import * -import pandas._tseries as lib - -from pandas import DataFrame -import timeit -from pandas.compat import zip - -setup = """ -from pandas import Series -import pandas._tseries as lib -import random -import numpy as np - -import random -n = %d -k = %d -arr = np.random.randn(n, k) -indexer = np.arange(n, dtype=np.int32) -indexer = indexer[::-1] -""" - -sizes = [100, 1000, 10000, 100000] -iters = [1000, 1000, 100, 1] - -fancy_2d = [] -take_2d = [] -cython_2d = [] - -n = 1000 - - -def _timeit(stmt, size, k=5, iters=1000): - timer = timeit.Timer(stmt=stmt, setup=setup % (sz, k)) - return timer.timeit(n) / n - -for sz, its in zip(sizes, iters): - print(sz) - fancy_2d.append(_timeit('arr[indexer]', sz, iters=its)) - take_2d.append(_timeit('arr.take(indexer, axis=0)', sz, iters=its)) - cython_2d.append(_timeit('lib.take_axis0(arr, indexer)', sz, iters=its)) - -df = DataFrame({'fancy': fancy_2d, - 'take': take_2d, - 'cython': cython_2d}) - -print(df) - -from pandas.rpy.common import r -r('mat <- matrix(rnorm(50000), nrow=10000, ncol=5)') -r('set.seed(12345') -r('indexer <- sample(1:10000)') -r('mat[indexer,]') diff --git a/bench/bench_unique.py b/bench/bench_unique.py deleted file mode 100644 index 87bd2f2df586c..0000000000000 --- a/bench/bench_unique.py +++ /dev/null @@ -1,278 +0,0 @@ -from __future__ import print_function -from pandas import * -from pandas.util.testing import rands -from pandas.compat import range, zip -import pandas._tseries as lib -import numpy as np -import matplotlib.pyplot as plt - -N = 50000 -K = 10000 - -groups = np.array([rands(10) for _ in range(K)], dtype='O') -groups2 = np.array([rands(10) for _ in range(K)], dtype='O') - -labels = np.tile(groups, N // K) -labels2 = np.tile(groups2, N // K) -data = np.random.randn(N) - - -def timeit(f, niter): - import gc - import time - gc.disable() - start = time.time() - for _ in range(niter): - f() - elapsed = (time.time() - start) / niter - gc.enable() - return elapsed - - -def algo1(): - unique_labels = np.unique(labels) - result = 
np.empty(len(unique_labels)) - for i, label in enumerate(unique_labels): - result[i] = data[labels == label].sum() - - -def algo2(): - unique_labels = np.unique(labels) - indices = lib.groupby_indices(labels) - result = np.empty(len(unique_labels)) - - for i, label in enumerate(unique_labels): - result[i] = data.take(indices[label]).sum() - - -def algo3_nosort(): - rizer = lib.DictFactorizer() - labs, counts = rizer.factorize(labels, sort=False) - k = len(rizer.uniques) - out = np.empty(k) - lib.group_add(out, counts, data, labs) - - -def algo3_sort(): - rizer = lib.DictFactorizer() - labs, counts = rizer.factorize(labels, sort=True) - k = len(rizer.uniques) - out = np.empty(k) - lib.group_add(out, counts, data, labs) - -import numpy as np -import random - - -# dict to hold results -counts = {} - -# a hack to generate random key, value pairs. -# 5k keys, 100k values -x = np.tile(np.arange(5000, dtype='O'), 20) -random.shuffle(x) -xarr = x -x = [int(y) for y in x] -data = np.random.uniform(0, 1, 100000) - - -def f(): - # groupby sum - for k, v in zip(x, data): - try: - counts[k] += v - except KeyError: - counts[k] = v - - -def f2(): - rizer = lib.DictFactorizer() - labs, counts = rizer.factorize(xarr, sort=False) - k = len(rizer.uniques) - out = np.empty(k) - lib.group_add(out, counts, data, labs) - - -def algo4(): - rizer = lib.DictFactorizer() - labs1, _ = rizer.factorize(labels, sort=False) - k1 = len(rizer.uniques) - - rizer = lib.DictFactorizer() - labs2, _ = rizer.factorize(labels2, sort=False) - k2 = len(rizer.uniques) - - group_id = labs1 * k2 + labs2 - max_group = k1 * k2 - - if max_group > 1e6: - rizer = lib.Int64Factorizer(len(group_id)) - group_id, _ = rizer.factorize(group_id.astype('i8'), sort=True) - max_group = len(rizer.uniques) - - out = np.empty(max_group) - counts = np.zeros(max_group, dtype='i4') - lib.group_add(out, counts, data, group_id) - -# cumtime percall filename:lineno(function) -# 0.592 0.592 :1() - # 0.584 0.006 groupby_ex.py:37(algo3_nosort) - # 0.535 0.005 {method 'factorize' of DictFactorizer' objects} - # 0.047 0.000 {pandas._tseries.group_add} - # 0.002 0.000 numeric.py:65(zeros_like) - # 0.001 0.000 {method 'fill' of 'numpy.ndarray' objects} - # 0.000 0.000 {numpy.core.multiarray.empty_like} - # 0.000 0.000 {numpy.core.multiarray.empty} - -# UNIQUE timings - -# N = 10000000 -# K = 500000 - -# groups = np.array([rands(10) for _ in range(K)], dtype='O') - -# labels = np.tile(groups, N // K) -data = np.random.randn(N) - -data = np.random.randn(N) - -Ks = [100, 1000, 5000, 10000, 25000, 50000, 100000] - -# Ks = [500000, 1000000, 2500000, 5000000, 10000000] - -import psutil -import os -import gc - -pid = os.getpid() -proc = psutil.Process(pid) - - -def dict_unique(values, expected_K, sort=False, memory=False): - if memory: - gc.collect() - before_mem = proc.get_memory_info().rss - - rizer = lib.DictFactorizer() - result = rizer.unique_int64(values) - - if memory: - result = proc.get_memory_info().rss - before_mem - return result - - if sort: - result.sort() - assert(len(result) == expected_K) - return result - - -def khash_unique(values, expected_K, size_hint=False, sort=False, - memory=False): - if memory: - gc.collect() - before_mem = proc.get_memory_info().rss - - if size_hint: - rizer = lib.Factorizer(len(values)) - else: - rizer = lib.Factorizer(100) - - result = [] - result = rizer.unique(values) - - if memory: - result = proc.get_memory_info().rss - before_mem - return result - - if sort: - result.sort() - assert(len(result) == expected_K) - - -def 
khash_unique_str(values, expected_K, size_hint=False, sort=False, - memory=False): - if memory: - gc.collect() - before_mem = proc.get_memory_info().rss - - if size_hint: - rizer = lib.StringHashTable(len(values)) - else: - rizer = lib.StringHashTable(100) - - result = [] - result = rizer.unique(values) - - if memory: - result = proc.get_memory_info().rss - before_mem - return result - - if sort: - result.sort() - assert(len(result) == expected_K) - - -def khash_unique_int64(values, expected_K, size_hint=False, sort=False): - if size_hint: - rizer = lib.Int64HashTable(len(values)) - else: - rizer = lib.Int64HashTable(100) - - result = [] - result = rizer.unique(values) - - if sort: - result.sort() - assert(len(result) == expected_K) - - -def hash_bench(): - numpy = [] - dict_based = [] - dict_based_sort = [] - khash_hint = [] - khash_nohint = [] - for K in Ks: - print(K) - # groups = np.array([rands(10) for _ in range(K)]) - # labels = np.tile(groups, N // K).astype('O') - - groups = np.random.randint(0, long(100000000000), size=K) - labels = np.tile(groups, N // K) - dict_based.append(timeit(lambda: dict_unique(labels, K), 20)) - khash_nohint.append(timeit(lambda: khash_unique_int64(labels, K), 20)) - khash_hint.append(timeit(lambda: khash_unique_int64(labels, K, - size_hint=True), 20)) - - # memory, hard to get - # dict_based.append(np.mean([dict_unique(labels, K, memory=True) - # for _ in range(10)])) - # khash_nohint.append(np.mean([khash_unique(labels, K, memory=True) - # for _ in range(10)])) - # khash_hint.append(np.mean([khash_unique(labels, K, size_hint=True, memory=True) - # for _ in range(10)])) - - # dict_based_sort.append(timeit(lambda: dict_unique(labels, K, - # sort=True), 10)) - # numpy.append(timeit(lambda: np.unique(labels), 10)) - - # unique_timings = DataFrame({'numpy.unique' : numpy, - # 'dict, no sort' : dict_based, - # 'dict, sort' : dict_based_sort}, - # columns=['dict, no sort', - # 'dict, sort', 'numpy.unique'], - # index=Ks) - - unique_timings = DataFrame({'dict': dict_based, - 'khash, preallocate': khash_hint, - 'khash': khash_nohint}, - columns=['khash, preallocate', 'khash', 'dict'], - index=Ks) - - unique_timings.plot(kind='bar', legend=False) - plt.legend(loc='best') - plt.title('Unique on 100,000 values, int64') - plt.xlabel('Number of unique labels') - plt.ylabel('Mean execution time') - - plt.show() diff --git a/bench/bench_with_subset.R b/bench/bench_with_subset.R deleted file mode 100644 index 69d0f7a9eec63..0000000000000 --- a/bench/bench_with_subset.R +++ /dev/null @@ -1,53 +0,0 @@ -library(microbenchmark) -library(data.table) - - -data.frame.subset.bench <- function (n=1e7, times=30) { - df <- data.frame(a=rnorm(n), b=rnorm(n), c=rnorm(n)) - print(microbenchmark(subset(df, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c), - times=times)) -} - - -# data.table allows something very similar to query with an expression -# but we have chained comparisons AND we're faster BOO YAH! 
-data.table.subset.expression.bench <- function (n=1e7, times=30) { - dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n)) - print(microbenchmark(dt[, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c], - times=times)) -} - - -# compare against subset with data.table for good measure -data.table.subset.bench <- function (n=1e7, times=30) { - dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n)) - print(microbenchmark(subset(dt, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c), - times=times)) -} - - -data.frame.with.bench <- function (n=1e7, times=30) { - df <- data.frame(a=rnorm(n), b=rnorm(n), c=rnorm(n)) - - print(microbenchmark(with(df, a + b * (c ^ 2 + b ^ 2 - a) / (a * c) ^ 3), - times=times)) -} - - -data.table.with.bench <- function (n=1e7, times=30) { - dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n)) - print(microbenchmark(with(dt, a + b * (c ^ 2 + b ^ 2 - a) / (a * c) ^ 3), - times=times)) -} - - -bench <- function () { - data.frame.subset.bench() - data.table.subset.expression.bench() - data.table.subset.bench() - data.frame.with.bench() - data.table.with.bench() -} - - -bench() diff --git a/bench/bench_with_subset.py b/bench/bench_with_subset.py deleted file mode 100644 index 017401df3f7f3..0000000000000 --- a/bench/bench_with_subset.py +++ /dev/null @@ -1,116 +0,0 @@ -#!/usr/bin/env python - -""" -Microbenchmarks for comparison with R's "with" and "subset" functions -""" - -from __future__ import print_function -import numpy as np -from numpy import array -from timeit import repeat as timeit -from pandas.compat import range, zip -from pandas import DataFrame - - -setup_common = """from pandas import DataFrame -from numpy.random import randn -df = DataFrame(randn(%d, 3), columns=list('abc')) -%s""" - - -setup_with = "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'" - - -def bench_with(n, times=10, repeat=3, engine='numexpr'): - return np.array(timeit('df.eval(s, engine=%r)' % engine, - setup=setup_common % (n, setup_with), - repeat=repeat, number=times)) / times - - -setup_subset = "s = 'a <= b <= c ** 2 + b ** 2 - a and b > c'" - - -def bench_subset(n, times=10, repeat=3, engine='numexpr'): - return np.array(timeit('df.query(s, engine=%r)' % engine, - setup=setup_common % (n, setup_subset), - repeat=repeat, number=times)) / times - - -def bench(mn=1, mx=7, num=100, engines=('python', 'numexpr'), verbose=False): - r = np.logspace(mn, mx, num=num).round().astype(int) - - ev = DataFrame(np.empty((num, len(engines))), columns=engines) - qu = ev.copy(deep=True) - - ev['size'] = qu['size'] = r - - for engine in engines: - for i, n in enumerate(r): - if verbose: - print('engine: %r, i == %d' % (engine, i)) - ev.loc[i, engine] = bench_with(n, times=1, repeat=1, engine=engine) - qu.loc[i, engine] = bench_subset(n, times=1, repeat=1, - engine=engine) - - return ev, qu - - -def plot_perf(df, engines, title, filename=None): - from matplotlib.pyplot import figure, rc - - try: - from mpltools import style - except ImportError: - pass - else: - style.use('ggplot') - - rc('text', usetex=True) - - fig = figure(figsize=(4, 3), dpi=100) - ax = fig.add_subplot(111) - - for engine in engines: - ax.plot(df.size, df[engine], label=engine, lw=2) - - ax.set_xlabel('Number of Rows') - ax.set_ylabel('Time (s)') - ax.set_title(title) - ax.legend(loc='best') - ax.tick_params(top=False, right=False) - - fig.tight_layout() - - if filename is not None: - fig.savefig(filename) - - -if __name__ == '__main__': - import os - import pandas as pd - - pandas_dir = 
os.path.dirname(os.path.abspath(os.path.dirname(__file__))) - static_path = os.path.join(pandas_dir, 'doc', 'source', '_static') - - join = lambda p: os.path.join(static_path, p) - - fn = join('eval-query-perf-data.h5') - - engines = 'python', 'numexpr' - - if not os.path.exists(fn): - ev, qu = bench(verbose=True) - ev.to_hdf(fn, 'eval') - qu.to_hdf(fn, 'query') - else: - ev = pd.read_hdf(fn, 'eval') - qu = pd.read_hdf(fn, 'query') - - plot_perf(ev, engines, 'DataFrame.eval()', filename=join('eval-perf.png')) - plot_perf(qu, engines, 'DataFrame.query()', - filename=join('query-perf.png')) - - plot_perf(ev[ev.size <= 50000], engines, 'DataFrame.eval()', - filename=join('eval-perf-small.png')) - plot_perf(qu[qu.size <= 500000], engines, 'DataFrame.query()', - filename=join('query-perf-small.png')) diff --git a/bench/better_unique.py b/bench/better_unique.py deleted file mode 100644 index e03a4f433ce66..0000000000000 --- a/bench/better_unique.py +++ /dev/null @@ -1,80 +0,0 @@ -from __future__ import print_function -from pandas import DataFrame -from pandas.compat import range, zip -import timeit - -setup = """ -from pandas import Series -import pandas._tseries as _tseries -from pandas.compat import range -import random -import numpy as np - -def better_unique(values): - uniques = _tseries.fast_unique(values) - id_map = _tseries.map_indices_buf(uniques) - labels = _tseries.get_unique_labels(values, id_map) - return uniques, labels - -tot = 100000 - -def get_test_data(ngroups=100, n=tot): - unique_groups = range(ngroups) - random.shuffle(unique_groups) - arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object) - - if len(arr) < n: - arr = np.asarray(list(arr) + unique_groups[:n - len(arr)], - dtype=object) - - return arr - -arr = get_test_data(ngroups=%d) -""" - -group_sizes = [10, 100, 1000, 10000, - 20000, 30000, 40000, - 50000, 60000, 70000, - 80000, 90000, 100000] - -numbers = [100, 100, 50] + [10] * 10 - -numpy = [] -wes = [] - -for sz, n in zip(group_sizes, numbers): - # wes_timer = timeit.Timer(stmt='better_unique(arr)', - # setup=setup % sz) - wes_timer = timeit.Timer(stmt='_tseries.fast_unique(arr)', - setup=setup % sz) - - numpy_timer = timeit.Timer(stmt='np.unique(arr)', - setup=setup % sz) - - print(n) - numpy_result = numpy_timer.timeit(number=n) / n - wes_result = wes_timer.timeit(number=n) / n - - print('Groups: %d, NumPy: %s, Wes: %s' % (sz, numpy_result, wes_result)) - - wes.append(wes_result) - numpy.append(numpy_result) - -result = DataFrame({'wes': wes, 'numpy': numpy}, index=group_sizes) - - -def make_plot(numpy, wes): - pass - -# def get_test_data(ngroups=100, n=100000): -# unique_groups = range(ngroups) -# random.shuffle(unique_groups) -# arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object) - -# if len(arr) < n: -# arr = np.asarray(list(arr) + unique_groups[:n - len(arr)], -# dtype=object) - -# return arr - -# arr = get_test_data(ngroups=1000) diff --git a/bench/duplicated.R b/bench/duplicated.R deleted file mode 100644 index eb2376df2932a..0000000000000 --- a/bench/duplicated.R +++ /dev/null @@ -1,22 +0,0 @@ -N <- 100000 - -k1 = rep(NA, N) -k2 = rep(NA, N) -for (i in 1:N){ - k1[i] <- paste(sample(letters, 1), collapse="") - k2[i] <- paste(sample(letters, 1), collapse="") -} -df <- data.frame(a=k1, b=k2, c=rep(1:100, N / 100)) -df2 <- data.frame(a=k1, b=k2) - -timings <- numeric() -timings2 <- numeric() -for (i in 1:50) { - gc() - timings[i] = system.time(deduped <- df[!duplicated(df),])[3] - gc() - timings2[i] = system.time(deduped <- 
df[!duplicated(df[,c("a", "b")]),])[3] -} - -mean(timings) -mean(timings2) diff --git a/bench/io_roundtrip.py b/bench/io_roundtrip.py deleted file mode 100644 index d87da0ec6321a..0000000000000 --- a/bench/io_roundtrip.py +++ /dev/null @@ -1,116 +0,0 @@ -from __future__ import print_function -import time -import os -import numpy as np - -import la -import pandas -from pandas.compat import range -from pandas import datetools, DatetimeIndex - - -def timeit(f, iterations): - start = time.clock() - - for i in range(iterations): - f() - - return time.clock() - start - - -def rountrip_archive(N, K=50, iterations=10): - # Create data - arr = np.random.randn(N, K) - # lar = la.larry(arr) - dma = pandas.DataFrame(arr, - DatetimeIndex('1/1/2000', periods=N, - offset=datetools.Minute())) - dma[201] = 'bar' - - # filenames - filename_numpy = '/Users/wesm/tmp/numpy.npz' - filename_larry = '/Users/wesm/tmp/archive.hdf5' - filename_pandas = '/Users/wesm/tmp/pandas_tmp' - - # Delete old files - try: - os.unlink(filename_numpy) - except: - pass - try: - os.unlink(filename_larry) - except: - pass - - try: - os.unlink(filename_pandas) - except: - pass - - # Time a round trip save and load - # numpy_f = lambda: numpy_roundtrip(filename_numpy, arr, arr) - # numpy_time = timeit(numpy_f, iterations) / iterations - - # larry_f = lambda: larry_roundtrip(filename_larry, lar, lar) - # larry_time = timeit(larry_f, iterations) / iterations - - pandas_f = lambda: pandas_roundtrip(filename_pandas, dma, dma) - pandas_time = timeit(pandas_f, iterations) / iterations - print('pandas (HDF5) %7.4f seconds' % pandas_time) - - pickle_f = lambda: pandas_roundtrip(filename_pandas, dma, dma) - pickle_time = timeit(pickle_f, iterations) / iterations - print('pandas (pickle) %7.4f seconds' % pickle_time) - - # print('Numpy (npz) %7.4f seconds' % numpy_time) - # print('larry (HDF5) %7.4f seconds' % larry_time) - - # Delete old files - try: - os.unlink(filename_numpy) - except: - pass - try: - os.unlink(filename_larry) - except: - pass - - try: - os.unlink(filename_pandas) - except: - pass - - -def numpy_roundtrip(filename, arr1, arr2): - np.savez(filename, arr1=arr1, arr2=arr2) - npz = np.load(filename) - arr1 = npz['arr1'] - arr2 = npz['arr2'] - - -def larry_roundtrip(filename, lar1, lar2): - io = la.IO(filename) - io['lar1'] = lar1 - io['lar2'] = lar2 - lar1 = io['lar1'] - lar2 = io['lar2'] - - -def pandas_roundtrip(filename, dma1, dma2): - # What's the best way to code this? 
- from pandas.io.pytables import HDFStore - store = HDFStore(filename) - store['dma1'] = dma1 - store['dma2'] = dma2 - dma1 = store['dma1'] - dma2 = store['dma2'] - - -def pandas_roundtrip_pickle(filename, dma1, dma2): - dma1.save(filename) - dma1 = pandas.DataFrame.load(filename) - dma2.save(filename) - dma2 = pandas.DataFrame.load(filename) - -if __name__ == '__main__': - rountrip_archive(10000, K=200) diff --git a/bench/larry.py b/bench/larry.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/bench/serialize.py b/bench/serialize.py deleted file mode 100644 index b0edd6a5752d2..0000000000000 --- a/bench/serialize.py +++ /dev/null @@ -1,89 +0,0 @@ -from __future__ import print_function -from pandas.compat import range, lrange -import time -import os -import numpy as np - -import la -import pandas - - -def timeit(f, iterations): - start = time.clock() - - for i in range(iterations): - f() - - return time.clock() - start - - -def roundtrip_archive(N, iterations=10): - - # Create data - arr = np.random.randn(N, N) - lar = la.larry(arr) - dma = pandas.DataFrame(arr, lrange(N), lrange(N)) - - # filenames - filename_numpy = '/Users/wesm/tmp/numpy.npz' - filename_larry = '/Users/wesm/tmp/archive.hdf5' - filename_pandas = '/Users/wesm/tmp/pandas_tmp' - - # Delete old files - try: - os.unlink(filename_numpy) - except: - pass - try: - os.unlink(filename_larry) - except: - pass - try: - os.unlink(filename_pandas) - except: - pass - - # Time a round trip save and load - numpy_f = lambda: numpy_roundtrip(filename_numpy, arr, arr) - numpy_time = timeit(numpy_f, iterations) / iterations - - larry_f = lambda: larry_roundtrip(filename_larry, lar, lar) - larry_time = timeit(larry_f, iterations) / iterations - - pandas_f = lambda: pandas_roundtrip(filename_pandas, dma, dma) - pandas_time = timeit(pandas_f, iterations) / iterations - - print('Numpy (npz) %7.4f seconds' % numpy_time) - print('larry (HDF5) %7.4f seconds' % larry_time) - print('pandas (HDF5) %7.4f seconds' % pandas_time) - - -def numpy_roundtrip(filename, arr1, arr2): - np.savez(filename, arr1=arr1, arr2=arr2) - npz = np.load(filename) - arr1 = npz['arr1'] - arr2 = npz['arr2'] - - -def larry_roundtrip(filename, lar1, lar2): - io = la.IO(filename) - io['lar1'] = lar1 - io['lar2'] = lar2 - lar1 = io['lar1'] - lar2 = io['lar2'] - - -def pandas_roundtrip(filename, dma1, dma2): - from pandas.io.pytables import HDFStore - store = HDFStore(filename) - store['dma1'] = dma1 - store['dma2'] = dma2 - dma1 = store['dma1'] - dma2 = store['dma2'] - - -def pandas_roundtrip_pickle(filename, dma1, dma2): - dma1.save(filename) - dma1 = pandas.DataFrame.load(filename) - dma2.save(filename) - dma2 = pandas.DataFrame.load(filename) diff --git a/bench/test.py b/bench/test.py deleted file mode 100644 index 2339deab313a1..0000000000000 --- a/bench/test.py +++ /dev/null @@ -1,70 +0,0 @@ -import numpy as np -import itertools -import collections -import scipy.ndimage as ndi -from pandas.compat import zip, range - -N = 10000 - -lat = np.random.randint(0, 360, N) -lon = np.random.randint(0, 360, N) -data = np.random.randn(N) - - -def groupby1(lat, lon, data): - indexer = np.lexsort((lon, lat)) - lat = lat.take(indexer) - lon = lon.take(indexer) - sorted_data = data.take(indexer) - - keys = 1000. 
* lat + lon - unique_keys = np.unique(keys) - bounds = keys.searchsorted(unique_keys) - - result = group_agg(sorted_data, bounds, lambda x: x.mean()) - - decoder = keys.searchsorted(unique_keys) - - return dict(zip(zip(lat.take(decoder), lon.take(decoder)), result)) - - -def group_mean(lat, lon, data): - indexer = np.lexsort((lon, lat)) - lat = lat.take(indexer) - lon = lon.take(indexer) - sorted_data = data.take(indexer) - - keys = 1000 * lat + lon - unique_keys = np.unique(keys) - - result = ndi.mean(sorted_data, labels=keys, index=unique_keys) - decoder = keys.searchsorted(unique_keys) - - return dict(zip(zip(lat.take(decoder), lon.take(decoder)), result)) - - -def group_mean_naive(lat, lon, data): - grouped = collections.defaultdict(list) - for lt, ln, da in zip(lat, lon, data): - grouped[(lt, ln)].append(da) - - averaged = dict((ltln, np.mean(da)) for ltln, da in grouped.items()) - - return averaged - - -def group_agg(values, bounds, f): - N = len(values) - result = np.empty(len(bounds), dtype=float) - for i, left_bound in enumerate(bounds): - if i == len(bounds) - 1: - right_bound = N - else: - right_bound = bounds[i + 1] - - result[i] = f(values[left_bound: right_bound]) - - return result - -# for i in range(10): -# groupby1(lat, lon, data) diff --git a/bench/zoo_bench.R b/bench/zoo_bench.R deleted file mode 100644 index 294d55f51a9ab..0000000000000 --- a/bench/zoo_bench.R +++ /dev/null @@ -1,71 +0,0 @@ -library(zoo) -library(xts) -library(fts) -library(tseries) -library(its) -library(xtable) - -## indices = rep(NA, 100000) -## for (i in 1:100000) -## indices[i] <- paste(sample(letters, 10), collapse="") - - - -## x <- zoo(rnorm(100000), indices) -## y <- zoo(rnorm(90000), indices[sample(1:100000, 90000)]) - -## indices <- as.POSIXct(1:100000) - -indices <- as.POSIXct(Sys.Date()) + seq(1, 100000000, 100) - -sz <- 500000 - -## x <- xts(rnorm(sz), sample(indices, sz)) -## y <- xts(rnorm(sz), sample(indices, sz)) - -zoo.bench <- function(){ - x <- zoo(rnorm(sz), sample(indices, sz)) - y <- zoo(rnorm(sz), sample(indices, sz)) - timeit(function() {x + y}) -} - -xts.bench <- function(){ - x <- xts(rnorm(sz), sample(indices, sz)) - y <- xts(rnorm(sz), sample(indices, sz)) - timeit(function() {x + y}) -} - -fts.bench <- function(){ - x <- fts(rnorm(sz), sort(sample(indices, sz))) - y <- fts(rnorm(sz), sort(sample(indices, sz)) - timeit(function() {x + y}) -} - -its.bench <- function(){ - x <- its(rnorm(sz), sort(sample(indices, sz))) - y <- its(rnorm(sz), sort(sample(indices, sz))) - timeit(function() {x + y}) -} - -irts.bench <- function(){ - x <- irts(sort(sample(indices, sz)), rnorm(sz)) - y <- irts(sort(sample(indices, sz)), rnorm(sz)) - timeit(function() {x + y}) -} - -timeit <- function(f){ - timings <- numeric() - for (i in 1:10) { - gc() - timings[i] = system.time(f())[3] - } - mean(timings) -} - -bench <- function(){ - results <- c(xts.bench(), fts.bench(), its.bench(), zoo.bench()) - names <- c("xts", "fts", "its", "zoo") - data.frame(results, names) -} - -result <- bench() diff --git a/bench/zoo_bench.py b/bench/zoo_bench.py deleted file mode 100644 index 74cb1952a5a2a..0000000000000 --- a/bench/zoo_bench.py +++ /dev/null @@ -1,36 +0,0 @@ -from pandas import * -from pandas.util.testing import rands - -n = 1000000 -# indices = Index([rands(10) for _ in xrange(n)]) - - -def sample(values, k): - sampler = np.random.permutation(len(values)) - return values.take(sampler[:k]) -sz = 500000 -rng = np.arange(0, 10000000000000, 10000000) -stamps = np.datetime64(datetime.now()).view('i8') + 
rng -idx1 = np.sort(sample(stamps, sz)) -idx2 = np.sort(sample(stamps, sz)) -ts1 = Series(np.random.randn(sz), idx1) -ts2 = Series(np.random.randn(sz), idx2) - - -# subsample_size = 90000 - -# x = Series(np.random.randn(100000), indices) -# y = Series(np.random.randn(subsample_size), -# index=sample(indices, subsample_size)) - - -# lx = larry(np.random.randn(100000), [list(indices)]) -# ly = larry(np.random.randn(subsample_size), [list(y.index)]) - -# Benchmark 1: Two 1-million length time series (int64-based index) with -# randomly chosen timestamps - -# Benchmark 2: Join two 5-variate time series DataFrames (outer and inner join) - -# df1 = DataFrame(np.random.randn(1000000, 5), idx1, columns=range(5)) -# df2 = DataFrame(np.random.randn(1000000, 5), idx2, columns=range(5, 10)) diff --git a/doc/source/conf.py b/doc/source/conf.py index 394fa44c30573..cb3063d59beae 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -17,6 +17,11 @@ import importlib from pandas.compat import u, PY3 +try: + raw_input # Python 2 +except NameError: + raw_input = input # Python 3 + # https://github.com/sphinx-doc/sphinx/pull/2325/files # Workaround for sphinx-build recursion limit overflow: # pickle.dump(doctree, f, pickle.HIGHEST_PROTOCOL) diff --git a/doc/sphinxext/ipython_sphinxext/ipython_directive.py b/doc/sphinxext/ipython_sphinxext/ipython_directive.py index 49fbacba99592..922767a8e2d46 100644 --- a/doc/sphinxext/ipython_sphinxext/ipython_directive.py +++ b/doc/sphinxext/ipython_sphinxext/ipython_directive.py @@ -111,7 +111,7 @@ import sys import tempfile import ast -from pandas.compat import zip, range, map, lmap, u, cStringIO as StringIO +from pandas.compat import zip, range, map, lmap, u, text_type, cStringIO as StringIO import warnings # To keep compatibility with various python versions @@ -138,10 +138,8 @@ if PY3: from io import StringIO - text_type = str else: from StringIO import StringIO - text_type = unicode #----------------------------------------------------------------------------- # Globals diff --git a/scripts/find_commits_touching_func.py b/scripts/find_commits_touching_func.py index 099761f38bb44..74ea120bf0b64 100755 --- a/scripts/find_commits_touching_func.py +++ b/scripts/find_commits_touching_func.py @@ -4,7 +4,7 @@ # copryright 2013, y-p @ github from __future__ import print_function -from pandas.compat import range, lrange, map +from pandas.compat import range, lrange, map, string_types, text_type """Search the git history for all commits touching a named method @@ -94,7 +94,7 @@ def get_hits(defname,files=()): def get_commit_info(c,fmt,sep='\t'): r=sh.git('log', "--format={}".format(fmt), '{}^..{}'.format(c,c),"-n","1",_tty_out=False) - return compat.text_type(r).split(sep) + return text_type(r).split(sep) def get_commit_vitals(c,hlen=HASH_LEN): h,s,d= get_commit_info(c,'%H\t%s\t%ci',"\t") @@ -183,11 +183,11 @@ def main(): !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
""") return - if isinstance(args.file_masks,compat.string_types): + if isinstance(args.file_masks, string_types): args.file_masks = args.file_masks.split(',') - if isinstance(args.path_masks,compat.string_types): + if isinstance(args.path_masks, string_types): args.path_masks = args.path_masks.split(',') - if isinstance(args.dir_masks,compat.string_types): + if isinstance(args.dir_masks, string_types): args.dir_masks = args.dir_masks.split(',') logger.setLevel(getattr(logging,args.debug_level)) diff --git a/scripts/windows_builder/build_27-32.bat b/scripts/windows_builder/build_27-32.bat deleted file mode 100644 index 37eb4d436d567..0000000000000 --- a/scripts/windows_builder/build_27-32.bat +++ /dev/null @@ -1,25 +0,0 @@ -@echo off -echo "starting 27-32" - -setlocal EnableDelayedExpansion -set MSSdk=1 -CALL "C:\Program Files\Microsoft SDKs\Windows\v7.0\Bin\SetEnv.cmd" /x86 /release -set DISTUTILS_USE_SDK=1 - -title 27-32 build -echo "building" -cd "c:\users\Jeff Reback\documents\github\pandas" -C:\python27-32\python.exe setup.py build > build.27-32.log 2>&1 - -title "installing" -C:\python27-32\python.exe setup.py bdist --formats=wininst > install.27-32.log 2>&1 - -echo "testing" -C:\python27-32\scripts\nosetests -A "not slow" build\lib.win32-2.7\pandas > test.27-32.log 2>&1 - -echo "versions" -cd build\lib.win32-2.7 -C:\python27-32\python.exe ../../ci/print_versions.py > ../../versions.27-32.log 2>&1 - -exit - diff --git a/scripts/windows_builder/build_27-64.bat b/scripts/windows_builder/build_27-64.bat deleted file mode 100644 index e76e25d0ef39c..0000000000000 --- a/scripts/windows_builder/build_27-64.bat +++ /dev/null @@ -1,25 +0,0 @@ -@echo off -echo "starting 27-64" - -setlocal EnableDelayedExpansion -set MSSdk=1 -CALL "C:\Program Files\Microsoft SDKs\Windows\v7.0\Bin\SetEnv.cmd" /x64 /release -set DISTUTILS_USE_SDK=1 - -title 27-64 build -echo "building" -cd "c:\users\Jeff Reback\documents\github\pandas" -C:\python27-64\python.exe setup.py build > build.27-64.log 2>&1 - -echo "installing" -C:\python27-64\python.exe setup.py bdist --formats=wininst > install.27-64.log 2>&1 - -echo "testing" -C:\python27-64\scripts\nosetests -A "not slow" build\lib.win-amd64-2.7\pandas > test.27-64.log 2>&1 - -echo "versions" -cd build\lib.win-amd64-2.7 -C:\python27-64\python.exe ../../ci/print_versions.py > ../../versions.27-64.log 2>&1 - -exit - diff --git a/scripts/windows_builder/build_34-32.bat b/scripts/windows_builder/build_34-32.bat deleted file mode 100644 index 8e060e000bc8f..0000000000000 --- a/scripts/windows_builder/build_34-32.bat +++ /dev/null @@ -1,27 +0,0 @@ -@echo off -echo "starting 34-32" - -setlocal EnableDelayedExpansion -set MSSdk=1 -CALL "C:\Program Files\Microsoft SDKs\Windows\v7.1\Bin\SetEnv.cmd" /x86 /release -set DISTUTILS_USE_SDK=1 - -title 34-32 build -echo "building" -cd "c:\users\Jeff Reback\documents\github\pandas" -C:\python34-32\python.exe setup.py build > build.34-32.log 2>&1 - -echo "installing" -C:\python34-32\python.exe setup.py bdist --formats=wininst > install.34-32.log 2>&1 - -echo "testing" -C:\python34-32\scripts\nosetests -A "not slow" build\lib.win32-3.4\pandas > test.34-32.log 2>&1 - -echo "versions" -cd build\lib.win32-3.4 -C:\python34-32\python.exe ../../ci/print_versions.py > ../../versions.34-32.log 2>&1 - -exit - - - diff --git a/scripts/windows_builder/build_34-64.bat b/scripts/windows_builder/build_34-64.bat deleted file mode 100644 index 3a8512b730346..0000000000000 --- a/scripts/windows_builder/build_34-64.bat +++ /dev/null @@ -1,27 +0,0 @@ 
-@echo off -echo "starting 34-64" - -setlocal EnableDelayedExpansion -set MSSdk=1 -CALL "C:\Program Files\Microsoft SDKs\Windows\v7.1\Bin\SetEnv.cmd" /x64 /release -set DISTUTILS_USE_SDK=1 - -title 34-64 build -echo "building" -cd "c:\users\Jeff Reback\documents\github\pandas" -C:\python34-64\python.exe setup.py build > build.34-64.log 2>&1 - -echo "installing" -C:\python34-64\python.exe setup.py bdist --formats=wininst > install.34-64.log 2>&1 - -echo "testing" -C:\python34-64\scripts\nosetests -A "not slow" build\lib.win-amd64-3.4\pandas > test.34-64.log 2>&1 - -echo "versions" -cd build\lib.win-amd64-3.4 -C:\python34-64\python.exe ../../ci/print_versions.py > ../../versions.34-64.log 2>&1 - -exit - - - diff --git a/scripts/windows_builder/check_and_build.bat b/scripts/windows_builder/check_and_build.bat deleted file mode 100644 index 32be1bde1f7f3..0000000000000 --- a/scripts/windows_builder/check_and_build.bat +++ /dev/null @@ -1,2 +0,0 @@ -set PYTHONPATH=c:/python27-64/lib -c:/python27-64/python.exe c:/Builds/check_and_build.py %1 %2 %3 %4 %4 %6 %7 %8 %9 diff --git a/scripts/windows_builder/check_and_build.py b/scripts/windows_builder/check_and_build.py deleted file mode 100644 index 2eb32fb4265d9..0000000000000 --- a/scripts/windows_builder/check_and_build.py +++ /dev/null @@ -1,194 +0,0 @@ -import datetime -import git -import logging -import os, re, time -import subprocess -import argparse -import pysftp - -# parse the args -parser = argparse.ArgumentParser(description='build, test, and install updated versions of master pandas') -parser.add_argument('-b', '--build', - help='run just this build', - dest='build') -parser.add_argument('-u', '--update', - help='get a git update', - dest='update', - action='store_true', - default=False) -parser.add_argument('-t', '--test', - help='run the tests', - dest='test', - action='store_true', - default=False) -parser.add_argument('-c', '--compare', - help='show the last tests compare', - dest='compare', - action='store_true', - default=False) -parser.add_argument('-v', '--version', - help='show the last versions', - dest='version', - action='store_true', - default=False) -parser.add_argument('-i', '--install', - help='run the install', - dest='install', - action='store_true', - default=False) -parser.add_argument('--dry', - help='dry run', - dest='dry', - action='store_true', - default=False) - -args = parser.parse_args() -dry_run = args.dry - -builds = ['27-32','27-64','34-32','34-64'] -base_dir = "C:\Users\Jeff Reback\Documents\GitHub\pandas" -remote_host='pandas.pydata.org' -username='pandas' -password=############ - -# drop python from our environment to avoid -# passing this onto sub-processes -env = os.environ -del env['PYTHONPATH'] - -# the stdout logger -fmt = '%(asctime)s: %(message)s' -logger = logging.getLogger('check_and_build') -logger.setLevel(logging.DEBUG) -stream_handler = logging.StreamHandler() -stream_handler.setFormatter(logging.Formatter(fmt)) -logger.addHandler(stream_handler) - -def run_all(test=False,compare=False,install=False,version=False,build=None): - # run everything - - for b in builds: - if build is not None and build != b: - continue - if test: - do_rebuild(b) - if compare or test: - try: - do_compare(b) - except (Exception) as e: - logger.info("ERROR COMPARE {0} : {1}".format(b,e)) - if version: - try: - do_version(b) - except (Exception) as e: - logger.info("ERROR VERSION {0} : {1}".format(b,e)) - - if install: - run_install() - -def do_rebuild(build): - # trigger the rebuild - - cmd = 
"c:/Builds/build_{0}.bat".format(build) - logger.info("rebuild : {0}".format(cmd)) - p = subprocess.Popen("start /wait /min {0}".format(cmd),env=env,shell=True,close_fds=True) - ret = p.wait() - -def do_compare(build): - # print the test outputs - - f = os.path.join(base_dir,"test.{0}.log".format(build)) - with open(f,'r') as fh: - for l in fh: - l = l.rstrip() - if l.startswith('ERROR:'): - logger.info("{0} : {1}".format(build,l)) - if l.startswith('Ran') or l.startswith('OK') or l.startswith('FAIL'): - logger.info("{0} : {1}".format(build,l)) - -def do_version(build): - # print the version strings - - f = os.path.join(base_dir,"versions.{0}.log".format(build)) - with open(f,'r') as fh: - for l in fh: - l = l.rstrip() - logger.info("{0} : {1}".format(build,l)) - -def do_update(is_verbose=True): - # update git; return True if the commit has changed - - repo = git.Repo(base_dir) - master = repo.heads.master - origin = repo.remotes.origin - start_commit = master.commit - - if is_verbose: - logger.info("current commit : {0}".format(start_commit)) - - try: - origin.update() - except (Exception) as e: - logger.info("update exception : {0}".format(e)) - try: - origin.pull() - except (Exception) as e: - logger.info("pull exception : {0}".format(e)) - - result = start_commit != master.commit - if result: - if is_verbose: - logger.info("commits changed : {0} -> {1}".format(start_commit,master.commit)) - return result - -def run_install(): - # send the installation binaries - - repo = git.Repo(base_dir) - master = repo.heads.master - commit = master.commit - short_hash = str(commit)[:7] - - logger.info("sending files : {0}".format(commit)) - d = os.path.join(base_dir,"dist") - files = [ f for f in os.listdir(d) if re.search(short_hash,f) ] - srv = pysftp.Connection(host=remote_host,username=username,password=password) - srv.chdir("www/pandas-build/dev") - - # get current files - remote_files = set(srv.listdir(path='.')) - - for f in files: - if f not in remote_files: - logger.info("sending: {0}".format(f)) - local = os.path.join(d,f) - srv.put(localpath=local) - - srv.close() - logger.info("sending files: done") - -# just perform the action -if args.update or args.test or args.compare or args.install or args.version: - if args.update: - do_update() - run_all(test=args.test,compare=args.compare,install=args.install,version=args.version,build=args.build) - exit(0) - -# file logging -file_handler = logging.FileHandler("C:\Builds\logs\check_and_build.log") -file_handler.setFormatter(logging.Formatter(fmt)) -logger.addHandler(file_handler) - -logger.info("start") - -# main loop -while(True): - - if do_update(): - run_all(test=True,install=False) - - time.sleep(60*60) - -logger.info("exit") -file_handler.close() - diff --git a/scripts/windows_builder/readme.txt b/scripts/windows_builder/readme.txt deleted file mode 100644 index 789e2a9ee0c63..0000000000000 --- a/scripts/windows_builder/readme.txt +++ /dev/null @@ -1,17 +0,0 @@ -This is a collection of windows batch scripts (and a python script) -to rebuild the binaries, test, and upload the binaries for public distribution -upon a commit on github. 
- -Obviously requires that these be setup on windows -Requires an install of Windows SDK 3.5 and 4.0 -Full python installs for each version with the deps - -Currently supporting - -27-32,27-64,34-32,34-64 - -Note that 34 use the 4.0 SDK, while the other suse 3.5 SDK - -I installed these scripts in C:\Builds - -Installed libaries in C:\Installs From 0bd871fb9634e8b73efcc1aeabb93961fbc43d53 Mon Sep 17 00:00:00 2001 From: kernc Date: Mon, 17 Jul 2017 17:11:37 +0200 Subject: [PATCH 47/54] PERF: SparseDataFrame._init_dict uses intermediary dict, not DataFrame (#16883) Closes gh-16773. --- asv_bench/benchmarks/sparse.py | 8 ++++++++ doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/sparse/frame.py | 9 +++------ pandas/tests/reshape/test_reshape.py | 4 ++++ pandas/tests/sparse/test_frame.py | 2 ++ 5 files changed, 18 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index 500149b89b08b..7259e8cdb7d61 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -1,3 +1,5 @@ +from itertools import repeat + from .pandas_vb_common import * import scipy.sparse from pandas import SparseSeries, SparseDataFrame @@ -27,6 +29,12 @@ class sparse_frame_constructor(object): def time_sparse_frame_constructor(self): SparseDataFrame(columns=np.arange(100), index=np.arange(1000)) + def time_sparse_from_scipy(self): + SparseDataFrame(scipy.sparse.rand(1000, 1000, 0.005)) + + def time_sparse_from_dict(self): + SparseDataFrame(dict(zip(range(1000), repeat([0])))) + class sparse_series_from_coo(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 7c52cf6f450b2..935e9d740b91c 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -136,6 +136,7 @@ Removal of prior version deprecations/changes Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Improved performance of instantiating :class:`SparseDataFrame` (:issue:`16773`) .. _whatsnew_0210.bug_fixes: diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 461dd50c5da6e..e157ae16e71f9 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -143,7 +143,7 @@ def _init_dict(self, data, index, columns, dtype=None): sp_maker = lambda x: SparseArray(x, kind=self._default_kind, fill_value=self._default_fill_value, copy=True, dtype=dtype) - sdict = DataFrame() + sdict = {} for k, v in compat.iteritems(data): if isinstance(v, Series): # Force alignment, no copy necessary @@ -163,11 +163,8 @@ def _init_dict(self, data, index, columns, dtype=None): # TODO: figure out how to handle this case, all nan's? 
# add in any other columns we want to have (completeness) - nan_vec = np.empty(len(index)) - nan_vec.fill(nan) - for c in columns: - if c not in sdict: - sdict[c] = sp_maker(nan_vec) + nan_arr = sp_maker(np.full(len(index), np.nan)) + sdict.update((c, nan_arr) for c in columns if c not in sdict) return to_manager(sdict, columns, index) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index d47a95924bd10..632d3b4ad2e7a 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -643,6 +643,10 @@ def test_dataframe_dummies_preserve_categorical_dtype(self): class TestGetDummiesSparse(TestGetDummies): sparse = True + @pytest.mark.xfail(reason='nan in index is problematic (GH 16894)') + def test_include_na(self): + super(TestGetDummiesSparse, self).test_include_na() + class TestMakeAxisDummies(object): diff --git a/pandas/tests/sparse/test_frame.py b/pandas/tests/sparse/test_frame.py index 654d12b782f37..a5d514644a8f1 100644 --- a/pandas/tests/sparse/test_frame.py +++ b/pandas/tests/sparse/test_frame.py @@ -1095,6 +1095,8 @@ def test_as_blocks(self): assert list(df_blocks.keys()) == ['float64'] tm.assert_frame_equal(df_blocks['float64'], df) + @pytest.mark.xfail(reason='nan column names in _init_dict problematic ' + '(GH 16894)') def test_nan_columnname(self): # GH 8822 nan_colname = DataFrame(Series(1.0, index=[0]), columns=[nan]) From dc54b6bbfd1da0947f3b66d4919e4b80e3207bce Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 17 Jul 2017 16:18:55 -0700 Subject: [PATCH 48/54] MAINT: Drop line_width and height from options (#16993) Deprecated since 0.11 and 0.12 respectively. --- doc/source/options.rst | 2 -- doc/source/whatsnew/v0.21.0.txt | 2 ++ pandas/core/config_init.py | 16 ---------------- pandas/io/formats/console.py | 4 ++-- pandas/tests/io/formats/test_format.py | 2 +- 5 files changed, 5 insertions(+), 21 deletions(-) diff --git a/doc/source/options.rst b/doc/source/options.rst index f373705a96f48..c585da64efece 100644 --- a/doc/source/options.rst +++ b/doc/source/options.rst @@ -304,7 +304,6 @@ display.float_format None The callable should accept a fl This is used in some places like SeriesFormatter. See core.format.EngFormatter for an example. -display.height 60 Deprecated. Use `display.max_rows` instead. display.large_repr truncate For DataFrames exceeding max_rows/max_cols, the repr (and HTML repr) can show a truncated table (the default from 0.13), @@ -323,7 +322,6 @@ display.latex.multicolumn_format 'l' Alignment of multicolumn labels display.latex.multirow False Combines rows when using a MultiIndex. Centered instead of top-aligned, separated by clines. -display.line_width 80 Deprecated. Use `display.width` instead. 
display.max_columns 20 max_rows and max_columns are used in __repr__() methods to decide if to_string() or info() is used to diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 935e9d740b91c..c63d4575bac43 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -124,6 +124,8 @@ Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - :func:`read_excel()` has dropped the ``has_index_names`` parameter (:issue:`10967`) +- The ``pd.options.display.height`` configuration has been dropped (:issue:`3663`) +- The ``pd.options.display.line_width`` configuration has been dropped (:issue:`2881`) - The ``pd.options.display.mpl_style`` configuration has been dropped (:issue:`12190`) - ``Index`` has dropped the ``.sym_diff()`` method in favor of ``.symmetric_difference()`` (:issue:`12591`) - ``Categorical`` has dropped the ``.order()`` and ``.sort()`` methods in favor of ``.sort_values()`` (:issue:`12882`) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index ae3001564a62f..06ce811703a8c 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -213,14 +213,6 @@ def use_numexpr_cb(key): (currently both are identical) """ -pc_line_width_deprecation_warning = """\ -line_width has been deprecated, use display.width instead (currently both are -identical) -""" - -pc_height_deprecation_warning = """\ -height has been deprecated. -""" pc_width_doc = """ : int @@ -383,14 +375,6 @@ def table_schema_cb(key): cf.register_option('html.border', 1, pc_html_border_doc, validator=is_int) - -cf.deprecate_option('display.line_width', - msg=pc_line_width_deprecation_warning, - rkey='display.width') - -cf.deprecate_option('display.height', msg=pc_height_deprecation_warning, - rkey='display.max_rows') - with cf.config_prefix('html'): cf.register_option('border', 1, pc_html_border_doc, validator=is_int) diff --git a/pandas/io/formats/console.py b/pandas/io/formats/console.py index ab75e3fa253ce..bdff59939a4de 100644 --- a/pandas/io/formats/console.py +++ b/pandas/io/formats/console.py @@ -53,7 +53,7 @@ def get_console_size(): display_width = get_option('display.width') # deprecated. - display_height = get_option('display.height', silent=True) + display_height = get_option('display.max_rows') # Consider # interactive shell terminal, can detect term size @@ -71,7 +71,7 @@ def get_console_size(): # match default for width,height in config_init from pandas.core.config import get_default_val terminal_width = get_default_val('display.width') - terminal_height = get_default_val('display.height') + terminal_height = get_default_val('display.max_rows') else: # pure terminal terminal_width, terminal_height = get_terminal_size() diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 679d43ac492ca..e1499565ce4a6 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -302,7 +302,7 @@ def test_repr_non_interactive(self): df = DataFrame('hello', lrange(1000), lrange(5)) with option_context('mode.sim_interactive', False, 'display.width', 0, - 'display.height', 0, 'display.max_rows', 5000): + 'display.max_rows', 5000): assert not has_truncated_repr(df) assert not has_expanded_repr(df) From 81f8acef11e8d1e2f0ea78a7b57ee04bef1f6038 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 17 Jul 2017 16:29:57 -0700 Subject: [PATCH 49/54] COMPAT: Add back remove_na for seaborn (#16992) Closes gh-16971. 
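
A minimal sketch of how the restored shim behaves for a downstream
caller such as seaborn; the Series contents below are illustrative
assumptions, not taken from this patch:

    import warnings
    import pandas as pd
    from pandas.core.series import remove_na  # deprecated shim

    s = pd.Series([1.0, None, 3.0])

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        cleaned = remove_na(s)  # delegates to remove_na_arraylike
    assert issubclass(w[-1].category, FutureWarning)

    # the equivalent, non-deprecated spelling
    assert cleaned.equals(s.dropna())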
--- pandas/core/series.py | 12 +++++++++++- pandas/tests/series/test_missing.py | 6 ++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 4d5b718ce0ae9..219eca4277f32 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -37,7 +37,6 @@ maybe_convert_platform, maybe_cast_to_datetime, maybe_castable) from pandas.core.dtypes.missing import isnull, notnull, remove_na_arraylike - from pandas.core.common import (is_bool_indexer, _default_index, _asarray_tuplesafe, @@ -88,6 +87,17 @@ versionadded_to_excel='\n .. versionadded:: 0.20.0\n') +# see gh-16971 +def remove_na(arr): + """ + DEPRECATED : this function will be removed in a future version. + """ + + warnings.warn("remove_na is deprecated and is a private " + "function. Do not use.", FutureWarning, stacklevel=2) + return remove_na_arraylike(arr) + + def _coerce_method(converter): """ install the scalar coercion methods """ diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 8e73c17684a16..b5948e75aa73e 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -15,6 +15,7 @@ MultiIndex, Index, Timestamp, NaT, IntervalIndex) from pandas.compat import range from pandas._libs.tslib import iNaT +from pandas.core.series import remove_na from pandas.util.testing import assert_series_equal, assert_frame_equal import pandas.util.testing as tm @@ -50,6 +51,11 @@ def _simple_ts(start, end, freq='D'): class TestSeriesMissingData(TestData): + def test_remove_na_deprecation(self): + # see gh-16971 + with tm.assert_produces_warning(FutureWarning): + remove_na(Series([])) + def test_timedelta_fillna(self): # GH 3371 s = Series([Timestamp('20130101'), Timestamp('20130101'), Timestamp( From 7b9a57fc99fcd63c55b041ea7c76f5c390c12aa0 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 17 Jul 2017 21:31:42 -0400 Subject: [PATCH 50/54] COMPAT: np.full not available in all versions, xref #16773 (#17000) --- pandas/core/sparse/frame.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index e157ae16e71f9..5fe96d70fc16f 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -163,7 +163,9 @@ def _init_dict(self, data, index, columns, dtype=None): # TODO: figure out how to handle this case, all nan's? # add in any other columns we want to have (completeness) - nan_arr = sp_maker(np.full(len(index), np.nan)) + nan_arr = np.empty(len(index), dtype='float64') + nan_arr.fill(np.nan) + nan_arr = sp_maker(nan_arr) sdict.update((c, nan_arr) for c in columns if c not in sdict) return to_manager(sdict, columns, index) From fcb0263762a31724ba6db39bf1564569dda068a0 Mon Sep 17 00:00:00 2001 From: Lucas Kushner Date: Tue, 18 Jul 2017 00:01:26 -0500 Subject: [PATCH 51/54] DOC, TST: Clarify whitespace behavior in read_fwf documentation (#16950) Closes gh-16772 --- doc/source/io.rst | 6 ++++- pandas/io/parsers.py | 13 ++++++----- pandas/tests/io/parser/test_read_fwf.py | 29 +++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 7 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 9bf84e5419ffa..495d4e9c3a5a3 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1258,7 +1258,8 @@ Files with Fixed Width Columns While ``read_csv`` reads delimited data, the :func:`read_fwf` function works with data files that have known and fixed column widths. 
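
A short sketch of the clarified behavior; the default case mirrors the
new tests, while the '~'-padded file is an assumed example:

    from io import StringIO
    import pandas as pd

    # by default, read_fwf treats tabs and spaces as filler characters
    data = "a  bbb\ncc dd "
    df = pd.read_fwf(StringIO(data), widths=[3, 3], header=None)

    # delimiter overrides the filler set, here stripping '~' padding
    padded = "a~~bbb\ncc~dd~"
    df2 = pd.read_fwf(StringIO(padded), widths=[3, 3], header=None,
                      delimiter="~")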
The function parameters -to ``read_fwf`` are largely the same as `read_csv` with two extra parameters: +to ``read_fwf`` are largely the same as `read_csv` with two extra parameters, and +a different usage of the ``delimiter`` parameter: - ``colspecs``: A list of pairs (tuples) giving the extents of the fixed-width fields of each line as half-open intervals (i.e., [from, to[ ). @@ -1267,6 +1268,9 @@ to ``read_fwf`` are largely the same as `read_csv` with two extra parameters: behaviour, if not specified, is to infer. - ``widths``: A list of field widths which can be used instead of 'colspecs' if the intervals are contiguous. + - ``delimiter``: Characters to consider as filler characters in the fixed-width file. + Can be used to specify the filler character of the fields + if it is not spaces (e.g., '~'). .. ipython:: python :suppress: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 343bc7a74fde8..1e7d9d420b35d 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -63,8 +63,6 @@ file. For file URLs, a host is expected. For instance, a local file could be file ://localhost/path/to/table.csv %s -delimiter : str, default ``None`` - Alternative argument name for sep. delim_whitespace : boolean, default False Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be used as the sep. Equivalent to setting ``sep='\s+'``. If this option @@ -316,7 +314,9 @@ be used automatically. In addition, separators longer than 1 character and different from ``'\s+'`` will be interpreted as regular expressions and will also force the use of the Python parsing engine. Note that regex - delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``""" + delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'`` +delimiter : str, default ``None`` + Alternative argument name for sep.""" _read_csv_doc = """ Read CSV (comma-separated) file into DataFrame @@ -341,15 +341,16 @@ widths : list of ints. optional A list of field widths which can be used instead of 'colspecs' if the intervals are contiguous. +delimiter : str, default ``'\t' + ' '`` + Characters to consider as filler characters in the fixed-width file. + Can be used to specify the filler character of the fields + if it is not spaces (e.g., '~'). """ _read_fwf_doc = """ Read a table of fixed-width formatted lines into DataFrame %s - -Also, 'delimiter' is used to specify the filler character of the -fields if it is not spaces (e.g., '~'). 
""" % (_parser_params % (_fwf_widths, '')) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 0bfeb5215f370..ec1d1a2a51cdc 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -405,3 +405,32 @@ def test_skiprows_inference_empty(self): with pytest.raises(EmptyDataError): read_fwf(StringIO(test), skiprows=3) + + def test_whitespace_preservation(self): + # Addresses Issue #16772 + data_expected = """ + a ,bbb + cc,dd """ + expected = read_csv(StringIO(data_expected), header=None) + + test_data = """ + a bbb + ccdd """ + result = read_fwf(StringIO(test_data), widths=[3, 3], + header=None, skiprows=[0], delimiter="\n\t") + + tm.assert_frame_equal(result, expected) + + def test_default_delimiter(self): + data_expected = """ +a,bbb +cc,dd""" + expected = read_csv(StringIO(data_expected), header=None) + + test_data = """ +a \tbbb +cc\tdd """ + result = read_fwf(StringIO(test_data), widths=[3, 3], + header=None, skiprows=[0]) + + tm.assert_frame_equal(result, expected) From 9e7666dae3b3b10d987ce154a51c78bcee6e0728 Mon Sep 17 00:00:00 2001 From: chris-b1 Date: Tue, 18 Jul 2017 06:26:44 -0500 Subject: [PATCH 52/54] API: add infer_objects for soft conversions (#16915) * API: add infer_objects for soft conversions * doc fixups * fixups * doc --- doc/source/api.rst | 2 + doc/source/basics.rst | 23 ++++++++- doc/source/whatsnew/v0.21.0.txt | 32 +++++++++++++ pandas/core/generic.py | 56 ++++++++++++++++++++-- pandas/tests/frame/test_block_internals.py | 26 ++++++++++ pandas/tests/series/test_dtypes.py | 18 +++++++ 6 files changed, 153 insertions(+), 4 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index d6053791d6f4b..77d095a965221 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -270,6 +270,7 @@ Conversion :toctree: generated/ Series.astype + Series.infer_objects Series.copy Series.isnull Series.notnull @@ -777,6 +778,7 @@ Conversion DataFrame.astype DataFrame.convert_objects + DataFrame.infer_objects DataFrame.copy DataFrame.isnull DataFrame.notnull diff --git a/doc/source/basics.rst b/doc/source/basics.rst index d8b1602fb104d..4211b15203721 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -2024,7 +2024,28 @@ object conversion ~~~~~~~~~~~~~~~~~ pandas offers various functions to try to force conversion of types from the ``object`` dtype to other types. -The following functions are available for one dimensional object arrays or scalars: +In cases where the data is already of the correct type, but stored in an ``object`` array, the +:meth:`~DataFrame.infer_objects` and :meth:`~Series.infer_objects` can be used to soft convert +to the correct type. + + .. ipython:: python + + df = pd.DataFrame([[1, 2], + ['a', 'b'], + [datetime.datetime(2016, 3, 2), datetime.datetime(2016, 3, 2)]]) + df = df.T + df + df.dtypes + +Because the data transposed the original inference stored all columns as object, which +``infer_objects`` will correct. + + .. 
+ .. ipython:: python
+
+   df.infer_objects().dtypes
+
+The following functions are available for one dimensional object arrays or scalars to perform
+hard conversion of objects to a specified type:

 - :meth:`~pandas.to_numeric` (conversion to numeric dtypes)
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index c63d4575bac43..cba3691b25ab1 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -25,6 +25,39 @@ New features
 - Added ``__fspath__`` method to :class:`~pandas.HDFStore`, :class:`~pandas.ExcelFile`,
   and :class:`~pandas.ExcelWriter` to work properly with the file system path protocol (:issue:`13823`)

+
+.. _whatsnew_0210.enhancements.infer_objects:
+
+``infer_objects`` type conversion
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The :meth:`~DataFrame.infer_objects` and :meth:`~Series.infer_objects`
+methods have been added to perform dtype inference on object columns, replacing
+some of the functionality of the deprecated ``convert_objects``
+method. See the documentation :ref:`here <basics.object_conversion>`
+for more details. (:issue:`11221`)
+
+This method only performs soft conversions on object columns, converting Python objects
+to native types, but not any coercive conversions. For example:
+
+.. ipython:: python
+
+   df = pd.DataFrame({'A': [1, 2, 3],
+                      'B': np.array([1, 2, 3], dtype='object'),
+                      'C': ['1', '2', '3']})
+   df.dtypes
+   df.infer_objects().dtypes
+
+Note that column ``'C'`` was not converted - only scalar numeric types
+will be inferred to a new type. Other types of conversion should be accomplished
+using the :func:`to_numeric` function (or :func:`to_datetime`, :func:`to_timedelta`).
+
+.. ipython:: python
+
+   df = df.infer_objects()
+   df['C'] = pd.to_numeric(df['C'], errors='coerce')
+   df.dtypes
+
 .. _whatsnew_0210.enhancements.other:

 Other Enhancements
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index f12592feaa4c3..c95129bdaa005 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -3671,9 +3671,12 @@ def convert_objects(self, convert_dates=True, convert_numeric=False,
         converted : same as input object
         """
         from warnings import warn
-        warn("convert_objects is deprecated. Use the data-type specific "
-             "converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.",
-             FutureWarning, stacklevel=2)
+        msg = ("convert_objects is deprecated. To re-infer data dtypes for "
+               "object columns, use {klass}.infer_objects()\nFor all "
+               "other conversions use the data-type specific converters "
+               "pd.to_datetime, pd.to_timedelta and pd.to_numeric."
+               ).format(klass=self.__class__.__name__)
+        warn(msg, FutureWarning, stacklevel=2)

         return self._constructor(
             self._data.convert(convert_dates=convert_dates,
@@ -3681,6 +3684,53 @@ def convert_objects(self, convert_dates=True, convert_numeric=False,
                                convert_timedeltas=convert_timedeltas,
                                copy=copy)).__finalize__(self)

+    def infer_objects(self):
+        """
+        Attempt to infer better dtypes for object columns.
+
+        Attempts soft conversion of object-dtyped
+        columns, leaving non-object and unconvertible
+        columns unchanged. The inference rules are the
+        same as during normal Series/DataFrame construction.
+
+        .. versionadded:: 0.21.0
+
+        See Also
+        --------
+        pandas.to_datetime : Convert argument to datetime.
+        pandas.to_timedelta : Convert argument to timedelta.
+        pandas.to_numeric : Convert argument to numeric type.
+
+        Returns
+        -------
+        converted : same type as input object
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"A": ["a", 1, 2, 3]})
+        >>> df = df.iloc[1:]
+        >>> df
+           A
+        1  1
+        2  2
+        3  3
+
+        >>> df.dtypes
+        A    object
+        dtype: object
+
+        >>> df.infer_objects().dtypes
+        A    int64
+        dtype: object
+        """
+        # numeric=False necessary to only soft convert;
+        # python objects will still be converted to
+        # native numpy numeric types
+        return self._constructor(
+            self._data.convert(datetime=True, numeric=False,
+                               timedelta=True, coerce=False,
+                               copy=True)).__finalize__(self)
+
     # ----------------------------------------------------------------------
     # Filling NA's

diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py
index c1a5b437be5d0..f66070fd66813 100644
--- a/pandas/tests/frame/test_block_internals.py
+++ b/pandas/tests/frame/test_block_internals.py
@@ -495,6 +495,32 @@ def test_convert_objects_no_conversion(self):
         mixed2 = mixed1._convert(datetime=True)
         assert_frame_equal(mixed1, mixed2)

+    def test_infer_objects(self):
+        # GH 11221
+        df = DataFrame({'a': ['a', 1, 2, 3],
+                        'b': ['b', 2.0, 3.0, 4.1],
+                        'c': ['c', datetime(2016, 1, 1),
+                              datetime(2016, 1, 2),
+                              datetime(2016, 1, 3)],
+                        'd': [1, 2, 3, 'd']},
+                       columns=['a', 'b', 'c', 'd'])
+        df = df.iloc[1:].infer_objects()
+
+        assert df['a'].dtype == 'int64'
+        assert df['b'].dtype == 'float64'
+        assert df['c'].dtype == 'M8[ns]'
+        assert df['d'].dtype == 'object'
+
+        expected = DataFrame({'a': [1, 2, 3],
+                              'b': [2.0, 3.0, 4.1],
+                              'c': [datetime(2016, 1, 1),
+                                    datetime(2016, 1, 2),
+                                    datetime(2016, 1, 3)],
+                              'd': [2, 3, 'd']},
+                             columns=['a', 'b', 'c', 'd'])
+        # reconstruct frame to verify inference is same
+        tm.assert_frame_equal(df.reset_index(drop=True), expected)
+
     def test_stale_cached_series_bug_473(self):

         # this is chained, but ok
diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py
index 2ec579842e33f..c214280ee8386 100644
--- a/pandas/tests/series/test_dtypes.py
+++ b/pandas/tests/series/test_dtypes.py
@@ -268,3 +268,21 @@ def test_series_to_categorical(self):
         expected = Series(['a', 'b', 'c'], dtype='category')

         tm.assert_series_equal(result, expected)
+
+    def test_infer_objects_series(self):
+        # GH 11221
+        actual = Series(np.array([1, 2, 3], dtype='O')).infer_objects()
+        expected = Series([1, 2, 3])
+        tm.assert_series_equal(actual, expected)
+
+        actual = Series(np.array([1, 2, 3, None], dtype='O')).infer_objects()
+        expected = Series([1., 2., 3., np.nan])
+        tm.assert_series_equal(actual, expected)
+
+        # only soft conversions; unconvertible values pass through unchanged
+        actual = (Series(np.array([1, 2, 3, None, 'a'], dtype='O'))
+                  .infer_objects())
+        expected = Series([1, 2, 3, None, 'a'])
+
+        assert actual.dtype == 'object'
+        tm.assert_series_equal(actual, expected)

From 6a5e56dc9402136e74e8c818a6947fd495bcd3b2 Mon Sep 17 00:00:00 2001
From: Jon Crall
Date: Tue, 18 Jul 2017 11:58:55 -0400
Subject: [PATCH 53/54] BUG: np.inf now causes Index to upcast from int to float (#16996)

Closes gh-16957.
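
A sketch of the behavior this enables, adapted from the new tests:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(columns=[0])
    df.loc[1] = 1
    df.loc[2] = 2
    df.loc[np.inf] = 3   # previously raised OverflowError

    # the integer row index is upcast to float64 so it can hold np.inf
    assert isinstance(df.index, pd.Float64Index)
    assert df.loc[np.inf, 0] == 3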
--- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/indexes/base.py | 6 +-- pandas/tests/indexing/test_indexing.py | 56 ++++++++++++++++++++++++++ 3 files changed, 60 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index cba3691b25ab1..2259eb7d89534 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -179,6 +179,7 @@ Bug Fixes ~~~~~~~~~ - Fixes regression in 0.20, :func:`Series.aggregate` and :func:`DataFrame.aggregate` allow dictionaries as return values again (:issue:`16741`) +- Fixes bug where indexing with ``np.inf`` caused an ``OverflowError`` to be raised (:issue:`16957`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index bbbc19b36964d..5d50f961927c7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -666,7 +666,7 @@ def _try_convert_to_int_index(cls, data, copy, name): res = data.astype('u8', copy=False) if (res == data).all(): return UInt64Index(res, copy=copy, name=name) - except (TypeError, ValueError): + except (OverflowError, TypeError, ValueError): pass raise ValueError @@ -1640,7 +1640,7 @@ def __contains__(self, key): hash(key) try: return key in self._engine - except (TypeError, ValueError): + except (OverflowError, TypeError, ValueError): return False _index_shared_docs['contains'] = """ @@ -3365,7 +3365,7 @@ def _maybe_cast_indexer(self, key): ckey = int(key) if ckey == key: key = ckey - except (ValueError, TypeError): + except (OverflowError, ValueError, TypeError): pass return key diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 9fa677eb624ae..98f5d5eb140df 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -63,6 +63,34 @@ def f(): pytest.raises(ValueError, f) + def test_inf_upcast(self): + # GH 16957 + # We should be able to use np.inf as a key + # np.inf should cause an index to convert to float + + # Test with np.inf in rows + df = pd.DataFrame(columns=[0]) + df.loc[1] = 1 + df.loc[2] = 2 + df.loc[np.inf] = 3 + + # make sure we can look up the value + assert df.loc[np.inf, 0] == 3 + + result = df.index + expected = pd.Float64Index([1, 2, np.inf]) + tm.assert_index_equal(result, expected) + + # Test with np.inf in columns + df = pd.DataFrame() + df.loc[0, 0] = 1 + df.loc[1, 1] = 2 + df.loc[0, np.inf] = 3 + + result = df.columns + expected = pd.Float64Index([0, 1, np.inf]) + tm.assert_index_equal(result, expected) + def test_setitem_dtype_upcast(self): # GH3216 @@ -542,6 +570,34 @@ def test_astype_assignment_with_dups(self): # result = df.get_dtype_counts().sort_index() # expected = Series({'float64': 2, 'object': 1}).sort_index() + @pytest.mark.parametrize("index,val", [ + (pd.Index([0, 1, 2]), 2), + (pd.Index([0, 1, '2']), '2'), + (pd.Index([0, 1, 2, np.inf, 4]), 4), + (pd.Index([0, 1, 2, np.nan, 4]), 4), + (pd.Index([0, 1, 2, np.inf]), np.inf), + (pd.Index([0, 1, 2, np.nan]), np.nan), + ]) + def test_index_contains(self, index, val): + assert val in index + + @pytest.mark.parametrize("index,val", [ + (pd.Index([0, 1, 2]), '2'), + (pd.Index([0, 1, '2']), 2), + (pd.Index([0, 1, 2, np.inf]), 4), + (pd.Index([0, 1, 2, np.nan]), 4), + (pd.Index([0, 1, 2, np.inf]), np.nan), + (pd.Index([0, 1, 2, np.nan]), np.inf), + # Checking if np.inf in Int64Index should not cause an OverflowError + # Related to GH 16957 + (pd.Int64Index([0, 1, 2]), np.inf), + (pd.Int64Index([0, 1, 2]), np.nan), + (pd.UInt64Index([0, 1, 
2]), np.inf), + (pd.UInt64Index([0, 1, 2]), np.nan), + ]) + def test_index_not_contains(self, index, val): + assert val not in index + def test_index_type_coercion(self): with catch_warnings(record=True): From 34210ac4d8c61ec4d695baba24d84bd7a1826af4 Mon Sep 17 00:00:00 2001 From: parchd-1 Date: Tue, 18 Jul 2017 18:08:03 +0200 Subject: [PATCH 54/54] DOC: Make highlight functions match documentation (#16999) Closes gh-16998. --- pandas/io/formats/style.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index b08d3877f3b03..d88a230b42403 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -1054,9 +1054,9 @@ def highlight_max(self, subset=None, color='yellow', axis=0): subset: IndexSlice, default None a valid slice for ``data`` to limit the style application to color: str, default 'yellow' - axis: int, str, or None; default None - 0 or 'index' for columnwise, 1 or 'columns' for rowwise - or ``None`` for tablewise (the default) + axis: int, str, or None; default 0 + 0 or 'index' for columnwise (default), 1 or 'columns' for rowwise, + or ``None`` for tablewise Returns ------- @@ -1076,9 +1076,9 @@ def highlight_min(self, subset=None, color='yellow', axis=0): subset: IndexSlice, default None a valid slice for ``data`` to limit the style application to color: str, default 'yellow' - axis: int, str, or None; default None - 0 or 'index' for columnwise, 1 or 'columns' for rowwise - or ``None`` for tablewise (the default) + axis: int, str, or None; default 0 + 0 or 'index' for columnwise (default), 1 or 'columns' for rowwise, + or ``None`` for tablewise Returns -------