Merge remote-tracking branch 'upstream/master' into multi-index-join
harisbal committed Oct 7, 2018
2 parents 2d61a12 + 5551bcf, commit 01ae19e
Showing 167 changed files with 4,125 additions and 2,080 deletions.
1 change: 0 additions & 1 deletion .pep8speaks.yml
@@ -8,5 +8,4 @@ pycodestyle:
     ignore:  # Errors and warnings to ignore
     - E402,  # module level import not at top of file
     - E731,  # do not assign a lambda expression, use a def
-    - E741,  # do not use variables named 'l', 'O', or 'I'
     - W503   # line break before binary operator
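
Note on this hunk: dropping E741 from the ignore list means pycodestyle (via pep8speaks) now reports ambiguous single-character names in review comments. A minimal illustration of what the check flags (hypothetical snippet, not from this commit):

```python
# E741 fires on single-character names easily confused with digits
# in many fonts: 'l', 'O', and 'I'.
l = [1, 2, 3]       # flagged: ambiguous variable name 'l'

values = [1, 2, 3]  # fine: a descriptive name satisfies the check
```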
19 changes: 7 additions & 12 deletions .travis.yml
@@ -53,18 +53,20 @@ matrix:
     - dist: trusty
       env:
       - JOB="3.6, coverage" ENV_FILE="ci/travis-36.yaml" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" COVERAGE=true DOCTEST=true
-  # In allow_failures
-    - dist: trusty
-      env:
-      - JOB="3.6, slow" ENV_FILE="ci/travis-36-slow.yaml" SLOW=true
-  # In allow_failures
+
     - dist: trusty
       env:
       - JOB="3.7, NumPy dev" ENV_FILE="ci/travis-37-numpydev.yaml" TEST_ARGS="--skip-slow --skip-network -W error" PANDAS_TESTING_MODE="deprecate"
       addons:
         apt:
           packages:
           - xsel
+
+  # In allow_failures
+    - dist: trusty
+      env:
+      - JOB="3.6, slow" ENV_FILE="ci/travis-36-slow.yaml" SLOW=true
+
   # In allow_failures
     - dist: trusty
       env:
@@ -73,13 +75,6 @@ matrix:
     - dist: trusty
       env:
       - JOB="3.6, slow" ENV_FILE="ci/travis-36-slow.yaml" SLOW=true
-    - dist: trusty
-      env:
-      - JOB="3.7, NumPy dev" ENV_FILE="ci/travis-37-numpydev.yaml" TEST_ARGS="--skip-slow --skip-network -W error" PANDAS_TESTING_MODE="deprecate"
-      addons:
-        apt:
-          packages:
-          - xsel
     - dist: trusty
       env:
       - JOB="3.6, doc" ENV_FILE="ci/travis-36-doc.yaml" DOC=true
6 changes: 3 additions & 3 deletions README.md
@@ -56,8 +56,8 @@
 <tr>
   <td></td>
   <td>
-    <a href="https://ci.appveyor.com/project/pandas-dev/pandas">
-      <img src="https://ci.appveyor.com/api/projects/status/86vn83mxgnl4xf1s/branch/master?svg=true" alt="appveyor build status" />
+    <a href="https://dev.azure.com/pandas-dev/pandas/_build/latest?definitionId=1&branch=master">
+      <img src="https://dev.azure.com/pandas-dev/pandas/_apis/build/status/pandas-dev.pandas?branch=master" alt="Azure Pipelines build status" />
     </a>
   </td>
 </tr>
@@ -97,7 +97,7 @@ easy and intuitive. It aims to be the fundamental high-level building block for
 doing practical, **real world** data analysis in Python. Additionally, it has
 the broader goal of becoming **the most powerful and flexible open source data
 analysis / manipulation tool available in any language**. It is already well on
-its way toward this goal.
+its way towards this goal.
 
 ## Main Features
 Here are just a few of the things that pandas does well:
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/algorithms.py
@@ -9,7 +9,7 @@
     try:
         hashing = import_module(imp)
         break
-    except:
+    except (ImportError, TypeError, ValueError):
         pass
 
 from .pandas_vb_common import setup  # noqa
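
Context for the hunk above: a bare `except:` also swallows `KeyboardInterrupt` and `SystemExit`, so the benchmark's compatibility import now catches only the errors a failed import is expected to raise. A self-contained sketch of the pattern; the module list here is illustrative, since the actual list sits outside this hunk:

```python
from importlib import import_module

hashing = None
# Try newer and older module paths in turn (illustrative paths).
for imp in ['pandas.util._hashing', 'pandas.tools.hashing']:
    try:
        hashing = import_module(imp)
        break
    except (ImportError, TypeError, ValueError):
        # Only the failure modes of an import are silenced; a bare
        # 'except:' would also hide KeyboardInterrupt and SystemExit.
        pass
```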
13 changes: 10 additions & 3 deletions asv_bench/benchmarks/frame_methods.py
@@ -505,14 +505,21 @@ class NSort(object):
     param_names = ['keep']
 
     def setup(self, keep):
-        self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC'))
+        self.df = DataFrame(np.random.randn(100000, 3),
+                            columns=list('ABC'))
 
-    def time_nlargest(self, keep):
+    def time_nlargest_one_column(self, keep):
         self.df.nlargest(100, 'A', keep=keep)
 
-    def time_nsmallest(self, keep):
+    def time_nlargest_two_columns(self, keep):
+        self.df.nlargest(100, ['A', 'B'], keep=keep)
+
+    def time_nsmallest_one_column(self, keep):
         self.df.nsmallest(100, 'A', keep=keep)
 
+    def time_nsmallest_two_columns(self, keep):
+        self.df.nsmallest(100, ['A', 'B'], keep=keep)
+
 
 class Describe(object):
 
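
The NSort benchmarks above now cover single- and multi-column selection on a larger frame. For reference, a usage sketch of the API being timed, under the same setup as the benchmark:

```python
import numpy as np
from pandas import DataFrame

df = DataFrame(np.random.randn(100000, 3), columns=list('ABC'))

# Top 100 rows ordered by column 'A'; keep= controls which duplicates
# at the cutoff are retained ('first' or 'last').
top_a = df.nlargest(100, 'A', keep='first')

# With a list of columns, 'B' is used to break ties in 'A'.
top_ab = df.nlargest(100, ['A', 'B'], keep='first')
bot_ab = df.nsmallest(100, ['A', 'B'], keep='first')
```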
83 changes: 49 additions & 34 deletions asv_bench/benchmarks/indexing.py
@@ -2,104 +2,119 @@
 
 import numpy as np
 import pandas.util.testing as tm
-from pandas import (Series, DataFrame, MultiIndex, Int64Index, Float64Index,
-                    IntervalIndex, CategoricalIndex,
-                    IndexSlice, concat, date_range)
-from .pandas_vb_common import setup, Panel  # noqa
+from pandas import (Series, DataFrame, MultiIndex, Panel,
+                    Int64Index, Float64Index, IntervalIndex,
+                    CategoricalIndex, IndexSlice, concat, date_range)
+from .pandas_vb_common import setup  # noqa
 
 
 class NumericSeriesIndexing(object):
 
     goal_time = 0.2
-    params = [Int64Index, Float64Index]
-    param = ['index']
+    params = [
+        (Int64Index, Float64Index),
+        ('unique_monotonic_inc', 'nonunique_monotonic_inc'),
+    ]
+    param_names = ['index_dtype', 'index_structure']
 
-    def setup(self, index):
+    def setup(self, index, index_structure):
         N = 10**6
-        idx = index(range(N))
-        self.data = Series(np.random.rand(N), index=idx)
+        indices = {
+            'unique_monotonic_inc': index(range(N)),
+            'nonunique_monotonic_inc': index(
+                list(range(55)) + [54] + list(range(55, N - 1))),
+        }
+        self.data = Series(np.random.rand(N), index=indices[index_structure])
         self.array = np.arange(10000)
         self.array_list = self.array.tolist()
 
-    def time_getitem_scalar(self, index):
+    def time_getitem_scalar(self, index, index_structure):
         self.data[800000]
 
-    def time_getitem_slice(self, index):
+    def time_getitem_slice(self, index, index_structure):
         self.data[:800000]
 
-    def time_getitem_list_like(self, index):
+    def time_getitem_list_like(self, index, index_structure):
         self.data[[800000]]
 
-    def time_getitem_array(self, index):
+    def time_getitem_array(self, index, index_structure):
         self.data[self.array]
 
-    def time_getitem_lists(self, index):
+    def time_getitem_lists(self, index, index_structure):
         self.data[self.array_list]
 
-    def time_iloc_array(self, index):
+    def time_iloc_array(self, index, index_structure):
         self.data.iloc[self.array]
 
-    def time_iloc_list_like(self, index):
+    def time_iloc_list_like(self, index, index_structure):
         self.data.iloc[[800000]]
 
-    def time_iloc_scalar(self, index):
+    def time_iloc_scalar(self, index, index_structure):
         self.data.iloc[800000]
 
-    def time_iloc_slice(self, index):
+    def time_iloc_slice(self, index, index_structure):
         self.data.iloc[:800000]
 
-    def time_ix_array(self, index):
+    def time_ix_array(self, index, index_structure):
         self.data.ix[self.array]
 
-    def time_ix_list_like(self, index):
+    def time_ix_list_like(self, index, index_structure):
         self.data.ix[[800000]]
 
-    def time_ix_scalar(self, index):
+    def time_ix_scalar(self, index, index_structure):
         self.data.ix[800000]
 
-    def time_ix_slice(self, index):
+    def time_ix_slice(self, index, index_structure):
         self.data.ix[:800000]
 
-    def time_loc_array(self, index):
+    def time_loc_array(self, index, index_structure):
         self.data.loc[self.array]
 
-    def time_loc_list_like(self, index):
+    def time_loc_list_like(self, index, index_structure):
         self.data.loc[[800000]]
 
-    def time_loc_scalar(self, index):
+    def time_loc_scalar(self, index, index_structure):
         self.data.loc[800000]
 
-    def time_loc_slice(self, index):
+    def time_loc_slice(self, index, index_structure):
         self.data.loc[:800000]
 
 
 class NonNumericSeriesIndexing(object):
 
     goal_time = 0.2
-    params = ['string', 'datetime']
-    param_names = ['index']
+    params = [
+        ('string', 'datetime'),
+        ('unique_monotonic_inc', 'nonunique_monotonic_inc'),
+    ]
+    param_names = ['index_dtype', 'index_structure']
 
-    def setup(self, index):
-        N = 10**5
+    def setup(self, index, index_structure):
+        N = 10**6
         indexes = {'string': tm.makeStringIndex(N),
                    'datetime': date_range('1900', periods=N, freq='s')}
         index = indexes[index]
+        if index_structure == 'nonunique_monotonic_inc':
+            index = index.insert(item=index[2], loc=2)[:-1]
         self.s = Series(np.random.rand(N), index=index)
         self.lbl = index[80000]
 
-    def time_getitem_label_slice(self, index):
+    def time_getitem_label_slice(self, index, index_structure):
         self.s[:self.lbl]
 
-    def time_getitem_pos_slice(self, index):
+    def time_getitem_pos_slice(self, index, index_structure):
         self.s[:80000]
 
-    def time_get_value(self, index):
+    def time_get_value(self, index, index_structure):
         with warnings.catch_warnings(record=True):
             self.s.get_value(self.lbl)
 
-    def time_getitem_scalar(self, index):
+    def time_getitem_scalar(self, index, index_structure):
         self.s[self.lbl]
 
+    def time_getitem_list_like(self, index, index_structure):
+        self.s[[self.lbl]]
+
 
 class DataFrameStringIndexing(object):
 
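
What drives the signature churn above: asv treats `params` as a list of parameter lists and benchmarks their cartesian product, passing one value from each list to `setup()` and to every `time_*` method, in order. A plain-Python sketch of that convention (not the asv API itself):

```python
from itertools import product

# Mirrors the class attributes in the hunk above: 2 index dtypes x
# 2 index structures = 4 benchmark cases.
params = [
    ('Int64Index', 'Float64Index'),
    ('unique_monotonic_inc', 'nonunique_monotonic_inc'),
]
param_names = ['index_dtype', 'index_structure']

for index_dtype, index_structure in product(*params):
    # asv would call setup(index_dtype, index_structure) and then each
    # time_* method with the same two arguments.
    print(index_dtype, index_structure)
```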
6 changes: 2 additions & 4 deletions asv_bench/benchmarks/io/csv.py
@@ -1,11 +1,9 @@
 import random
-import timeit
 import string
 
 import numpy as np
 import pandas.util.testing as tm
 from pandas import DataFrame, Categorical, date_range, read_csv
-from pandas.compat import PY2
 from pandas.compat import cStringIO as StringIO
 
 from ..pandas_vb_common import setup, BaseIO  # noqa
@@ -181,8 +179,8 @@ def time_read_csv(self, sep, decimal, float_precision):
                  names=list('abc'), float_precision=float_precision)
 
     def time_read_csv_python_engine(self, sep, decimal, float_precision):
-        read_csv(self.data(self.StringIO_input), sep=sep, header=None, engine='python',
-                 float_precision=None, names=list('abc'))
+        read_csv(self.data(self.StringIO_input), sep=sep, header=None,
+                 engine='python', float_precision=None, names=list('abc'))
9 changes: 5 additions & 4 deletions asv_bench/benchmarks/join_merge.py
@@ -3,14 +3,15 @@
 
 import numpy as np
 import pandas.util.testing as tm
-from pandas import (DataFrame, Series, MultiIndex, date_range, concat, merge,
-                    merge_asof)
+from pandas import (DataFrame, Series, Panel, MultiIndex,
+                    date_range, concat, merge, merge_asof)
 
 try:
     from pandas import merge_ordered
 except ImportError:
     from pandas import ordered_merge as merge_ordered
 
-from .pandas_vb_common import Panel, setup  # noqa
+from .pandas_vb_common import setup  # noqa
+
 
 class Append(object):
@@ -29,7 +30,7 @@ def setup(self):
         try:
             with warnings.catch_warnings(record=True):
                 self.mdf1.consolidate(inplace=True)
-        except:
+        except (AttributeError, TypeError):
             pass
         self.mdf2 = self.mdf1.copy()
         self.mdf2.index = self.df2.index
5 changes: 2 additions & 3 deletions asv_bench/benchmarks/pandas_vb_common.py
@@ -2,14 +2,13 @@
 from importlib import import_module
 
 import numpy as np
-from pandas import Panel
 
 # Compatibility import for lib
 for imp in ['pandas._libs.lib', 'pandas.lib']:
     try:
         lib = import_module(imp)
         break
-    except:
+    except (ImportError, TypeError, ValueError):
         pass
 
 numeric_dtypes = [np.int64, np.int32, np.uint32, np.uint64, np.float32,
@@ -34,7 +33,7 @@ def remove(self, f):
         """Remove created files"""
         try:
             os.remove(f)
-        except:
+        except OSError:
             # On Windows, attempting to remove a file that is in use
             # causes an exception to be raised
             pass
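
The `remove` fix above narrows another bare `except:` to `OSError`, which is what `os.remove` raises when a file is missing or, on Windows, still in use. An equivalent modern idiom, shown here as an alternative rather than what the benchmark code uses:

```python
import os
from contextlib import suppress


def remove(f):
    """Remove a created file, ignoring a missing file or, on Windows,
    one still held open by another process."""
    with suppress(OSError):
        os.remove(f)
```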
4 changes: 2 additions & 2 deletions asv_bench/benchmarks/panel_ctor.py
@@ -1,9 +1,9 @@
 import warnings
 from datetime import datetime, timedelta
 
-from pandas import DataFrame, DatetimeIndex, date_range
+from pandas import DataFrame, Panel, DatetimeIndex, date_range
 
-from .pandas_vb_common import Panel, setup  # noqa
+from .pandas_vb_common import setup  # noqa
 
 
 class DifferentIndexes(object):
3 changes: 2 additions & 1 deletion asv_bench/benchmarks/panel_methods.py
@@ -1,8 +1,9 @@
 import warnings
 
 import numpy as np
+from pandas import Panel
 
-from .pandas_vb_common import Panel, setup  # noqa
+from .pandas_vb_common import setup  # noqa
 
 
 class PanelMethods(object):
4 changes: 2 additions & 2 deletions asv_bench/benchmarks/stat_ops.py
@@ -18,7 +18,7 @@ def setup(self, op, dtype, axis, use_bottleneck):
         df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype)
         try:
             pd.options.compute.use_bottleneck = use_bottleneck
-        except:
+        except TypeError:
             from pandas.core import nanops
             nanops._USE_BOTTLENECK = use_bottleneck
         self.df_func = getattr(df, op)
@@ -56,7 +56,7 @@ def setup(self, op, dtype, use_bottleneck):
         s = pd.Series(np.random.randn(100000)).astype(dtype)
         try:
             pd.options.compute.use_bottleneck = use_bottleneck
-        except:
+        except TypeError:
             from pandas.core import nanops
             nanops._USE_BOTTLENECK = use_bottleneck
         self.s_func = getattr(s, op)
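
Background for the two hunks above: the `compute.use_bottleneck` option only exists on newer pandas versions, so when setting it fails the benchmark falls back to the private nanops flag. A hedged, self-contained sketch of that toggle (the private attribute is internal and may change between releases):

```python
import numpy as np
import pandas as pd

use_bottleneck = False
try:
    # Public option on newer pandas versions.
    pd.options.compute.use_bottleneck = use_bottleneck
except TypeError:
    # Older versions: poke the private switch directly.
    from pandas.core import nanops
    nanops._USE_BOTTLENECK = use_bottleneck

s = pd.Series(np.random.randn(100000))
print(s.sum())  # runs with bottleneck disabled
```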
1 change: 0 additions & 1 deletion asv_bench/benchmarks/timeseries.py
@@ -1,4 +1,3 @@
-import warnings
 from datetime import timedelta
 
 import numpy as np
4 changes: 2 additions & 2 deletions azure-pipelines.yml
@@ -18,8 +18,8 @@ jobs:
 - template: ci/azure/windows.yml
   parameters:
     name: Windows
-    vmImage: vs2017-win2017
+    vmImage: vs2017-win2016
 - template: ci/azure/windows-py27.yml
   parameters:
     name: WindowsPy27
-    vmImage: vs2017-win2017
+    vmImage: vs2017-win2016
4 changes: 4 additions & 0 deletions ci/azure/macos.yml
@@ -37,3 +37,7 @@ jobs:
   - script: |
       export PATH=$HOME/miniconda3/bin:$PATH
       source activate pandas && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd
+  - task: PublishTestResults@2
+    inputs:
+      testResultsFiles: '/tmp/*.xml'
+      testRunTitle: 'MacOS-35'