Commit a609f48

Merge remote-tracking branch 'upstream/master' into fu1+fillna
TomAugspurger committed Mar 12, 2018
2 parents b342efe + 0815c43 commit a609f48
Showing 96 changed files with 6,498 additions and 4,054 deletions.
24 changes: 24 additions & 0 deletions .github/PULL_REQUEST_TEMPLATE.md
@@ -1,3 +1,27 @@
Checklist for the pandas documentation sprint (ignore this if you are doing
an unrelated PR):

- [ ] PR title is "DOC: update the <your-function-or-method> docstring"
- [ ] The validation script passes: `scripts/validate_docstrings.py <your-function-or-method>`
- [ ] The PEP8 style check passes: `git diff upstream/master -u -- "*.py" | flake8 --diff`
- [ ] The html version looks good: `python doc/make.py --single <your-function-or-method>`
- [ ] It has been proofread for language by another sprint participant

Please include the output of the validation script below between the "```" ticks:

```
# paste output of "scripts/validate_docstrings.py <your-function-or-method>" here
# between the "```" (remove this comment, but keep the "```")
```

If the validation script still gives errors, but you think there is a good reason
to deviate in this case (and there are certainly such cases), please state this
explicitly.


Checklist for other PRs (remove this part if you are doing a PR for the pandas documentation sprint):

- [ ] closes #xxxx
- [ ] tests added / passed
- [ ] passes `git diff upstream/master -u -- "*.py" | flake8 --diff`
5 changes: 3 additions & 2 deletions .gitignore
@@ -88,8 +88,9 @@ scikits
*.c
*.cpp

# Performance Testing #
#######################
# Unit / Performance Testing #
##############################
.pytest_cache/
asv_bench/env/
asv_bench/html/
asv_bench/results/
137 changes: 58 additions & 79 deletions asv_bench/benchmarks/groupby.py
@@ -14,7 +14,10 @@
method_blacklist = {
'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean',
'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min',
'var', 'mad', 'describe', 'std'}
'var', 'mad', 'describe', 'std'},
'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew',
'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe',
'std'}
}


@@ -90,45 +93,6 @@ def time_series_groups(self, data, key):
self.ser.groupby(self.ser).groups


class FirstLast(object):

goal_time = 0.2

param_names = ['dtype']
params = ['float32', 'float64', 'datetime', 'object']

def setup(self, dtype):
N = 10**5
# with datetimes (GH7555)
if dtype == 'datetime':
self.df = DataFrame({'values': date_range('1/1/2011',
periods=N,
freq='s'),
'key': range(N)})
elif dtype == 'object':
self.df = DataFrame({'values': ['foo'] * N,
'key': range(N)})
else:
labels = np.arange(N / 10).repeat(10)
data = Series(np.random.randn(len(labels)), dtype=dtype)
data[::3] = np.nan
data[1::3] = np.nan
labels = labels.take(np.random.permutation(len(labels)))
self.df = DataFrame({'values': data, 'key': labels})

def time_groupby_first(self, dtype):
self.df.groupby('key').first()

def time_groupby_last(self, dtype):
self.df.groupby('key').last()

def time_groupby_nth_all(self, dtype):
self.df.groupby('key').nth(0, dropna='all')

def time_groupby_nth_none(self, dtype):
self.df.groupby('key').nth(0)


class GroupManyLabels(object):

goal_time = 0.2
@@ -149,39 +113,40 @@ class Nth(object):

goal_time = 0.2

def setup_cache(self):
df = DataFrame(np.random.randint(1, 100, (10000, 2)))
df.iloc[1, 1] = np.nan
return df

def time_frame_nth_any(self, df):
df.groupby(0).nth(0, dropna='any')

def time_frame_nth(self, df):
df.groupby(0).nth(0)

param_names = ['dtype']
params = ['float32', 'float64', 'datetime', 'object']

def time_series_nth_any(self, df):
df[1].groupby(df[0]).nth(0, dropna='any')
def setup(self, dtype):
N = 10**5
# with datetimes (GH7555)
if dtype == 'datetime':
values = date_range('1/1/2011', periods=N, freq='s')
elif dtype == 'object':
values = ['foo'] * N
else:
values = np.arange(N).astype(dtype)

def time_series_nth(self, df):
df[1].groupby(df[0]).nth(0)
key = np.arange(N)
self.df = DataFrame({'key': key, 'values': values})
self.df.iloc[1, 1] = np.nan # insert missing data

def time_frame_nth_any(self, dtype):
self.df.groupby('key').nth(0, dropna='any')

class NthObject(object):
def time_groupby_nth_all(self, dtype):
self.df.groupby('key').nth(0, dropna='all')

goal_time = 0.2
def time_frame_nth(self, dtype):
self.df.groupby('key').nth(0)

def setup_cache(self):
df = DataFrame(np.random.randint(1, 100, (10000,)), columns=['g'])
df['obj'] = ['a'] * 5000 + ['b'] * 5000
return df
def time_series_nth_any(self, dtype):
self.df['values'].groupby(self.df['key']).nth(0, dropna='any')

def time_nth(self, df):
df.groupby('g').nth(5)
def time_series_nth_all(self, dtype):
self.df['values'].groupby(self.df['key']).nth(0, dropna='all')

def time_nth_last(self, df):
df.groupby('g').last()
def time_series_nth(self, dtype):
self.df['values'].groupby(self.df['key']).nth(0)


class DateAttributes(object):
@@ -243,7 +208,7 @@ def time_multi_count(self, df):
df.groupby(['key1', 'key2']).count()


class CountInt(object):
class CountMultiInt(object):

goal_time = 0.2

@@ -255,18 +220,18 @@ def setup_cache(self):
'ints2': np.random.randint(0, 1000, size=n)})
return df

def time_int_count(self, df):
def time_multi_int_count(self, df):
df.groupby(['key1', 'key2']).count()

def time_int_nunique(self, df):
def time_multi_int_nunique(self, df):
df.groupby(['key1', 'key2']).nunique()


class AggFunctions(object):

goal_time = 0.2

def setup_cache(self):
def setup_cache():
N = 10**5
fac1 = np.array(['A', 'B', 'C'], dtype='O')
fac2 = np.array(['one', 'two'], dtype='O')
@@ -361,9 +326,6 @@ def setup(self):
def time_multi_size(self):
self.df.groupby(['key1', 'key2']).size()

def time_dt_size(self):
self.df.groupby(['dates']).size()

def time_dt_timegrouper_size(self):
with warnings.catch_warnings(record=True):
self.df.groupby(TimeGrouper(key='dates', freq='M')).size()
@@ -376,15 +338,16 @@ class GroupByMethods(object):

goal_time = 0.2

param_names = ['dtype', 'method']
params = [['int', 'float', 'object'],
param_names = ['dtype', 'method', 'application']
params = [['int', 'float', 'object', 'datetime'],
['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin',
'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head',
'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique',
'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew',
'std', 'sum', 'tail', 'unique', 'value_counts', 'var']]
'std', 'sum', 'tail', 'unique', 'value_counts', 'var'],
['direct', 'transformation']]

def setup(self, dtype, method):
def setup(self, dtype, method, application):
if method in method_blacklist.get(dtype, {}):
raise NotImplementedError # skip benchmark
ngroups = 1000
@@ -398,12 +361,28 @@ def setup(self, dtype, method):
np.random.random(ngroups) * 10.0])
elif dtype == 'object':
key = ['foo'] * size
elif dtype == 'datetime':
key = date_range('1/1/2011', periods=size, freq='s')

df = DataFrame({'values': values, 'key': key})
self.df_groupby_method = getattr(df.groupby('key')['values'], method)

def time_method(self, dtype, method):
self.df_groupby_method()
if application == 'transformation':
if method == 'describe':
raise NotImplementedError

self.as_group_method = lambda: df.groupby(
'key')['values'].transform(method)
self.as_field_method = lambda: df.groupby(
'values')['key'].transform(method)
else:
self.as_group_method = getattr(df.groupby('key')['values'], method)
self.as_field_method = getattr(df.groupby('values')['key'], method)

def time_dtype_as_group(self, dtype, method, application):
self.as_group_method()

def time_dtype_as_field(self, dtype, method, application):
self.as_field_method()


class Float32(object):
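For readers unfamiliar with asv (airspeed velocity), the pattern used throughout this file is: `param_names`/`params` define a grid of benchmark variants, `setup` runs before each timed `time_*` method, and raising `NotImplementedError` from `setup` makes asv skip that parameter combination; this is how `method_blacklist` above excludes unsupported dtype/method pairs. Below is a minimal, self-contained sketch of that pattern. The class, its parameters, and the tiny driver are illustrative only, not part of this commit.

```python
import numpy as np
from pandas import DataFrame

# dtype/method pairs that cannot be benchmarked, mirroring method_blacklist
BLACKLIST = {'object': {'mean'}}


class ExampleGroupByBenchmark(object):

    goal_time = 0.2

    # asv builds one benchmark variant per (dtype, method, application) combo
    param_names = ['dtype', 'method', 'application']
    params = [['int', 'object'],
              ['mean', 'rank'],
              ['direct', 'transformation']]

    def setup(self, dtype, method, application):
        if method in BLACKLIST.get(dtype, set()):
            raise NotImplementedError  # asv reports this combination as skipped
        N = 1000
        values = np.arange(N) if dtype == 'int' else ['foo'] * N
        df = DataFrame({'key': np.arange(N) % 10, 'values': values})
        if application == 'transformation':
            # time the method applied through .transform(...)
            self.bench = lambda: df.groupby('key')['values'].transform(method)
        else:
            # time the bound groupby method directly, e.g. .mean() or .rank()
            self.bench = getattr(df.groupby('key')['values'], method)

    def time_method(self, dtype, method, application):
        self.bench()


if __name__ == '__main__':
    # smoke-test one variant outside of asv
    bench = ExampleGroupByBenchmark()
    bench.setup('int', 'rank', 'transformation')
    bench.time_method('int', 'rank', 'transformation')
```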
1 change: 1 addition & 0 deletions ci/environment-dev.yaml
@@ -5,6 +5,7 @@ channels:
dependencies:
- Cython
- NumPy
- flake8
- moto
- pytest>=3.1
- python-dateutil>=2.5.0
2 changes: 1 addition & 1 deletion ci/requirements-3.6_DOC.run
@@ -5,7 +5,7 @@ sphinx
nbconvert
nbformat
notebook
matplotlib
matplotlib=2.1*
seaborn
scipy
lxml
3 changes: 1 addition & 2 deletions ci/requirements-3.6_NUMPY_DEV.build.sh
@@ -12,8 +12,7 @@ PRE_WHEELS="https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf
pip install --pre --upgrade --timeout=60 -f $PRE_WHEELS numpy scipy

# install dateutil from master
# pip install -U git+git://github.com/dateutil/dateutil.git
pip install dateutil
pip install -U git+git://github.com/dateutil/dateutil.git

# cython via pip
pip install cython
3 changes: 2 additions & 1 deletion ci/requirements_dev.txt
@@ -2,9 +2,10 @@
# Do not modify directly
Cython
NumPy
flake8
moto
pytest>=3.1
python-dateutil>=2.5.0
pytz
setuptools>=3.3
sphinx
sphinx
13 changes: 9 additions & 4 deletions doc/make.py
@@ -11,6 +11,7 @@
$ python make.py html
$ python make.py latex
"""
import importlib
import sys
import os
import shutil
@@ -20,8 +21,6 @@
import webbrowser
import jinja2

import pandas


DOC_PATH = os.path.dirname(os.path.abspath(__file__))
SOURCE_PATH = os.path.join(DOC_PATH, 'source')
@@ -134,7 +133,7 @@ def _process_single_doc(self, single_doc):
self.single_doc = single_doc
elif single_doc is not None:
try:
obj = pandas
obj = pandas # noqa: F821
for name in single_doc.split('.'):
obj = getattr(obj, name)
except AttributeError:
@@ -332,7 +331,7 @@ def main():
'compile, e.g. "indexing", "DataFrame.join"'))
argparser.add_argument('--python-path',
type=str,
default=os.path.join(DOC_PATH, '..'),
default=os.path.dirname(DOC_PATH),
help='path')
argparser.add_argument('-v', action='count', dest='verbosity', default=0,
help=('increase verbosity (can be repeated), '
@@ -343,7 +342,13 @@
raise ValueError('Unknown command {}. Available options: {}'.format(
args.command, ', '.join(cmds)))

# Below we update both os.environ and sys.path. The former is used by
# external libraries (namely Sphinx) to compile this module and resolve
# the import of `python_path` correctly. The latter is used to resolve
# the import within the module, injecting it into the global namespace
os.environ['PYTHONPATH'] = args.python_path
sys.path.append(args.python_path)
globals()['pandas'] = importlib.import_module('pandas')

builder = DocBuilder(args.num_jobs, not args.no_api, args.single,
args.verbosity)
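The change above replaces a module-level `import pandas` with a deferred `importlib.import_module` call, so the docs build imports the in-tree pandas found via `--python-path` rather than whatever happens to be installed. A standalone sketch of the same pattern follows; the helper name `load_from_path` is illustrative, not part of the commit.

```python
import importlib
import os
import sys


def load_from_path(python_path, name='pandas'):
    # PYTHONPATH is inherited by subprocesses (e.g. the Sphinx build);
    # sys.path only affects imports made by the current process.
    os.environ['PYTHONPATH'] = python_path
    sys.path.append(python_path)
    return importlib.import_module(name)


# For example, resolve `pandas` to a source checkout one directory up:
# pandas = load_from_path(os.path.dirname(os.path.abspath(os.curdir)))
```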
2 changes: 1 addition & 1 deletion doc/source/categorical.rst
@@ -177,7 +177,7 @@ are consistent among all columns.
.. note::

To perform table-wise conversion, where all labels in the entire ``DataFrame`` are used as
categories for each column, the ``categories`` parameter can be determined programatically by
categories for each column, the ``categories`` parameter can be determined programmatically by
``categories = pd.unique(df.values.ravel())``.

If you already have ``codes`` and ``categories``, you can use the
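To make the note above concrete, a table-wise conversion could look like the following sketch (the frame, its column names, and `cat_dtype` are made up for illustration; `CategoricalDtype` is assumed to be available as in pandas 0.21+):

```python
import pandas as pd

df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')})

# every label appearing anywhere in the frame becomes a category
categories = pd.unique(df.values.ravel())
cat_dtype = pd.api.types.CategoricalDtype(categories=categories)

# apply the shared dtype column by column
df_cat = df.apply(lambda col: col.astype(cat_dtype))
assert list(df_cat['A'].cat.categories) == list(df_cat['B'].cat.categories)
```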