
Commit 4708db0

harisbal committed Mar 11, 2018
2 parents: f00a2d1 + 0d86742
Showing 155 changed files with 8,465 additions and 4,641 deletions.
24 changes: 24 additions & 0 deletions .github/PULL_REQUEST_TEMPLATE.md
@@ -1,3 +1,27 @@
Checklist for the pandas documentation sprint (ignore this if you are doing
an unrelated PR):

- [ ] PR title is "DOC: update the <your-function-or-method> docstring"
- [ ] The validation script passes: `scripts/validate_docstrings.py <your-function-or-method>`
- [ ] The PEP8 style check passes: `git diff upstream/master -u -- "*.py" | flake8 --diff`
- [ ] The html version looks good: `python doc/make.py --single <your-function-or-method>`
- [ ] It has been proofread for language by another sprint participant

Please include the output of the validation script below between the "```" ticks:

```
# paste output of "scripts/validate_docstrings.py <your-function-or-method>" here
# between the "```" (remove this comment, but keep the "```")
```

If the validation script still gives errors, but you think there is a good reason
to deviate in this case (and there are certainly such cases), please state this
explicitly.


Checklist for other PRs (remove this part if you are doing a PR for the pandas documentation sprint):

- [ ] closes #xxxx
- [ ] tests added / passed
- [ ] passes `git diff upstream/master -u -- "*.py" | flake8 --diff`
5 changes: 3 additions & 2 deletions .gitignore
@@ -88,8 +88,9 @@ scikits
*.c
*.cpp

# Performance Testing #
#######################
# Unit / Performance Testing #
##############################
.pytest_cache/
asv_bench/env/
asv_bench/html/
asv_bench/results/
5 changes: 4 additions & 1 deletion README.md
@@ -216,13 +216,16 @@ Further, general questions and discussions can also take place on the [pydata ma
## Discussion and Development
Most development discussion is taking place on github in this repo. Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Gitter channel](https://gitter.im/pydata/pandas) is available for quick development related questions.

## Contributing to pandas
## Contributing to pandas [![Open Source Helpers](https://www.codetriage.com/pandas-dev/pandas/badges/users.svg)](https://www.codetriage.com/pandas-dev/pandas)

All contributions, bug reports, bug fixes, documentation improvements, enhancements and ideas are welcome.

A detailed overview of how to contribute can be found in the **[contributing guide](https://pandas.pydata.org/pandas-docs/stable/contributing.html)**.

If you are simply looking to start working with the pandas codebase, navigate to the [GitHub “issues” tab](https://github.com/pandas-dev/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pandas-dev/pandas/issues?labels=Docs&sort=updated&state=open) and [Difficulty Novice](https://github.com/pandas-dev/pandas/issues?q=is%3Aopen+is%3Aissue+label%3A%22Difficulty+Novice%22) where you could start out.

You can also triage issues which may include reproducing bug reports, or asking for vital information such as version numbers or reproduction instructions. If you would like to start triaging issues, one easy way to get started is to [subscribe to pandas on CodeTriage](https://www.codetriage.com/pandas-dev/pandas).

Or maybe through using pandas you have an idea of your own or are looking for something in the documentation and thinking ‘this can be improved’...you can do something about it!

Feel free to ask questions on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Gitter](https://gitter.im/pydata/pandas).
147 changes: 69 additions & 78 deletions asv_bench/benchmarks/groupby.py
@@ -11,6 +11,16 @@
from .pandas_vb_common import setup # noqa


method_blacklist = {
    'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean',
               'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min',
               'var', 'mad', 'describe', 'std'},
    'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew',
                 'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe',
                 'std'}
}
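A note on the blacklist: it pairs each non-numeric dtype with the groupby methods that cannot run on it, and the parameterized `setup` further down raises `NotImplementedError` for those combinations so asv reports them as skipped rather than failed. A minimal sketch of why, for example, `median` is blacklisted for `object` dtype (toy data; the exact exception type varies across pandas versions):

```python
import pandas as pd

df = pd.DataFrame({'key': [1, 1, 2, 2], 'vals': list('abcd')})
try:
    # median is undefined for object (string) values, so pandas raises
    df.groupby('key')['vals'].median()
except Exception as err:
    print('would be skipped:', type(err).__name__)
```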


class ApplyDictReturn(object):
goal_time = 0.2

@@ -83,45 +93,6 @@ def time_series_groups(self, data, key):
self.ser.groupby(self.ser).groups


class FirstLast(object):

goal_time = 0.2

param_names = ['dtype']
params = ['float32', 'float64', 'datetime', 'object']

def setup(self, dtype):
N = 10**5
# with datetimes (GH7555)
if dtype == 'datetime':
self.df = DataFrame({'values': date_range('1/1/2011',
periods=N,
freq='s'),
'key': range(N)})
elif dtype == 'object':
self.df = DataFrame({'values': ['foo'] * N,
'key': range(N)})
else:
labels = np.arange(N / 10).repeat(10)
data = Series(np.random.randn(len(labels)), dtype=dtype)
data[::3] = np.nan
data[1::3] = np.nan
labels = labels.take(np.random.permutation(len(labels)))
self.df = DataFrame({'values': data, 'key': labels})

def time_groupby_first(self, dtype):
self.df.groupby('key').first()

def time_groupby_last(self, dtype):
self.df.groupby('key').last()

def time_groupby_nth_all(self, dtype):
self.df.groupby('key').nth(0, dropna='all')

def time_groupby_nth_none(self, dtype):
self.df.groupby('key').nth(0)
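
The `FirstLast` benchmarks removed here (folded into `Nth` below) timed `first` and `last`, which skip missing values within each group, unlike the purely positional `nth(0)`. A quick illustration with made-up data:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'key': [1, 1, 2, 2],
                   'values': [np.nan, 1.0, 2.0, 3.0]})

print(df.groupby('key')['values'].first())  # 1.0 for group 1 -- NaN skipped
print(df.groupby('key')['values'].nth(0))   # NaN for group 1 -- positional
```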


class GroupManyLabels(object):

goal_time = 0.2
@@ -142,38 +113,40 @@ class Nth(object):

goal_time = 0.2

def setup_cache(self):
df = DataFrame(np.random.randint(1, 100, (10000, 2)))
df.iloc[1, 1] = np.nan
return df

def time_frame_nth_any(self, df):
df.groupby(0).nth(0, dropna='any')

def time_frame_nth(self, df):
df.groupby(0).nth(0)
param_names = ['dtype']
params = ['float32', 'float64', 'datetime', 'object']

def time_series_nth_any(self, df):
df[1].groupby(df[0]).nth(0, dropna='any')
def setup(self, dtype):
N = 10**5
# with datetimes (GH7555)
if dtype == 'datetime':
values = date_range('1/1/2011', periods=N, freq='s')
elif dtype == 'object':
values = ['foo'] * N
else:
values = np.arange(N).astype(dtype)

def time_series_nth(self, df):
df[1].groupby(df[0]).nth(0)
key = np.arange(N)
self.df = DataFrame({'key': key, 'values': values})
self.df.iloc[1, 1] = np.nan # insert missing data

def time_frame_nth_any(self, dtype):
self.df.groupby('key').nth(0, dropna='any')

class NthObject(object):
def time_groupby_nth_all(self, dtype):
self.df.groupby('key').nth(0, dropna='all')

goal_time = 0.2
def time_frame_nth(self, dtype):
self.df.groupby('key').nth(0)

def setup_cache(self):
df = DataFrame(np.random.randint(1, 100, (10000,)), columns=['g'])
df['obj'] = ['a'] * 5000 + ['b'] * 5000
return df
def time_series_nth_any(self, dtype):
self.df['values'].groupby(self.df['key']).nth(0, dropna='any')

def time_nth(self, df):
df.groupby('g').nth(5)
def time_series_nth_all(self, dtype):
self.df['values'].groupby(self.df['key']).nth(0, dropna='all')

def time_nth_last(self, df):
df.groupby('g').last()
def time_series_nth(self, dtype):
self.df['values'].groupby(self.df['key']).nth(0)
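
For context on what these timings exercise: `nth(0)` takes the first row of each group by position, and `dropna` first discards rows that are NaN in any ('any') or all ('all') of the value columns. A small illustration (toy data):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'key': [1, 1, 2, 2],
                   'values': [np.nan, 10.0, 20.0, 30.0]})

print(df.groupby('key').nth(0))                # group 1 keeps its NaN first row
print(df.groupby('key').nth(0, dropna='any'))  # group 1 yields 10.0 instead
```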


class DateAttributes(object):
@@ -235,7 +208,7 @@ def time_multi_count(self, df):
df.groupby(['key1', 'key2']).count()


class CountInt(object):
class CountMultiInt(object):

goal_time = 0.2

@@ -247,18 +220,18 @@ def setup_cache(self):
'ints2': np.random.randint(0, 1000, size=n)})
return df

def time_int_count(self, df):
def time_multi_int_count(self, df):
df.groupby(['key1', 'key2']).count()

def time_int_nunique(self, df):
def time_multi_int_nunique(self, df):
df.groupby(['key1', 'key2']).nunique()


class AggFunctions(object):

goal_time = 0.2

def setup_cache(self):
def setup_cache():
N = 10**5
fac1 = np.array(['A', 'B', 'C'], dtype='O')
fac2 = np.array(['one', 'two'], dtype='O')
@@ -353,9 +326,6 @@ def setup(self):
def time_multi_size(self):
self.df.groupby(['key1', 'key2']).size()

def time_dt_size(self):
self.df.groupby(['dates']).size()

def time_dt_timegrouper_size(self):
with warnings.catch_warnings(record=True):
self.df.groupby(TimeGrouper(key='dates', freq='M')).size()
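
The `catch_warnings` context above suppresses the deprecation warning that `TimeGrouper` already emitted at this point; `pd.Grouper` is the non-deprecated spelling and produces the same grouping. A quick sketch with illustrative data:

```python
import pandas as pd

df = pd.DataFrame({'dates': pd.date_range('2011-01-01', periods=90, freq='D'),
                   'x': range(90)})
# pd.Grouper(key=..., freq=...) replaces the deprecated TimeGrouper
print(df.groupby(pd.Grouper(key='dates', freq='M')).size())
```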
@@ -368,30 +338,51 @@ class GroupByMethods(object):

goal_time = 0.2

param_names = ['dtype', 'method']
params = [['int', 'float'],
param_names = ['dtype', 'method', 'application']
params = [['int', 'float', 'object', 'datetime'],
['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin',
'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head',
'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique',
'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew',
'std', 'sum', 'tail', 'unique', 'value_counts', 'var']]
'std', 'sum', 'tail', 'unique', 'value_counts', 'var'],
['direct', 'transformation']]

def setup(self, dtype, method):
def setup(self, dtype, method, application):
if method in method_blacklist.get(dtype, {}):
raise NotImplementedError # skip benchmark
ngroups = 1000
size = ngroups * 2
rng = np.arange(ngroups)
values = rng.take(np.random.randint(0, ngroups, size=size))
if dtype == 'int':
key = np.random.randint(0, size, size=size)
else:
elif dtype == 'float':
key = np.concatenate([np.random.random(ngroups) * 0.1,
np.random.random(ngroups) * 10.0])
elif dtype == 'object':
key = ['foo'] * size
elif dtype == 'datetime':
key = date_range('1/1/2011', periods=size, freq='s')

df = DataFrame({'values': values, 'key': key})
self.df_groupby_method = getattr(df.groupby('key')['values'], method)

def time_method(self, dtype, method):
self.df_groupby_method()
if application == 'transformation':
if method == 'describe':
raise NotImplementedError

self.as_group_method = lambda: df.groupby(
'key')['values'].transform(method)
self.as_field_method = lambda: df.groupby(
'values')['key'].transform(method)
else:
self.as_group_method = getattr(df.groupby('key')['values'], method)
self.as_field_method = getattr(df.groupby('values')['key'], method)

def time_dtype_as_group(self, dtype, method, application):
self.as_group_method()

def time_dtype_as_field(self, dtype, method, application):
self.as_field_method()
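
The new `application` parameter times each method in both ways it can be applied: directly on the grouped column, reducing to one row per group, and through `transform`, which broadcasts the per-group result back to the shape of the original frame. A toy contrast with made-up data:

```python
import pandas as pd

df = pd.DataFrame({'key': ['a', 'a', 'b'], 'values': [1, 2, 3]})

print(df.groupby('key')['values'].sum())             # direct: one row per group
print(df.groupby('key')['values'].transform('sum'))  # transform: aligned to df's index
```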


class Float32(object):
1 change: 1 addition & 0 deletions ci/environment-dev.yaml
@@ -5,6 +5,7 @@ channels:
dependencies:
- Cython
- NumPy
- flake8
- moto
- pytest>=3.1
- python-dateutil>=2.5.0
2 changes: 1 addition & 1 deletion ci/requirements-3.6_DOC.run
@@ -5,7 +5,7 @@ sphinx
nbconvert
nbformat
notebook
matplotlib
matplotlib=2.1*
seaborn
scipy
lxml
3 changes: 2 additions & 1 deletion ci/requirements_dev.txt
@@ -2,9 +2,10 @@
# Do not modify directly
Cython
NumPy
flake8
moto
pytest>=3.1
python-dateutil>=2.5.0
pytz
setuptools>=3.3
sphinx
