Commit a609f48

Merge remote-tracking branch 'upstream/master' into fu1+fillna
TomAugspurger committed Mar 12, 2018
2 parents b342efe + 0815c43 commit a609f48
Showing 96 changed files with 6,498 additions and 4,054 deletions.
24 changes: 24 additions & 0 deletions .github/PULL_REQUEST_TEMPLATE.md
@@ -1,3 +1,27 @@
Checklist for the pandas documentation sprint (ignore this if you are doing
an unrelated PR):

- [ ] PR title is "DOC: update the <your-function-or-method> docstring"
- [ ] The validation script passes: `scripts/validate_docstrings.py <your-function-or-method>`
- [ ] The PEP8 style check passes: `git diff upstream/master -u -- "*.py" | flake8 --diff`
- [ ] The html version looks good: `python doc/make.py --single <your-function-or-method>`
- [ ] It has been proofread for language by another sprint participant

Please include the output of the validation script below between the "```" ticks:

```
# paste output of "scripts/validate_docstrings.py <your-function-or-method>" here
# between the "```" (remove this comment, but keep the "```")
```

If the validation script still gives errors, but you think there is a good reason
to deviate in this case (and there are certainly such cases), please state this
explicitly.


Checklist for other PRs (remove this part if you are doing a PR for the pandas documentation sprint):

- [ ] closes #xxxx
- [ ] tests added / passed
- [ ] passes `git diff upstream/master -u -- "*.py" | flake8 --diff`
5 changes: 3 additions & 2 deletions .gitignore
@@ -88,8 +88,9 @@ scikits
*.c
*.cpp

# Performance Testing #
#######################
# Unit / Performance Testing #
##############################
.pytest_cache/
asv_bench/env/
asv_bench/html/
asv_bench/results/
137 changes: 58 additions & 79 deletions asv_bench/benchmarks/groupby.py
@@ -14,7 +14,10 @@
method_blacklist = {
'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean',
'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min',
'var', 'mad', 'describe', 'std'}
'var', 'mad', 'describe', 'std'},
'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew',
'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe',
'std'}
}


@@ -90,45 +93,6 @@ def time_series_groups(self, data, key):
self.ser.groupby(self.ser).groups


class FirstLast(object):

goal_time = 0.2

param_names = ['dtype']
params = ['float32', 'float64', 'datetime', 'object']

def setup(self, dtype):
N = 10**5
# with datetimes (GH7555)
if dtype == 'datetime':
self.df = DataFrame({'values': date_range('1/1/2011',
periods=N,
freq='s'),
'key': range(N)})
elif dtype == 'object':
self.df = DataFrame({'values': ['foo'] * N,
'key': range(N)})
else:
labels = np.arange(N / 10).repeat(10)
data = Series(np.random.randn(len(labels)), dtype=dtype)
data[::3] = np.nan
data[1::3] = np.nan
labels = labels.take(np.random.permutation(len(labels)))
self.df = DataFrame({'values': data, 'key': labels})

def time_groupby_first(self, dtype):
self.df.groupby('key').first()

def time_groupby_last(self, dtype):
self.df.groupby('key').last()

def time_groupby_nth_all(self, dtype):
self.df.groupby('key').nth(0, dropna='all')

def time_groupby_nth_none(self, dtype):
self.df.groupby('key').nth(0)


class GroupManyLabels(object):

goal_time = 0.2
@@ -149,39 +113,40 @@ class Nth(object):

goal_time = 0.2

def setup_cache(self):
df = DataFrame(np.random.randint(1, 100, (10000, 2)))
df.iloc[1, 1] = np.nan
return df

def time_frame_nth_any(self, df):
df.groupby(0).nth(0, dropna='any')

def time_frame_nth(self, df):
df.groupby(0).nth(0)

param_names = ['dtype']
params = ['float32', 'float64', 'datetime', 'object']

def time_series_nth_any(self, df):
df[1].groupby(df[0]).nth(0, dropna='any')
def setup(self, dtype):
N = 10**5
# with datetimes (GH7555)
if dtype == 'datetime':
values = date_range('1/1/2011', periods=N, freq='s')
elif dtype == 'object':
values = ['foo'] * N
else:
values = np.arange(N).astype(dtype)

def time_series_nth(self, df):
df[1].groupby(df[0]).nth(0)
key = np.arange(N)
self.df = DataFrame({'key': key, 'values': values})
self.df.iloc[1, 1] = np.nan # insert missing data

def time_frame_nth_any(self, dtype):
self.df.groupby('key').nth(0, dropna='any')

class NthObject(object):
def time_groupby_nth_all(self, dtype):
self.df.groupby('key').nth(0, dropna='all')

goal_time = 0.2
def time_frame_nth(self, dtype):
self.df.groupby('key').nth(0)

def setup_cache(self):
df = DataFrame(np.random.randint(1, 100, (10000,)), columns=['g'])
df['obj'] = ['a'] * 5000 + ['b'] * 5000
return df
def time_series_nth_any(self, dtype):
self.df['values'].groupby(self.df['key']).nth(0, dropna='any')

def time_nth(self, df):
df.groupby('g').nth(5)
def time_series_nth_all(self, dtype):
self.df['values'].groupby(self.df['key']).nth(0, dropna='all')

def time_nth_last(self, df):
df.groupby('g').last()
def time_series_nth(self, dtype):
self.df['values'].groupby(self.df['key']).nth(0)


class DateAttributes(object):
@@ -243,7 +208,7 @@ def time_multi_count(self, df):
df.groupby(['key1', 'key2']).count()


class CountInt(object):
class CountMultiInt(object):

goal_time = 0.2

@@ -255,18 +220,18 @@ def setup_cache(self):
'ints2': np.random.randint(0, 1000, size=n)})
return df

def time_int_count(self, df):
def time_multi_int_count(self, df):
df.groupby(['key1', 'key2']).count()

def time_int_nunique(self, df):
def time_multi_int_nunique(self, df):
df.groupby(['key1', 'key2']).nunique()


class AggFunctions(object):

goal_time = 0.2

def setup_cache(self):
def setup_cache():
N = 10**5
fac1 = np.array(['A', 'B', 'C'], dtype='O')
fac2 = np.array(['one', 'two'], dtype='O')
@@ -361,9 +326,6 @@ def setup(self):
def time_multi_size(self):
self.df.groupby(['key1', 'key2']).size()

def time_dt_size(self):
self.df.groupby(['dates']).size()

def time_dt_timegrouper_size(self):
with warnings.catch_warnings(record=True):
self.df.groupby(TimeGrouper(key='dates', freq='M')).size()
@@ -376,15 +338,16 @@ class GroupByMethods(object):

goal_time = 0.2

param_names = ['dtype', 'method']
params = [['int', 'float', 'object'],
param_names = ['dtype', 'method', 'application']
params = [['int', 'float', 'object', 'datetime'],
['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin',
'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head',
'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique',
'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew',
'std', 'sum', 'tail', 'unique', 'value_counts', 'var']]
'std', 'sum', 'tail', 'unique', 'value_counts', 'var'],
['direct', 'transformation']]

def setup(self, dtype, method):
def setup(self, dtype, method, application):
if method in method_blacklist.get(dtype, {}):
raise NotImplementedError # skip benchmark
ngroups = 1000
@@ -398,12 +361,28 @@ def setup(self, dtype, method):
np.random.random(ngroups) * 10.0])
elif dtype == 'object':
key = ['foo'] * size
elif dtype == 'datetime':
key = date_range('1/1/2011', periods=size, freq='s')

df = DataFrame({'values': values, 'key': key})
self.df_groupby_method = getattr(df.groupby('key')['values'], method)

def time_method(self, dtype, method):
self.df_groupby_method()
if application == 'transformation':
if method == 'describe':
raise NotImplementedError

self.as_group_method = lambda: df.groupby(
'key')['values'].transform(method)
self.as_field_method = lambda: df.groupby(
'values')['key'].transform(method)
else:
self.as_group_method = getattr(df.groupby('key')['values'], method)
self.as_field_method = getattr(df.groupby('values')['key'], method)

def time_dtype_as_group(self, dtype, method, application):
self.as_group_method()

def time_dtype_as_field(self, dtype, method, application):
self.as_field_method()


class Float32(object):
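For readers unfamiliar with asv (airspeed velocity), the pattern used throughout this file is: `param_names`/`params` define a grid of benchmark variants, `setup` runs before each timed `time_*` method, and raising `NotImplementedError` from `setup` makes asv skip that parameter combination; this is how `method_blacklist` above excludes unsupported dtype/method pairs. Below is a minimal, self-contained sketch of that pattern. The class, its parameters, and the tiny driver are illustrative only, not part of this commit.

```python
import numpy as np
from pandas import DataFrame

# dtype/method pairs that cannot be benchmarked, mirroring method_blacklist
BLACKLIST = {'object': {'mean'}}


class ExampleGroupByBenchmark(object):

    goal_time = 0.2

    # asv builds one benchmark variant per (dtype, method, application) combo
    param_names = ['dtype', 'method', 'application']
    params = [['int', 'object'],
              ['mean', 'rank'],
              ['direct', 'transformation']]

    def setup(self, dtype, method, application):
        if method in BLACKLIST.get(dtype, set()):
            raise NotImplementedError  # asv reports this combination as skipped
        N = 1000
        values = np.arange(N) if dtype == 'int' else ['foo'] * N
        df = DataFrame({'key': np.arange(N) % 10, 'values': values})
        if application == 'transformation':
            # time the method applied through .transform(...)
            self.bench = lambda: df.groupby('key')['values'].transform(method)
        else:
            # time the bound groupby method directly, e.g. .mean() or .rank()
            self.bench = getattr(df.groupby('key')['values'], method)

    def time_method(self, dtype, method, application):
        self.bench()


if __name__ == '__main__':
    # smoke-test one variant outside of asv
    bench = ExampleGroupByBenchmark()
    bench.setup('int', 'rank', 'transformation')
    bench.time_method('int', 'rank', 'transformation')
```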
1 change: 1 addition & 0 deletions ci/environment-dev.yaml
@@ -5,6 +5,7 @@ channels:
dependencies:
- Cython
- NumPy
- flake8
- moto
- pytest>=3.1
- python-dateutil>=2.5.0
2 changes: 1 addition & 1 deletion ci/requirements-3.6_DOC.run
@@ -5,7 +5,7 @@ sphinx
nbconvert
nbformat
notebook
matplotlib
matplotlib=2.1*
seaborn
scipy
lxml
3 changes: 1 addition & 2 deletions ci/requirements-3.6_NUMPY_DEV.build.sh
@@ -12,8 +12,7 @@ PRE_WHEELS="https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf
pip install --pre --upgrade --timeout=60 -f $PRE_WHEELS numpy scipy

# install dateutil from master
# pip install -U git+git://github.com/dateutil/dateutil.git
pip install dateutil
pip install -U git+git://github.com/dateutil/dateutil.git

# cython via pip
pip install cython
3 changes: 2 additions & 1 deletion ci/requirements_dev.txt
@@ -2,9 +2,10 @@
# Do not modify directly
Cython
NumPy
flake8
moto
pytest>=3.1
python-dateutil>=2.5.0
pytz
setuptools>=3.3
sphinx
sphinx
13 changes: 9 additions & 4 deletions doc/make.py
@@ -11,6 +11,7 @@
$ python make.py html
$ python make.py latex
"""
import importlib
import sys
import os
import shutil
@@ -20,8 +21,6 @@
import webbrowser
import jinja2

import pandas


DOC_PATH = os.path.dirname(os.path.abspath(__file__))
SOURCE_PATH = os.path.join(DOC_PATH, 'source')
@@ -134,7 +133,7 @@ def _process_single_doc(self, single_doc):
self.single_doc = single_doc
elif single_doc is not None:
try:
obj = pandas
obj = pandas # noqa: F821
for name in single_doc.split('.'):
obj = getattr(obj, name)
except AttributeError:
@@ -332,7 +331,7 @@ def main():
'compile, e.g. "indexing", "DataFrame.join"'))
argparser.add_argument('--python-path',
type=str,
default=os.path.join(DOC_PATH, '..'),
default=os.path.dirname(DOC_PATH),
help='path')
argparser.add_argument('-v', action='count', dest='verbosity', default=0,
help=('increase verbosity (can be repeated), '
@@ -343,7 +342,13 @@
raise ValueError('Unknown command {}. Available options: {}'.format(
args.command, ', '.join(cmds)))

# Below we update both os.environ and sys.path. The former is used by
# external libraries (namely Sphinx) to compile this module and resolve
# the import of `python_path` correctly. The latter is used to resolve
# the import within the module, injecting it into the global namespace
os.environ['PYTHONPATH'] = args.python_path
sys.path.append(args.python_path)
globals()['pandas'] = importlib.import_module('pandas')

builder = DocBuilder(args.num_jobs, not args.no_api, args.single,
args.verbosity)
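The change above replaces a module-level `import pandas` with a deferred `importlib.import_module` call, so the docs build imports the in-tree pandas found via `--python-path` rather than whatever happens to be installed. A standalone sketch of the same pattern follows; the helper name `load_from_path` is illustrative, not part of the commit.

```python
import importlib
import os
import sys


def load_from_path(python_path, name='pandas'):
    # PYTHONPATH is inherited by subprocesses (e.g. the Sphinx build);
    # sys.path only affects imports made by the current process.
    os.environ['PYTHONPATH'] = python_path
    sys.path.append(python_path)
    return importlib.import_module(name)


# For example, resolve `pandas` to a source checkout one directory up:
# pandas = load_from_path(os.path.dirname(os.path.abspath(os.curdir)))
```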
2 changes: 1 addition & 1 deletion doc/source/categorical.rst
@@ -177,7 +177,7 @@ are consistent among all columns.
.. note::

To perform table-wise conversion, where all labels in the entire ``DataFrame`` are used as
categories for each column, the ``categories`` parameter can be determined programatically by
categories for each column, the ``categories`` parameter can be determined programmatically by
``categories = pd.unique(df.values.ravel())``.

If you already have ``codes`` and ``categories``, you can use the
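To make the note above concrete, a table-wise conversion could look like the following sketch (the frame, its column names, and `cat_dtype` are made up for illustration; `CategoricalDtype` is assumed to be available as in pandas 0.21+):

```python
import pandas as pd

df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')})

# every label appearing anywhere in the frame becomes a category
categories = pd.unique(df.values.ravel())
cat_dtype = pd.api.types.CategoricalDtype(categories=categories)

# apply the shared dtype column by column
df_cat = df.apply(lambda col: col.astype(cat_dtype))
assert list(df_cat['A'].cat.categories) == list(df_cat['B'].cat.categories)
```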