
Commit 4708db0

harisbal committed Mar 11, 2018
2 parents: f00a2d1 + 0d86742
Showing 155 changed files with 8,465 additions and 4,641 deletions.
24 changes: 24 additions & 0 deletions .github/PULL_REQUEST_TEMPLATE.md
@@ -1,3 +1,27 @@
Checklist for the pandas documentation sprint (ignore this if you are doing
an unrelated PR):

- [ ] PR title is "DOC: update the <your-function-or-method> docstring"
- [ ] The validation script passes: `scripts/validate_docstrings.py <your-function-or-method>`
- [ ] The PEP8 style check passes: `git diff upstream/master -u -- "*.py" | flake8 --diff`
- [ ] The html version looks good: `python doc/make.py --single <your-function-or-method>`
- [ ] It has been proofread for language by another sprint participant

Please include the output of the validation script below between the "```" ticks:

```
# paste output of "scripts/validate_docstrings.py <your-function-or-method>" here
# between the "```" (remove this comment, but keep the "```")
```

If the validation script still gives errors, but you think there is a good reason
to deviate in this case (and there are certainly such cases), please state this
explicitly.


Checklist for other PRs (remove this part if you are doing a PR for the pandas documentation sprint):

- [ ] closes #xxxx
- [ ] tests added / passed
- [ ] passes `git diff upstream/master -u -- "*.py" | flake8 --diff`
5 changes: 3 additions & 2 deletions .gitignore
@@ -88,8 +88,9 @@ scikits
*.c
*.cpp

# Performance Testing #
#######################
# Unit / Performance Testing #
##############################
.pytest_cache/
asv_bench/env/
asv_bench/html/
asv_bench/results/
5 changes: 4 additions & 1 deletion README.md
@@ -216,13 +216,16 @@ Further, general questions and discussions can also take place on the [pydata ma
## Discussion and Development
Most development discussion is taking place on github in this repo. Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Gitter channel](https://gitter.im/pydata/pandas) is available for quick development related questions.

## Contributing to pandas
## Contributing to pandas [![Open Source Helpers](https://www.codetriage.com/pandas-dev/pandas/badges/users.svg)](https://www.codetriage.com/pandas-dev/pandas)

All contributions, bug reports, bug fixes, documentation improvements, enhancements and ideas are welcome.

A detailed overview of how to contribute can be found in the **[contributing guide](https://pandas.pydata.org/pandas-docs/stable/contributing.html)**.

If you are simply looking to start working with the pandas codebase, navigate to the [GitHub “issues” tab](https://github.com/pandas-dev/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pandas-dev/pandas/issues?labels=Docs&sort=updated&state=open) and [Difficulty Novice](https://github.com/pandas-dev/pandas/issues?q=is%3Aopen+is%3Aissue+label%3A%22Difficulty+Novice%22) where you could start out.

You can also triage issues which may include reproducing bug reports, or asking for vital information such as version numbers or reproduction instructions. If you would like to start triaging issues, one easy way to get started is to [subscribe to pandas on CodeTriage](https://www.codetriage.com/pandas-dev/pandas).

Or maybe through using pandas you have an idea of your own or are looking for something in the documentation and thinking ‘this can be improved’...you can do something about it!

Feel free to ask questions on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Gitter](https://gitter.im/pydata/pandas).
147 changes: 69 additions & 78 deletions asv_bench/benchmarks/groupby.py
@@ -11,6 +11,16 @@
from .pandas_vb_common import setup # noqa


method_blacklist = {
    'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean',
               'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min',
               'var', 'mad', 'describe', 'std'},
    'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew',
                 'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe',
                 'std'}
}
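A note on the blacklist: it pairs each non-numeric dtype with the groupby methods that cannot run on it, and the parameterized `setup` further down raises `NotImplementedError` for those combinations so asv reports them as skipped rather than failed. A minimal sketch of why, for example, `median` is blacklisted for `object` dtype (toy data; the exact exception type varies across pandas versions):

```python
import pandas as pd

df = pd.DataFrame({'key': [1, 1, 2, 2], 'vals': list('abcd')})
try:
    # median is undefined for object (string) values, so pandas raises
    df.groupby('key')['vals'].median()
except Exception as err:
    print('would be skipped:', type(err).__name__)
```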


class ApplyDictReturn(object):
goal_time = 0.2

@@ -83,45 +93,6 @@ def time_series_groups(self, data, key):
self.ser.groupby(self.ser).groups


class FirstLast(object):

goal_time = 0.2

param_names = ['dtype']
params = ['float32', 'float64', 'datetime', 'object']

def setup(self, dtype):
N = 10**5
# with datetimes (GH7555)
if dtype == 'datetime':
self.df = DataFrame({'values': date_range('1/1/2011',
periods=N,
freq='s'),
'key': range(N)})
elif dtype == 'object':
self.df = DataFrame({'values': ['foo'] * N,
'key': range(N)})
else:
labels = np.arange(N / 10).repeat(10)
data = Series(np.random.randn(len(labels)), dtype=dtype)
data[::3] = np.nan
data[1::3] = np.nan
labels = labels.take(np.random.permutation(len(labels)))
self.df = DataFrame({'values': data, 'key': labels})

def time_groupby_first(self, dtype):
self.df.groupby('key').first()

def time_groupby_last(self, dtype):
self.df.groupby('key').last()

def time_groupby_nth_all(self, dtype):
self.df.groupby('key').nth(0, dropna='all')

def time_groupby_nth_none(self, dtype):
self.df.groupby('key').nth(0)
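
The `FirstLast` benchmarks removed here (folded into `Nth` below) timed `first` and `last`, which skip missing values within each group, unlike the purely positional `nth(0)`. A quick illustration with made-up data:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'key': [1, 1, 2, 2],
                   'values': [np.nan, 1.0, 2.0, 3.0]})

print(df.groupby('key')['values'].first())  # 1.0 for group 1 -- NaN skipped
print(df.groupby('key')['values'].nth(0))   # NaN for group 1 -- positional
```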


class GroupManyLabels(object):

goal_time = 0.2
@@ -142,38 +113,40 @@ class Nth(object):

goal_time = 0.2

def setup_cache(self):
df = DataFrame(np.random.randint(1, 100, (10000, 2)))
df.iloc[1, 1] = np.nan
return df

def time_frame_nth_any(self, df):
df.groupby(0).nth(0, dropna='any')

def time_frame_nth(self, df):
df.groupby(0).nth(0)
param_names = ['dtype']
params = ['float32', 'float64', 'datetime', 'object']

def time_series_nth_any(self, df):
df[1].groupby(df[0]).nth(0, dropna='any')
def setup(self, dtype):
N = 10**5
# with datetimes (GH7555)
if dtype == 'datetime':
values = date_range('1/1/2011', periods=N, freq='s')
elif dtype == 'object':
values = ['foo'] * N
else:
values = np.arange(N).astype(dtype)

def time_series_nth(self, df):
df[1].groupby(df[0]).nth(0)
key = np.arange(N)
self.df = DataFrame({'key': key, 'values': values})
self.df.iloc[1, 1] = np.nan # insert missing data

def time_frame_nth_any(self, dtype):
self.df.groupby('key').nth(0, dropna='any')

class NthObject(object):
def time_groupby_nth_all(self, dtype):
self.df.groupby('key').nth(0, dropna='all')

goal_time = 0.2
def time_frame_nth(self, dtype):
self.df.groupby('key').nth(0)

def setup_cache(self):
df = DataFrame(np.random.randint(1, 100, (10000,)), columns=['g'])
df['obj'] = ['a'] * 5000 + ['b'] * 5000
return df
def time_series_nth_any(self, dtype):
self.df['values'].groupby(self.df['key']).nth(0, dropna='any')

def time_nth(self, df):
df.groupby('g').nth(5)
def time_series_nth_all(self, dtype):
self.df['values'].groupby(self.df['key']).nth(0, dropna='all')

def time_nth_last(self, df):
df.groupby('g').last()
def time_series_nth(self, dtype):
self.df['values'].groupby(self.df['key']).nth(0)
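
For context on what these timings exercise: `nth(0)` takes the first row of each group by position, and `dropna` first discards rows that are NaN in any ('any') or all ('all') of the value columns. A small illustration (toy data):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'key': [1, 1, 2, 2],
                   'values': [np.nan, 10.0, 20.0, 30.0]})

print(df.groupby('key').nth(0))                # group 1 keeps its NaN first row
print(df.groupby('key').nth(0, dropna='any'))  # group 1 yields 10.0 instead
```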


class DateAttributes(object):
@@ -235,7 +208,7 @@ def time_multi_count(self, df):
df.groupby(['key1', 'key2']).count()


class CountInt(object):
class CountMultiInt(object):

goal_time = 0.2

@@ -247,18 +220,18 @@ def setup_cache(self):
'ints2': np.random.randint(0, 1000, size=n)})
return df

def time_int_count(self, df):
def time_multi_int_count(self, df):
df.groupby(['key1', 'key2']).count()

def time_int_nunique(self, df):
def time_multi_int_nunique(self, df):
df.groupby(['key1', 'key2']).nunique()


class AggFunctions(object):

goal_time = 0.2

def setup_cache(self):
def setup_cache():
N = 10**5
fac1 = np.array(['A', 'B', 'C'], dtype='O')
fac2 = np.array(['one', 'two'], dtype='O')
@@ -353,9 +326,6 @@ def setup(self):
def time_multi_size(self):
self.df.groupby(['key1', 'key2']).size()

def time_dt_size(self):
self.df.groupby(['dates']).size()

def time_dt_timegrouper_size(self):
with warnings.catch_warnings(record=True):
self.df.groupby(TimeGrouper(key='dates', freq='M')).size()
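
The `catch_warnings` context above suppresses the deprecation warning that `TimeGrouper` already emitted at this point; `pd.Grouper` is the non-deprecated spelling and produces the same grouping. A quick sketch with illustrative data:

```python
import pandas as pd

df = pd.DataFrame({'dates': pd.date_range('2011-01-01', periods=90, freq='D'),
                   'x': range(90)})
# pd.Grouper(key=..., freq=...) replaces the deprecated TimeGrouper
print(df.groupby(pd.Grouper(key='dates', freq='M')).size())
```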
@@ -368,30 +338,51 @@ class GroupByMethods(object):

goal_time = 0.2

param_names = ['dtype', 'method']
params = [['int', 'float'],
param_names = ['dtype', 'method', 'application']
params = [['int', 'float', 'object', 'datetime'],
['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin',
'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head',
'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique',
'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew',
'std', 'sum', 'tail', 'unique', 'value_counts', 'var']]
'std', 'sum', 'tail', 'unique', 'value_counts', 'var'],
['direct', 'transformation']]

def setup(self, dtype, method):
def setup(self, dtype, method, application):
if method in method_blacklist.get(dtype, {}):
raise NotImplementedError # skip benchmark
ngroups = 1000
size = ngroups * 2
rng = np.arange(ngroups)
values = rng.take(np.random.randint(0, ngroups, size=size))
if dtype == 'int':
key = np.random.randint(0, size, size=size)
else:
elif dtype == 'float':
key = np.concatenate([np.random.random(ngroups) * 0.1,
np.random.random(ngroups) * 10.0])
elif dtype == 'object':
key = ['foo'] * size
elif dtype == 'datetime':
key = date_range('1/1/2011', periods=size, freq='s')

df = DataFrame({'values': values, 'key': key})
self.df_groupby_method = getattr(df.groupby('key')['values'], method)

def time_method(self, dtype, method):
self.df_groupby_method()
if application == 'transformation':
if method == 'describe':
raise NotImplementedError

self.as_group_method = lambda: df.groupby(
'key')['values'].transform(method)
self.as_field_method = lambda: df.groupby(
'values')['key'].transform(method)
else:
self.as_group_method = getattr(df.groupby('key')['values'], method)
self.as_field_method = getattr(df.groupby('values')['key'], method)

def time_dtype_as_group(self, dtype, method, application):
self.as_group_method()

def time_dtype_as_field(self, dtype, method, application):
self.as_field_method()
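
The new `application` parameter times each method in both ways it can be applied: directly on the grouped column, reducing to one row per group, and through `transform`, which broadcasts the per-group result back to the shape of the original frame. A toy contrast with made-up data:

```python
import pandas as pd

df = pd.DataFrame({'key': ['a', 'a', 'b'], 'values': [1, 2, 3]})

print(df.groupby('key')['values'].sum())             # direct: one row per group
print(df.groupby('key')['values'].transform('sum'))  # transform: aligned to df's index
```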


class Float32(object):
1 change: 1 addition & 0 deletions ci/environment-dev.yaml
@@ -5,6 +5,7 @@ channels:
dependencies:
- Cython
- NumPy
- flake8
- moto
- pytest>=3.1
- python-dateutil>=2.5.0
2 changes: 1 addition & 1 deletion ci/requirements-3.6_DOC.run
@@ -5,7 +5,7 @@ sphinx
nbconvert
nbformat
notebook
matplotlib
matplotlib=2.1*
seaborn
scipy
lxml
3 changes: 2 additions & 1 deletion ci/requirements_dev.txt
@@ -2,9 +2,10 @@
# Do not modify directly
Cython
NumPy
flake8
moto
pytest>=3.1
python-dateutil>=2.5.0
pytz
setuptools>=3.3
sphinx
