'.format(style=div_style))
self.write_style()
- self.write('
'
- .format(border=self.border, cls=' '.join(_classes)), indent)
+
+ if self.table_id is not None:
+ id_section = ' id="{table_id}"'.format(table_id=self.table_id)
+ self.write(''
+ .format(border=self.border, cls=' '.join(_classes),
+ id_section=id_section), indent)
indent += self.indent_delta
indent = self._write_header(indent)
diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py
index e0ce27de5c31f..dddba5b425c3b 100644
--- a/pandas/tests/io/formats/test_format.py
+++ b/pandas/tests/io/formats/test_format.py
@@ -1492,7 +1492,7 @@ def test_repr_html_float(self):
'B': np.arange(41, 41 + h)}).set_index('idx')
reg_repr = df._repr_html_()
assert '..' not in reg_repr
- assert str(40 + h) in reg_repr
+ assert '{val} | '.format(val=str(40 + h)) in reg_repr
h = max_rows + 1
df = DataFrame({'idx': np.linspace(-10, 10, h),
@@ -1500,7 +1500,7 @@ def test_repr_html_float(self):
'B': np.arange(41, 41 + h)}).set_index('idx')
long_repr = df._repr_html_()
assert '..' in long_repr
- assert '31' not in long_repr
+ assert '{val} | '.format(val='31') not in long_repr
assert u('{h} rows ').format(h=h) in long_repr
assert u('2 columns') in long_repr
diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py
index 9e063c2d176e1..f69cac62513d4 100644
--- a/pandas/tests/io/formats/test_to_html.py
+++ b/pandas/tests/io/formats/test_to_html.py
@@ -1864,3 +1864,10 @@ def test_to_html_with_index_names_false(self):
name='myindexname'))
result = df.to_html(index_names=False)
assert 'myindexname' not in result
+
+ def test_to_html_with_id(self):
+ # gh-8496
+ df = pd.DataFrame({"A": [1, 2]}, index=pd.Index(['a', 'b'],
+ name='myindexname'))
+ result = df.to_html(index_names=False, table_id="TEST_ID")
+ assert ' id="TEST_ID"' in result
From a214915e241ea15f3d072d54930d0e0c8f42ee10 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Fri, 9 Feb 2018 10:11:17 -0600
Subject: [PATCH 078/214] CI: Fixed NumPy pinning in conda-build (#19575)
* CI: Fixed NumPy pinning in conda-build
* Unpin NumPy
Quiet install
* Pin numpy
* Unpin everywhere else
* Build vs. 1.11
* remove one more pin
* Remove one more pin
* bump pyarrow
---
ci/install_travis.sh | 6 +++---
ci/requirements-3.5_CONDA_BUILD_TEST.build | 2 +-
ci/requirements-3.5_CONDA_BUILD_TEST.run | 2 +-
ci/requirements-3.5_CONDA_BUILD_TEST.sh | 2 +-
ci/requirements-3.6.build | 2 +-
conda.recipe/meta.yaml | 4 ++--
6 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/ci/install_travis.sh b/ci/install_travis.sh
index 4ec5b0a9d8820..6e270519e60c3 100755
--- a/ci/install_travis.sh
+++ b/ci/install_travis.sh
@@ -110,7 +110,7 @@ if [ -e ${REQ} ]; then
fi
time conda install -n pandas pytest>=3.1.0
-time pip install pytest-xdist moto
+time pip install -q pytest-xdist moto
if [ "$LINT" ]; then
conda install flake8=3.4.1
@@ -181,10 +181,10 @@ elif [ "$CONDA_BUILD_TEST" ]; then
# build & install testing
echo "[building conda recipe]"
- time conda build ./conda.recipe --numpy 1.13 --python 3.5 -q --no-test
+ time conda build ./conda.recipe --python 3.5 -q --no-test || exit 1
echo "[installing]"
- conda install pandas --use-local
+ conda install pandas --use-local || exit 1
else
diff --git a/ci/requirements-3.5_CONDA_BUILD_TEST.build b/ci/requirements-3.5_CONDA_BUILD_TEST.build
index 6648e3778777c..f7befe3b31865 100644
--- a/ci/requirements-3.5_CONDA_BUILD_TEST.build
+++ b/ci/requirements-3.5_CONDA_BUILD_TEST.build
@@ -2,5 +2,5 @@ python=3.5*
python-dateutil
pytz
nomkl
-numpy=1.13*
+numpy
cython
diff --git a/ci/requirements-3.5_CONDA_BUILD_TEST.run b/ci/requirements-3.5_CONDA_BUILD_TEST.run
index 19d9a91e86585..669cf437f2164 100644
--- a/ci/requirements-3.5_CONDA_BUILD_TEST.run
+++ b/ci/requirements-3.5_CONDA_BUILD_TEST.run
@@ -1,5 +1,5 @@
pytz
-numpy=1.13*
+numpy
openpyxl
xlsxwriter
xlrd
diff --git a/ci/requirements-3.5_CONDA_BUILD_TEST.sh b/ci/requirements-3.5_CONDA_BUILD_TEST.sh
index 09d6775cfc894..093fdbcf21d78 100644
--- a/ci/requirements-3.5_CONDA_BUILD_TEST.sh
+++ b/ci/requirements-3.5_CONDA_BUILD_TEST.sh
@@ -8,4 +8,4 @@ echo "install 35 CONDA_BUILD_TEST"
conda remove -n pandas python-dateutil --force
pip install python-dateutil
-conda install -n pandas -c conda-forge feather-format pyarrow=0.5.0
+conda install -n pandas -c conda-forge feather-format pyarrow=0.7.1
diff --git a/ci/requirements-3.6.build b/ci/requirements-3.6.build
index 94e1152450d87..1c4b46aea3865 100644
--- a/ci/requirements-3.6.build
+++ b/ci/requirements-3.6.build
@@ -2,5 +2,5 @@ python=3.6*
python-dateutil
pytz
nomkl
-numpy=1.13.*
+numpy
cython
diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml
index 87a79f7e5a987..86bed996c8aab 100644
--- a/conda.recipe/meta.yaml
+++ b/conda.recipe/meta.yaml
@@ -14,14 +14,14 @@ requirements:
build:
- python
- cython
- - {{ pin_compatible('numpy', upper_bound='1.14') }}
+ - numpy 1.11.*
- setuptools >=3.3
- python-dateutil >=2.5.0
- pytz
run:
- python
- - {{ pin_compatible('numpy', upper_bound='1.14') }}
+ - numpy >=1.11.*
- python-dateutil >=2.5.0
- pytz
From 6485a36483884fb817800a8380a4a4197d6df4ad Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Fri, 9 Feb 2018 14:53:41 -0600
Subject: [PATCH 079/214] API: Default ExtensionArray.astype (#19604)
* API: Default ExtensionArray.astype
(cherry picked from commit 943a915562b72bed147c857de927afa0daf31c1a)
* Py2 compat
* Moved
* Moved dtypes
---
pandas/core/arrays/base.py | 21 +++++++++
pandas/tests/dtypes/test_dtypes.py | 32 +------------
pandas/tests/extension/__init__.py | 0
pandas/tests/extension/test_common.py | 67 +++++++++++++++++++++++++++
4 files changed, 89 insertions(+), 31 deletions(-)
create mode 100644 pandas/tests/extension/__init__.py
create mode 100644 pandas/tests/extension/test_common.py
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 1556b653819a6..553e1e0ac2066 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -1,4 +1,6 @@
"""An interface for extending pandas with custom arrays."""
+import numpy as np
+
from pandas.errors import AbstractMethodError
_not_implemented_message = "{} does not implement {}."
@@ -138,6 +140,25 @@ def nbytes(self):
# ------------------------------------------------------------------------
# Additional Methods
# ------------------------------------------------------------------------
+ def astype(self, dtype, copy=True):
+ """Cast to a NumPy array with 'dtype'.
+
+ Parameters
+ ----------
+ dtype : str or dtype
+ Typecode or data-type to which the array is cast.
+ copy : bool, default True
+ Whether to copy the data, even if not necessary. If False,
+ a copy is made only if the old dtype does not match the
+ new dtype.
+
+ Returns
+ -------
+ array : ndarray
+ NumPy ndarray with 'dtype' for its dtype.
+ """
+ return np.array(self, dtype=dtype, copy=copy)
+
def isna(self):
# type: () -> np.ndarray
"""Boolean NumPy array indicating if each value is missing.
diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py
index eca4dd4cf2106..d800a7b92b559 100644
--- a/pandas/tests/dtypes/test_dtypes.py
+++ b/pandas/tests/dtypes/test_dtypes.py
@@ -10,14 +10,12 @@
Series, Categorical, CategoricalIndex, IntervalIndex, date_range)
from pandas.compat import string_types
-from pandas.core.arrays import ExtensionArray
from pandas.core.dtypes.dtypes import (
DatetimeTZDtype, PeriodDtype,
- IntervalDtype, CategoricalDtype, ExtensionDtype)
+ IntervalDtype, CategoricalDtype)
from pandas.core.dtypes.common import (
is_categorical_dtype, is_categorical,
is_datetime64tz_dtype, is_datetimetz,
- is_extension_array_dtype,
is_period_dtype, is_period,
is_dtype_equal, is_datetime64_ns_dtype,
is_datetime64_dtype, is_interval_dtype,
@@ -744,31 +742,3 @@ def test_categorical_categories(self):
tm.assert_index_equal(c1.categories, pd.Index(['a', 'b']))
c1 = CategoricalDtype(CategoricalIndex(['a', 'b']))
tm.assert_index_equal(c1.categories, pd.Index(['a', 'b']))
-
-
-class DummyArray(ExtensionArray):
- pass
-
-
-class DummyDtype(ExtensionDtype):
- pass
-
-
-class TestExtensionArrayDtype(object):
-
- @pytest.mark.parametrize('values', [
- pd.Categorical([]),
- pd.Categorical([]).dtype,
- pd.Series(pd.Categorical([])),
- DummyDtype(),
- DummyArray(),
- ])
- def test_is_extension_array_dtype(self, values):
- assert is_extension_array_dtype(values)
-
- @pytest.mark.parametrize('values', [
- np.array([]),
- pd.Series(np.array([])),
- ])
- def test_is_not_extension_array_dtype(self, values):
- assert not is_extension_array_dtype(values)
diff --git a/pandas/tests/extension/__init__.py b/pandas/tests/extension/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py
new file mode 100644
index 0000000000000..1f4582f687415
--- /dev/null
+++ b/pandas/tests/extension/test_common.py
@@ -0,0 +1,67 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+from pandas.core.arrays import ExtensionArray
+from pandas.core.dtypes.common import is_extension_array_dtype
+from pandas.core.dtypes.dtypes import ExtensionDtype
+
+
+class DummyDtype(ExtensionDtype):
+ pass
+
+
+class DummyArray(ExtensionArray):
+
+ def __init__(self, data):
+ self.data = data
+
+ def __array__(self, dtype):
+ return self.data
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+
+class TestExtensionArrayDtype(object):
+
+ @pytest.mark.parametrize('values', [
+ pd.Categorical([]),
+ pd.Categorical([]).dtype,
+ pd.Series(pd.Categorical([])),
+ DummyDtype(),
+ DummyArray(np.array([1, 2])),
+ ])
+ def test_is_extension_array_dtype(self, values):
+ assert is_extension_array_dtype(values)
+
+ @pytest.mark.parametrize('values', [
+ np.array([]),
+ pd.Series(np.array([])),
+ ])
+ def test_is_not_extension_array_dtype(self, values):
+ assert not is_extension_array_dtype(values)
+
+
+def test_astype():
+
+ arr = DummyArray(np.array([1, 2, 3]))
+ expected = np.array([1, 2, 3], dtype=object)
+
+ result = arr.astype(object)
+ tm.assert_numpy_array_equal(result, expected)
+
+ result = arr.astype('object')
+ tm.assert_numpy_array_equal(result, expected)
+
+
+def test_astype_no_copy():
+ arr = DummyArray(np.array([1, 2, 3], dtype=np.int64))
+ result = arr.astype(arr.dtype, copy=False)
+
+ assert arr.data is result
+
+ result = arr.astype(arr.dtype)
+ assert arr.data is not result
From c1068d9d242c22cb2199156f6fb82eb5759178ae Mon Sep 17 00:00:00 2001
From: William Ayd
Date: Sat, 10 Feb 2018 08:05:51 -0800
Subject: [PATCH 080/214] PERF: Cythonize Groupby Rank (#19481)
---
doc/source/whatsnew/v0.23.0.txt | 1 +
pandas/_libs/algos.pxd | 8 ++
pandas/_libs/algos.pyx | 8 --
pandas/_libs/groupby.pyx | 5 +-
pandas/_libs/groupby_helper.pxi.in | 165 ++++++++++++++++++++++++++
pandas/core/groupby.py | 76 +++++++++---
pandas/tests/groupby/test_groupby.py | 166 +++++++++++++++++++++++++++
7 files changed, 406 insertions(+), 23 deletions(-)
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index 083242cd69b74..cf5a44442045b 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -581,6 +581,7 @@ Performance Improvements
- Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`)
- Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`)
- Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`)
+- Improved performance of :func:`DataFrameGroupBy.rank` (:issue:`15779`)
.. _whatsnew_0230.docs:
diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd
index 6d80e6f0073eb..a535872ff7279 100644
--- a/pandas/_libs/algos.pxd
+++ b/pandas/_libs/algos.pxd
@@ -11,3 +11,11 @@ cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil:
a[0] = b[0]
b[0] = t
return 0
+
+cdef enum TiebreakEnumType:
+ TIEBREAK_AVERAGE
+ TIEBREAK_MIN,
+ TIEBREAK_MAX
+ TIEBREAK_FIRST
+ TIEBREAK_FIRST_DESCENDING
+ TIEBREAK_DENSE
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 5d17488963b1c..a418e54e4da9b 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -31,14 +31,6 @@ cdef double nan = NaN
cdef int64_t iNaT = get_nat()
-cdef:
- int TIEBREAK_AVERAGE = 0
- int TIEBREAK_MIN = 1
- int TIEBREAK_MAX = 2
- int TIEBREAK_FIRST = 3
- int TIEBREAK_FIRST_DESCENDING = 4
- int TIEBREAK_DENSE = 5
-
tiebreakers = {
'average': TIEBREAK_AVERAGE,
'min': TIEBREAK_MIN,
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 55de700c9af52..d75c3a71896e3 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -16,8 +16,9 @@ from numpy cimport (ndarray,
from libc.stdlib cimport malloc, free
from util cimport numeric, get_nat
-from algos cimport swap
-from algos import take_2d_axis1_float64_float64, groupsort_indexer
+from algos cimport (swap, TiebreakEnumType, TIEBREAK_AVERAGE, TIEBREAK_MIN,
+ TIEBREAK_MAX, TIEBREAK_FIRST, TIEBREAK_DENSE)
+from algos import take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers
cdef int64_t iNaT = get_nat()
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
index a751fadaf48cf..b24444c422efa 100644
--- a/pandas/_libs/groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -444,8 +444,173 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
else:
out[i, j] = resx[i, j]
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
+ ndarray[{{c_type}}, ndim=2] values,
+ ndarray[int64_t] labels,
+ bint is_datetimelike, object ties_method,
+ bint ascending, bint pct, object na_option):
+ """Provides the rank of values within each group
+
+ Parameters
+ ----------
+ out : array of float64_t values which this method will write its results to
+ values : array of {{c_type}} values to be ranked
+ labels : array containing unique label for each group, with its ordering
+ matching up to the corresponding record in `values`
+ is_datetimelike : bool
+ unused in this method but provided for call compatibility with other
+ Cython transformations
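+ ties_method : {'average', 'min', 'max', 'first', 'dense'}
+ how tied values are ranked; looked up in `tiebreakers`
+ na_option : {'keep', 'top', 'bottom'}
+ * keep: leave NA values where they are
+ * top: smallest rank if ascending
+ * bottom: smallest rank if descending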
+ ascending : boolean
+ False for ranks by high (1) to low (N)
+ pct : boolean
+ Compute percentage rank of data within each group
+
+ Notes
+ -----
+ This method modifies the `out` parameter rather than returning an object
+ """
+ cdef:
+ TiebreakEnumType tiebreak
+ Py_ssize_t i, j, N, K, val_start=0, grp_start=0, dups=0, sum_ranks=0
+ Py_ssize_t grp_vals_seen=1, grp_na_count=0
+ ndarray[int64_t] _as
+ ndarray[float64_t, ndim=2] grp_sizes
+ ndarray[{{c_type}}] masked_vals
+ ndarray[uint8_t] mask
+ bint keep_na
+ {{c_type}} nan_fill_val
+
+ tiebreak = tiebreakers[ties_method]
+ keep_na = na_option == 'keep'
+ N, K = (