From 888d1fae80a975147e3f99f9254bf1dbca3affd3 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 23 Aug 2019 01:06:03 -0700 Subject: [PATCH 01/95] DOC: update GroupBy.head()/tail() documentation (#27844) --- pandas/core/groupby/groupby.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 3e8d079e47326b..3eeecd9c149e1b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2370,8 +2370,9 @@ def head(self, n=5): """ Return first n rows of each group. - Essentially equivalent to ``.apply(lambda x: x.head(n))``, - except ignores as_index flag. + Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows + from the original DataFrame with original index and order preserved + (``as_index`` flag is ignored). Returns ------- @@ -2382,10 +2383,6 @@ def head(self, n=5): >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], ... columns=['A', 'B']) - >>> df.groupby('A', as_index=False).head(1) - A B - 0 1 2 - 2 5 6 >>> df.groupby('A').head(1) A B 0 1 2 @@ -2401,8 +2398,9 @@ def tail(self, n=5): """ Return last n rows of each group. - Essentially equivalent to ``.apply(lambda x: x.tail(n))``, - except ignores as_index flag. + Similar to ``.apply(lambda x: x.tail(n))``, but it returns a subset of rows + from the original DataFrame with original index and order preserved + (``as_index`` flag is ignored). Returns ------- @@ -2417,10 +2415,6 @@ def tail(self, n=5): A B 1 a 2 3 b 2 - >>> df.groupby('A').head(1) - A B - 0 a 1 - 2 b 1 """ self._reset_group_selection() mask = self._cumcount_array(ascending=False) < n From d5ba4c14c62c1a23f53773c4e3ecb3bd9a792a91 Mon Sep 17 00:00:00 2001 From: Wuraola Oyewusi Date: Fri, 23 Aug 2019 10:01:28 +0100 Subject: [PATCH 02/95] DOC: Remove alias for numpy.random.randn from the docs (#28082) --- doc/source/conf.py | 1 - doc/source/whatsnew/v0.10.0.rst | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 3ebc5d8b6333b2..a4b7d97c2cf5e2 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -315,7 +315,6 @@ import numpy as np import pandas as pd - randn = np.random.randn np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) pd.options.display.max_rows = 15 diff --git a/doc/source/whatsnew/v0.10.0.rst b/doc/source/whatsnew/v0.10.0.rst index 59ea6b97762327..2e0442364b2f32 100644 --- a/doc/source/whatsnew/v0.10.0.rst +++ b/doc/source/whatsnew/v0.10.0.rst @@ -498,7 +498,7 @@ Here is a taste of what to expect. .. code-block:: ipython - In [58]: p4d = Panel4D(randn(2, 2, 5, 4), + In [58]: p4d = Panel4D(np.random.randn(2, 2, 5, 4), ....: labels=['Label1','Label2'], ....: items=['Item1', 'Item2'], ....: major_axis=date_range('1/1/2000', periods=5), From c7ceff98395b13aded759a6ac8d1fbe49fc9113c Mon Sep 17 00:00:00 2001 From: "Martina G. 
Vilas" Date: Fri, 23 Aug 2019 11:03:00 +0200 Subject: [PATCH 03/95] DOC: Fix docstrings lack of punctuation (#28031) --- pandas/core/arrays/base.py | 4 ++-- pandas/core/arrays/datetimes.py | 4 ++-- pandas/core/arrays/period.py | 2 +- pandas/core/indexes/datetimes.py | 12 ++++++------ pandas/core/indexes/multi.py | 6 +++--- pandas/core/indexes/timedeltas.py | 22 +++++++++++----------- pandas/core/indexing.py | 2 +- 7 files changed, 26 insertions(+), 26 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 5c121172d0e4fc..0778b6726d1041 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -514,7 +514,7 @@ def fillna(self, value=None, method=None, limit=None): def dropna(self): """ - Return ExtensionArray without NA values + Return ExtensionArray without NA values. Returns ------- @@ -957,7 +957,7 @@ def _concat_same_type( cls, to_concat: Sequence[ABCExtensionArray] ) -> ABCExtensionArray: """ - Concatenate multiple array + Concatenate multiple array. Parameters ---------- diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 093334a815938e..70df708d36b3bf 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1158,7 +1158,7 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise", errors=None): def to_pydatetime(self): """ Return Datetime Array/Index as object ndarray of datetime.datetime - objects + objects. Returns ------- @@ -1283,7 +1283,7 @@ def to_perioddelta(self, freq): """ Calculate TimedeltaArray of difference between index values and index converted to PeriodArray at specified - freq. Used for vectorized offsets + freq. Used for vectorized offsets. Parameters ---------- diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 20ce11c70c3443..f2d74794eadf53 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -426,7 +426,7 @@ def __array__(self, dtype=None): @property def is_leap_year(self): """ - Logical indicating if the date belongs to a leap year + Logical indicating if the date belongs to a leap year. """ return isleapyear_arr(np.asarray(self.year)) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 51daad3b426493..272066d476ce34 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -661,7 +661,7 @@ def _get_time_micros(self): def to_series(self, keep_tz=None, index=None, name=None): """ Create a Series with both index and values equal to the index keys - useful with map for returning an indexer based on an index + useful with map for returning an indexer based on an index. Parameters ---------- @@ -687,10 +687,10 @@ def to_series(self, keep_tz=None, index=None, name=None): behaviour and silence the warning. index : Index, optional - index of resulting Series. If None, defaults to original index - name : string, optional - name of resulting Series. If None, defaults to name of original - index + Index of resulting Series. If None, defaults to original index. + name : str, optional + Name of resulting Series. If None, defaults to name of original + index. Returns ------- @@ -735,7 +735,7 @@ def to_series(self, keep_tz=None, index=None, name=None): def snap(self, freq="S"): """ - Snap time stamps to nearest occurring frequency + Snap time stamps to nearest occurring frequency. 
Returns
        -------
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index b614952ba1e043..761862b9f30e98 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -1250,7 +1250,7 @@ def _set_names(self, names, level=None, validate=True):
             self.levels[l].rename(name, inplace=True)
 
     names = property(
-        fset=_set_names, fget=_get_names, doc="""\nNames of levels in MultiIndex\n"""
+        fset=_set_names, fget=_get_names, doc="""\nNames of levels in MultiIndex.\n"""
     )
 
     @Appender(_index_shared_docs["_get_grouper_for_level"])
@@ -1762,7 +1762,7 @@ def is_all_dates(self):
 
     def is_lexsorted(self):
         """
-        Return True if the codes are lexicographically sorted
+        Return True if the codes are lexicographically sorted.
 
         Returns
         -------
@@ -2246,7 +2246,7 @@ def swaplevel(self, i=-2, j=-1):
 
     def reorder_levels(self, order):
         """
-        Rearrange levels using input order. May not drop or duplicate levels
+        Rearrange levels using input order. May not drop or duplicate levels.
 
         Parameters
         ----------
diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py
index d06afa3daa792f..8cf14e2ca777e4 100644
--- a/pandas/core/indexes/timedeltas.py
+++ b/pandas/core/indexes/timedeltas.py
@@ -68,20 +68,20 @@ class TimedeltaIndex(
 ):
     """
     Immutable ndarray of timedelta64 data, represented internally as int64, and
-    which can be boxed to timedelta objects
+    which can be boxed to timedelta objects.
 
     Parameters
     ----------
     data : array-like (1-dimensional), optional
-        Optional timedelta-like data to construct index with
+        Optional timedelta-like data to construct index with.
     unit : unit of the arg (D,h,m,s,ms,us,ns) denote the unit, optional
-        which is an integer/float number
-    freq : string or pandas offset object, optional
+        Which is an integer/float number.
+    freq : str or pandas offset object, optional
         One of pandas date offset strings or corresponding objects. The string
         'infer' can be passed in order to set the frequency of the index as the
-        inferred frequency upon creation
+        inferred frequency upon creation.
     copy : bool
-        Make a copy of input ndarray
+        Make a copy of input ndarray.
     start : starting value, timedelta-like, optional
         If data is None, start is used as the start point in generating regular
         timedelta data.
@@ -90,24 +90,24 @@
     periods : int, optional, > 0
         Number of periods to generate, if generating index. Takes precedence
-        over end argument
+        over end argument.
 
         .. deprecated:: 0.24.0
 
     end : end time, timedelta-like, optional
         If periods is none, generated index will extend to first conforming
-        time on or just past end argument
+        time on or just past end argument.
 
         .. deprecated:: 0.24.0
 
-    closed : string or None, default None
+    closed : str or None, default None
         Make the interval closed with respect to the given frequency to
-        the 'left', 'right', or both sides (None)
+        the 'left', 'right', or both sides (None).
 
         .. deprecated:: 0.24.0
 
     name : object
-        Name to be stored in the index
+        Name to be stored in the index.
 
     Attributes
     ----------
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index 7bb5e2fa3018d1..b8ca3419af4d7e 100755
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -49,7 +49,7 @@ def get_indexers_list():
 # the public IndexSlicerMaker
 class _IndexSlice:
     """
-    Create an object to more easily perform multi-index slicing
+    Create an object to more easily perform multi-index slicing.
See Also -------- From 9dc4d718e093ccbb15e024da6d3bad80f4e99ba6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 23 Aug 2019 08:36:19 -0500 Subject: [PATCH 04/95] DOC: Start 0.25.2 (#28111) * DOC: Start 0.25.2 --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v0.25.2.rst | 110 ++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 doc/source/whatsnew/v0.25.2.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index aeab2cf5809e79..fe80cc8bb959a5 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -24,6 +24,7 @@ Version 0.25 .. toctree:: :maxdepth: 2 + v0.25.2 v0.25.1 v0.25.0 diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst new file mode 100644 index 00000000000000..76473405374e84 --- /dev/null +++ b/doc/source/whatsnew/v0.25.2.rst @@ -0,0 +1,110 @@ +.. _whatsnew_0252: + +What's new in 0.25.2 (October XX, 2019) +--------------------------------------- + +These are the changes in pandas 0.25.2. See :ref:`release` for a full changelog +including other versions of pandas. + +.. _whatsnew_0252.bug_fixes: + +Bug fixes +~~~~~~~~~ + +Categorical +^^^^^^^^^^^ + +- + +Datetimelike +^^^^^^^^^^^^ + +- +- +- + +Timezones +^^^^^^^^^ + +- + +Numeric +^^^^^^^ + +- +- +- +- + +Conversion +^^^^^^^^^^ + +- + +Interval +^^^^^^^^ + +- + +Indexing +^^^^^^^^ + +- +- +- +- + +Missing +^^^^^^^ + +- + +I/O +^^^ + +- +- +- + +Plotting +^^^^^^^^ + +- +- +- + +Groupby/resample/rolling +^^^^^^^^^^^^^^^^^^^^^^^^ + +- +- +- +- +- + +Reshaping +^^^^^^^^^ + +- +- +- +- +- + +Sparse +^^^^^^ + +- + +Other +^^^^^ + +- +- + +.. _whatsnew_0.252.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.25.1..HEAD From 347ad8564ec7dbf679f61e88f6914ab20d7ae3da Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 23 Aug 2019 11:11:49 -0700 Subject: [PATCH 05/95] TST: fix compression tests when run without virtualenv/condaenv (#28051) --- pandas/tests/io/test_compression.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 16ca1109f266cc..d68b6a1effaa0a 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -1,6 +1,7 @@ import contextlib import os import subprocess +import sys import textwrap import warnings @@ -139,7 +140,7 @@ def test_with_missing_lzma(): import pandas """ ) - subprocess.check_output(["python", "-c", code]) + subprocess.check_output([sys.executable, "-c", code]) def test_with_missing_lzma_runtime(): @@ -156,4 +157,4 @@ def test_with_missing_lzma_runtime(): df.to_csv('foo.csv', compression='xz') """ ) - subprocess.check_output(["python", "-c", code]) + subprocess.check_output([sys.executable, "-c", code]) From e2483c022d58d0871cf2d961b9636bbf7d81917c Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 23 Aug 2019 23:36:58 +0100 Subject: [PATCH 06/95] TYPING: more type hints for io.formats.printing (#27765) --- pandas/io/formats/printing.py | 40 ++++++++++++++--------------------- 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 4ec9094ce4abe4..ead51693da7919 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -3,12 +3,14 @@ """ import sys -from typing import Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union from 
pandas._config import get_option from pandas.core.dtypes.inference import is_sequence +EscapeChars = Union[Dict[str, str], Iterable[str]] + def adjoin(space: int, *lists: List[str], **kwargs) -> str: """ @@ -148,19 +150,16 @@ def _pprint_dict( def pprint_thing( - thing, + thing: Any, _nest_lvl: int = 0, - escape_chars: Optional[Union[Dict[str, str], Iterable[str]]] = None, + escape_chars: Optional[EscapeChars] = None, default_escapes: bool = False, quote_strings: bool = False, max_seq_items: Optional[int] = None, ) -> str: """ This function is the sanctioned way of converting objects - to a unicode representation. - - properly handles nested sequences containing unicode strings - (unicode(object) does not) + to a string representation and properly handles nested sequences. Parameters ---------- @@ -178,21 +177,13 @@ def pprint_thing( Returns ------- - result - unicode str + str """ - def as_escaped_unicode(thing, escape_chars=escape_chars): - # Unicode is fine, else we try to decode using utf-8 and 'replace' - # if that's not it either, we have no way of knowing and the user - # should deal with it himself. - - try: - result = str(thing) # we should try this first - except UnicodeDecodeError: - # either utf-8 or we replace errors - result = str(thing).decode("utf-8", "replace") - + def as_escaped_string( + thing: Any, escape_chars: Optional[EscapeChars] = escape_chars + ) -> str: translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r"} if isinstance(escape_chars, dict): if default_escapes: @@ -202,10 +193,11 @@ def as_escaped_unicode(thing, escape_chars=escape_chars): escape_chars = list(escape_chars.keys()) else: escape_chars = escape_chars or tuple() + + result = str(thing) for c in escape_chars: result = result.replace(c, translate[c]) - - return str(result) + return result if hasattr(thing, "__next__"): return str(thing) @@ -224,11 +216,11 @@ def as_escaped_unicode(thing, escape_chars=escape_chars): max_seq_items=max_seq_items, ) elif isinstance(thing, str) and quote_strings: - result = "'{thing}'".format(thing=as_escaped_unicode(thing)) + result = "'{thing}'".format(thing=as_escaped_string(thing)) else: - result = as_escaped_unicode(thing) + result = as_escaped_string(thing) - return str(result) # always unicode + return result def pprint_thing_encoded( From d75ee703efc0d201af2f05bd166b0f58ec5977b5 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sat, 24 Aug 2019 00:38:17 +0200 Subject: [PATCH 07/95] Remove Encoding of values in char** For Labels (#27618) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/_libs/src/ujson/lib/ultrajson.h | 7 - pandas/_libs/src/ujson/lib/ultrajsonenc.c | 6 + pandas/_libs/src/ujson/python/objToJSON.c | 234 +++++++++++++++------- pandas/tests/io/json/test_pandas.py | 134 ++++++++----- 5 files changed, 250 insertions(+), 132 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 4decc99087a9e4..8e25857e5ad693 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -159,6 +159,7 @@ I/O ^^^ - :meth:`read_csv` now accepts binary mode file buffers when using the Python csv engine (:issue:`23779`) +- Bug in :meth:`DataFrame.to_json` where using a Tuple as a column or index value and using ``orient="columns"`` or ``orient="index"`` would produce invalid JSON (:issue:`20500`) - Plotting diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index 0470fef450dde3..ee6e7081bf00e2 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ 
b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -307,11 +307,4 @@ EXPORTFUNCTION JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuffer); EXPORTFUNCTION void encode(JSOBJ, JSONObjectEncoder *, const char *, size_t); -#define Buffer_Reserve(__enc, __len) \ - if ((size_t)((__enc)->end - (__enc)->offset) < (size_t)(__len)) { \ - Buffer_Realloc((__enc), (__len)); \ - } - -void Buffer_Realloc(JSONObjectEncoder *enc, size_t cbNeeded); - #endif // PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_ diff --git a/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c index 2d6c823a45515e..d5b379bee585b4 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/ujson/lib/ultrajsonenc.c @@ -714,6 +714,12 @@ int Buffer_EscapeStringValidated(JSOBJ obj, JSONObjectEncoder *enc, } } +#define Buffer_Reserve(__enc, __len) \ + if ( (size_t) ((__enc)->end - (__enc)->offset) < (size_t) (__len)) \ + { \ + Buffer_Realloc((__enc), (__len));\ + } \ + #define Buffer_AppendCharUnchecked(__enc, __chr) *((__enc)->offset++) = __chr; FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC strreverse(char *begin, diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 926440218b5d93..de336fb3aa1dcb 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -48,13 +48,13 @@ Numeric decoder derived from from TCL library #include <../../../tslibs/src/datetime/np_datetime_strings.h> #include "datetime.h" -#define NPY_JSON_BUFSIZE 32768 - static PyTypeObject *type_decimal; static PyTypeObject *cls_dataframe; static PyTypeObject *cls_series; static PyTypeObject *cls_index; static PyTypeObject *cls_nat; +PyObject *cls_timestamp; +PyObject *cls_timedelta; npy_int64 get_nat(void) { return NPY_MIN_INT64; } @@ -166,6 +166,8 @@ void *initObjToJSON(void) cls_index = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Index"); cls_series = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Series"); + cls_timestamp = PyObject_GetAttrString(mod_pandas, "Timestamp"); + cls_timedelta = PyObject_GetAttrString(mod_pandas, "Timedelta"); Py_DECREF(mod_pandas); } @@ -787,30 +789,23 @@ JSOBJ NpyArr_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static void NpyArr_getLabel(JSOBJ obj, JSONTypeContext *tc, size_t *outLen, - npy_intp idx, char **labels) { - JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder; - PRINTMARK(); - *outLen = strlen(labels[idx]); - Buffer_Reserve(enc, *outLen); - memcpy(enc->offset, labels[idx], sizeof(char) * (*outLen)); - enc->offset += *outLen; - *outLen = 0; -} - char *NpyArr_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { NpyArrContext *npyarr = GET_TC(tc)->npyarr; npy_intp idx; PRINTMARK(); + char *cStr; if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { idx = npyarr->index[npyarr->stridedim] - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); + cStr = npyarr->columnLabels[idx]; } else { idx = npyarr->index[npyarr->stridedim - npyarr->inc] - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); + cStr = npyarr->rowLabels[idx]; } - return NULL; + + *outLen = strlen(cStr); + + return cStr; } //============================================================================= @@ -852,19 +847,22 @@ char *PdBlock_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; npy_intp idx; + char *cStr; 
PRINTMARK(); if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) { idx = blkCtxt->colIdx - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); + cStr = npyarr->columnLabels[idx]; } else { idx = GET_TC(tc)->iterNext != PdBlock_iterNext ? npyarr->index[npyarr->stridedim - npyarr->inc] - 1 : npyarr->index[npyarr->stridedim]; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); + cStr = npyarr->rowLabels[idx]; } - return NULL; + + *outLen = strlen(cStr); + return cStr; } char *PdBlock_iterGetName_Transpose(JSOBJ obj, JSONTypeContext *tc, @@ -872,16 +870,19 @@ char *PdBlock_iterGetName_Transpose(JSOBJ obj, JSONTypeContext *tc, PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; npy_intp idx; + char *cStr; PRINTMARK(); if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { idx = npyarr->index[npyarr->stridedim] - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); + cStr = npyarr->columnLabels[idx]; } else { idx = blkCtxt->colIdx; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); + cStr = npyarr->rowLabels[idx]; } - return NULL; + + *outLen = strlen(cStr); + return cStr; } int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) { @@ -1578,16 +1579,30 @@ void NpyArr_freeLabels(char **labels, npy_intp len) { } } -char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, +/* + * Function: NpyArr_encodeLabels + * ----------------------------- + * + * Builds an array of "encoded" labels. + * + * labels: PyArrayObject pointer for labels to be "encoded" + * num : number of labels + * + * "encode" is quoted above because we aren't really doing encoding + * For historical reasons this function would actually encode the entire + * array into a separate buffer with a separate call to JSON_Encode + * and would leave it to complex pointer manipulation from there to + * unpack values as needed. To make things simpler and more idiomatic + * this has instead just stringified any input save for datetime values, + * which may need to be represented in various formats. + */ +char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, npy_intp num) { // NOTE this function steals a reference to labels. 
- PyObjectEncoder *pyenc = (PyObjectEncoder *)enc; PyObject *item = NULL; - npy_intp i, stride, len, need_quotes; + npy_intp i, stride, len; char **ret; - char *dataptr, *cLabel, *origend, *origst, *origoffset; - char labelBuffer[NPY_JSON_BUFSIZE]; - PyArray_GetItemFunc *getitem; + char *dataptr, *cLabel; int type_num; PRINTMARK(); @@ -1614,68 +1629,136 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, ret[i] = NULL; } - origst = enc->start; - origend = enc->end; - origoffset = enc->offset; - stride = PyArray_STRIDE(labels, 0); dataptr = PyArray_DATA(labels); - getitem = (PyArray_GetItemFunc *)PyArray_DESCR(labels)->f->getitem; type_num = PyArray_TYPE(labels); for (i = 0; i < num; i++) { - if (PyTypeNum_ISDATETIME(type_num) || PyTypeNum_ISNUMBER(type_num)) - { - item = (PyObject *)labels; - pyenc->npyType = type_num; - pyenc->npyValue = dataptr; - } else { - item = getitem(dataptr, labels); - if (!item) { - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - } - - cLabel = JSON_EncodeObject(item, enc, labelBuffer, NPY_JSON_BUFSIZE); - - if (item != (PyObject *)labels) { - Py_DECREF(item); - } - - if (PyErr_Occurred() || enc->errorMsg) { + item = PyArray_GETITEM(labels, dataptr); + if (!item) { + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + // TODO: for any matches on type_num (date and timedeltas) should use a + // vectorized solution to convert to epoch or iso formats + if (enc->datetimeIso && (type_num == NPY_TIMEDELTA || PyDelta_Check(item))) { + PyObject *td = PyObject_CallFunction(cls_timedelta, "(O)", item); + if (td == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + PyObject *iso = PyObject_CallMethod(td, "isoformat", NULL); + Py_DECREF(td); + if (iso == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + cLabel = (char *)PyUnicode_AsUTF8(iso); + Py_DECREF(iso); + len = strlen(cLabel); + } + else if (PyTypeNum_ISDATETIME(type_num) || + PyDateTime_Check(item) || PyDate_Check(item)) { + PyObject *ts = PyObject_CallFunction(cls_timestamp, "(O)", item); + if (ts == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + if (enc->datetimeIso) { + PyObject *iso = PyObject_CallMethod(ts, "isoformat", NULL); + Py_DECREF(ts); + if (iso == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + cLabel = (char *)PyUnicode_AsUTF8(iso); + Py_DECREF(iso); + len = strlen(cLabel); + } else { + npy_int64 value; + // TODO: refactor to not duplicate what goes on in beginTypeContext + if (PyObject_HasAttrString(ts, "value")) { + PRINTMARK(); + value = get_long_attr(ts, "value"); + } else { + PRINTMARK(); + value = + total_seconds(ts) * 1000000000LL; // nanoseconds per second + } + Py_DECREF(ts); + + switch (enc->datetimeUnit) { + case NPY_FR_ns: + break; + case NPY_FR_us: + value /= 1000LL; + break; + case NPY_FR_ms: + value /= 1000000LL; + break; + case NPY_FR_s: + value /= 1000000000LL; + break; + default: + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + char buf[21] = {0}; // 21 chars for 2**63 as string + cLabel = buf; + sprintf(buf, "%" NPY_INT64_FMT, value); + len = strlen(cLabel); + } + } else { // Fallack to string representation + PyObject *str = PyObject_Str(item); + if (str == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + cLabel = (char *)PyUnicode_AsUTF8(str); + Py_DECREF(str); + len = strlen(cLabel); + } + + Py_DECREF(item); + // Add 1 to include NULL 
terminator + ret[i] = PyObject_Malloc(len + 1); + memcpy(ret[i], cLabel, len + 1); + + if (PyErr_Occurred()) { NpyArr_freeLabels(ret, num); ret = 0; break; } - need_quotes = ((*cLabel) != '"'); - len = enc->offset - cLabel + 1 + 2 * need_quotes; - ret[i] = PyObject_Malloc(sizeof(char) * len); - if (!ret[i]) { PyErr_NoMemory(); ret = 0; break; } - if (need_quotes) { - ret[i][0] = '"'; - memcpy(ret[i] + 1, cLabel, sizeof(char) * (len - 4)); - ret[i][len - 3] = '"'; - } else { - memcpy(ret[i], cLabel, sizeof(char) * (len - 2)); - } - ret[i][len - 2] = ':'; - ret[i][len - 1] = '\0'; dataptr += stride; } - enc->start = origst; - enc->end = origend; - enc->offset = origoffset; - Py_DECREF(labels); return ret; } @@ -1972,7 +2055,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, + enc, pc->columnLabelsLen); if (!pc->columnLabels) { goto INVALID; @@ -2075,7 +2158,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } pc->columnLabelsLen = PyObject_Size(tmpObj); pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, + enc, pc->columnLabelsLen); Py_DECREF(tmpObj); if (!pc->columnLabels) { @@ -2098,7 +2181,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->rowLabelsLen = PyObject_Size(tmpObj); pc->rowLabels = NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, pc->rowLabelsLen); + enc, pc->rowLabelsLen); Py_DECREF(tmpObj); tmpObj = (enc->outputFormat == INDEX ? PyObject_GetAttrString(obj, "columns") @@ -2117,7 +2200,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } pc->columnLabelsLen = PyObject_Size(tmpObj); pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, + enc, pc->columnLabelsLen); Py_DECREF(tmpObj); if (!pc->columnLabels) { @@ -2429,7 +2512,6 @@ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) { PRINTMARK(); ret = JSON_EncodeObject(oinput, encoder, buffer, sizeof(buffer)); PRINTMARK(); - if (PyErr_Occurred()) { PRINTMARK(); return NULL; diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 9c687f036aa684..9842a706f43d78 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1012,60 +1012,70 @@ def test_convert_dates_infer(self): result = read_json(dumps(data))[["id", infer_word]] assert_frame_equal(result, expected) - def test_date_format_frame(self): + @pytest.mark.parametrize( + "date,date_unit", + [ + ("20130101 20:43:42.123", None), + ("20130101 20:43:42", "s"), + ("20130101 20:43:42.123", "ms"), + ("20130101 20:43:42.123456", "us"), + ("20130101 20:43:42.123456789", "ns"), + ], + ) + def test_date_format_frame(self, date, date_unit): df = self.tsframe.copy() - def test_w_date(date, date_unit=None): - df["date"] = Timestamp(date) - df.iloc[1, df.columns.get_loc("date")] = pd.NaT - df.iloc[5, df.columns.get_loc("date")] = pd.NaT - if date_unit: - json = df.to_json(date_format="iso", date_unit=date_unit) - else: - json = df.to_json(date_format="iso") - result = read_json(json) - expected = df.copy() - expected.index = expected.index.tz_localize("UTC") - expected["date"] = expected["date"].dt.tz_localize("UTC") - assert_frame_equal(result, expected) - - test_w_date("20130101 20:43:42.123") - test_w_date("20130101 20:43:42", date_unit="s") - test_w_date("20130101 20:43:42.123", 
date_unit="ms")
-        test_w_date("20130101 20:43:42.123456", date_unit="us")
-        test_w_date("20130101 20:43:42.123456789", date_unit="ns")
+        df["date"] = Timestamp(date)
+        df.iloc[1, df.columns.get_loc("date")] = pd.NaT
+        df.iloc[5, df.columns.get_loc("date")] = pd.NaT
+        if date_unit:
+            json = df.to_json(date_format="iso", date_unit=date_unit)
+        else:
+            json = df.to_json(date_format="iso")
+        result = read_json(json)
+        expected = df.copy()
+        expected.index = expected.index.tz_localize("UTC")
+        expected["date"] = expected["date"].dt.tz_localize("UTC")
+        assert_frame_equal(result, expected)
 
+    def test_date_format_frame_raises(self):
+        df = self.tsframe.copy()
         msg = "Invalid value 'foo' for option 'date_unit'"
         with pytest.raises(ValueError, match=msg):
             df.to_json(date_format="iso", date_unit="foo")
 
-    def test_date_format_series(self):
-        def test_w_date(date, date_unit=None):
-            ts = Series(Timestamp(date), index=self.ts.index)
-            ts.iloc[1] = pd.NaT
-            ts.iloc[5] = pd.NaT
-            if date_unit:
-                json = ts.to_json(date_format="iso", date_unit=date_unit)
-            else:
-                json = ts.to_json(date_format="iso")
-            result = read_json(json, typ="series")
-            expected = ts.copy()
-            expected.index = expected.index.tz_localize("UTC")
-            expected = expected.dt.tz_localize("UTC")
-            assert_series_equal(result, expected)
-
-        test_w_date("20130101 20:43:42.123")
-        test_w_date("20130101 20:43:42", date_unit="s")
-        test_w_date("20130101 20:43:42.123", date_unit="ms")
-        test_w_date("20130101 20:43:42.123456", date_unit="us")
-        test_w_date("20130101 20:43:42.123456789", date_unit="ns")
+    @pytest.mark.parametrize(
+        "date,date_unit",
+        [
+            ("20130101 20:43:42.123", None),
+            ("20130101 20:43:42", "s"),
+            ("20130101 20:43:42.123", "ms"),
+            ("20130101 20:43:42.123456", "us"),
+            ("20130101 20:43:42.123456789", "ns"),
+        ],
+    )
+    def test_date_format_series(self, date, date_unit):
+        ts = Series(Timestamp(date), index=self.ts.index)
+        ts.iloc[1] = pd.NaT
+        ts.iloc[5] = pd.NaT
+        if date_unit:
+            json = ts.to_json(date_format="iso", date_unit=date_unit)
+        else:
+            json = ts.to_json(date_format="iso")
+        result = read_json(json, typ="series")
+        expected = ts.copy()
+        expected.index = expected.index.tz_localize("UTC")
+        expected = expected.dt.tz_localize("UTC")
+        assert_series_equal(result, expected)
 
+    def test_date_format_series_raises(self):
         ts = Series(Timestamp("20130101 20:43:42.123"), index=self.ts.index)
         msg = "Invalid value 'foo' for option 'date_unit'"
         with pytest.raises(ValueError, match=msg):
             ts.to_json(date_format="iso", date_unit="foo")
 
-    def test_date_unit(self):
+    @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"])
+    def test_date_unit(self, unit):
         df = self.tsframe.copy()
         df["date"] = Timestamp("20130101 20:43:42")
         dl = df.columns.get_loc("date")
@@ -1073,16 +1083,15 @@ def test_date_unit(self):
         df.iloc[2, dl] = Timestamp("21460101 20:43:42")
         df.iloc[4, dl] = pd.NaT
 
-        for unit in ("s", "ms", "us", "ns"):
-            json = df.to_json(date_format="epoch", date_unit=unit)
+        json = df.to_json(date_format="epoch", date_unit=unit)
 
-            # force date unit
-            result = read_json(json, date_unit=unit)
-            assert_frame_equal(result, df)
+        # force date unit
+        result = read_json(json, date_unit=unit)
+        assert_frame_equal(result, df)
 
-            # detect date unit
-            result = read_json(json, date_unit=None)
-            assert_frame_equal(result, df)
+        # detect date unit
+        result = read_json(json, date_unit=None)
+        assert_frame_equal(result, df)
 
     def test_weird_nested_json(self):
         # this used to core dump the parser
@@ -1611,3 +1620,30 @@ def 
test_read_timezone_information(self): ) expected = Series([88], index=DatetimeIndex(["2019-01-01 11:00:00"], tz="UTC")) assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "date_format,key", [("epoch", 86400000), ("iso", "P1DT0H0M0S")] + ) + def test_timedelta_as_label(self, date_format, key): + df = pd.DataFrame([[1]], columns=[pd.Timedelta("1D")]) + expected = '{{"{key}":{{"0":1}}}}'.format(key=key) + result = df.to_json(date_format=date_format) + + assert result == expected + + @pytest.mark.parametrize( + "orient,expected", + [ + ("index", "{\"('a', 'b')\":{\"('c', 'd')\":1}}"), + ("columns", "{\"('c', 'd')\":{\"('a', 'b')\":1}}"), + # TODO: the below have separate encoding procedures + # They produce JSON but not in a consistent manner + pytest.param("split", "", marks=pytest.mark.skip), + pytest.param("table", "", marks=pytest.mark.skip), + ], + ) + def test_tuple_labels(self, orient, expected): + # GH 20500 + df = pd.DataFrame([[1]], index=[("a", "b")], columns=[("c", "d")]) + result = df.to_json(orient=orient) + assert result == expected From 5c0da7dd4034427745038381e8e2b77ac8c59d08 Mon Sep 17 00:00:00 2001 From: steveayers124 <46000954+steveayers124@users.noreply.github.com> Date: Sat, 24 Aug 2019 04:32:54 -0500 Subject: [PATCH 08/95] DOC: Fix GL01 and GL02 errors in the docstrings (#27988) --- pandas/conftest.py | 37 +++++++++++++++++++++++++------------ pandas/io/html.py | 24 ++++++++++++++++-------- 2 files changed, 41 insertions(+), 20 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 2cf7bf6a6df41c..b032e14d8f7e1d 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -123,18 +123,22 @@ def ip(): @pytest.fixture(params=[True, False, None]) def observed(request): - """ pass in the observed keyword to groupby for [True, False] + """ + Pass in the observed keyword to groupby for [True, False] This indicates whether categoricals should return values for values which are not in the grouper [False / None], or only values which appear in the grouper [True]. [None] is supported for future compatibility if we decide to change the default (and would need to warn if this - parameter is not passed)""" + parameter is not passed). + """ return request.param @pytest.fixture(params=[True, False, None]) def ordered_fixture(request): - """Boolean 'ordered' parameter for Categorical.""" + """ + Boolean 'ordered' parameter for Categorical. + """ return request.param @@ -234,7 +238,8 @@ def cython_table_items(request): def _get_cython_table_params(ndframe, func_names_and_expected): - """combine frame, functions from SelectionMixin._cython_table + """ + Combine frame, functions from SelectionMixin._cython_table keys and expected result. Parameters @@ -242,7 +247,7 @@ def _get_cython_table_params(ndframe, func_names_and_expected): ndframe : DataFrame or Series func_names_and_expected : Sequence of two items The first item is a name of a NDFrame method ('sum', 'prod') etc. - The second item is the expected return value + The second item is the expected return value. Returns ------- @@ -341,7 +346,8 @@ def strict_data_files(pytestconfig): @pytest.fixture def datapath(strict_data_files): - """Get the path to a data file. + """ + Get the path to a data file. Parameters ---------- @@ -375,7 +381,9 @@ def deco(*args): @pytest.fixture def iris(datapath): - """The iris dataset as a DataFrame.""" + """ + The iris dataset as a DataFrame. 
+ """ return pd.read_csv(datapath("data", "iris.csv")) @@ -504,7 +512,8 @@ def tz_aware_fixture(request): @pytest.fixture(params=STRING_DTYPES) def string_dtype(request): - """Parametrized fixture for string dtypes. + """ + Parametrized fixture for string dtypes. * str * 'str' @@ -515,7 +524,8 @@ def string_dtype(request): @pytest.fixture(params=BYTES_DTYPES) def bytes_dtype(request): - """Parametrized fixture for bytes dtypes. + """ + Parametrized fixture for bytes dtypes. * bytes * 'bytes' @@ -525,7 +535,8 @@ def bytes_dtype(request): @pytest.fixture(params=OBJECT_DTYPES) def object_dtype(request): - """Parametrized fixture for object dtypes. + """ + Parametrized fixture for object dtypes. * object * 'object' @@ -535,7 +546,8 @@ def object_dtype(request): @pytest.fixture(params=DATETIME64_DTYPES) def datetime64_dtype(request): - """Parametrized fixture for datetime64 dtypes. + """ + Parametrized fixture for datetime64 dtypes. * 'datetime64[ns]' * 'M8[ns]' @@ -545,7 +557,8 @@ def datetime64_dtype(request): @pytest.fixture(params=TIMEDELTA64_DTYPES) def timedelta64_dtype(request): - """Parametrized fixture for timedelta64 dtypes. + """ + Parametrized fixture for timedelta64 dtypes. * 'timedelta64[ns]' * 'm8[ns]' diff --git a/pandas/io/html.py b/pandas/io/html.py index 9d2647f226f009..490c574463b9bd 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -1,4 +1,5 @@ -""":mod:`pandas.io.html` is a module containing functionality for dealing with +""" +:mod:`pandas.io.html` is a module containing functionality for dealing with HTML IO. """ @@ -58,7 +59,8 @@ def _importers(): def _remove_whitespace(s, regex=_RE_WHITESPACE): - """Replace extra whitespace inside of a string with a single space. + """ + Replace extra whitespace inside of a string with a single space. Parameters ---------- @@ -77,7 +79,8 @@ def _remove_whitespace(s, regex=_RE_WHITESPACE): def _get_skiprows(skiprows): - """Get an iterator given an integer, slice or container. + """ + Get an iterator given an integer, slice or container. Parameters ---------- @@ -107,7 +110,8 @@ def _get_skiprows(skiprows): def _read(obj): - """Try to read from a url, file or string. + """ + Try to read from a url, file or string. Parameters ---------- @@ -136,7 +140,8 @@ def _read(obj): class _HtmlFrameParser: - """Base class for parsers that parse HTML into DataFrames. + """ + Base class for parsers that parse HTML into DataFrames. Parameters ---------- @@ -515,7 +520,8 @@ def _handle_hidden_tables(self, tbl_list, attr_name): class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): - """HTML to DataFrame parser that uses BeautifulSoup under the hood. + """ + HTML to DataFrame parser that uses BeautifulSoup under the hood. See Also -------- @@ -622,7 +628,8 @@ def _build_xpath_expr(attrs): class _LxmlFrameParser(_HtmlFrameParser): - """HTML to DataFrame parser that uses lxml under the hood. + """ + HTML to DataFrame parser that uses lxml under the hood. Warning ------- @@ -937,7 +944,8 @@ def read_html( keep_default_na=True, displayed_only=True, ): - r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. + r""" + Read HTML tables into a ``list`` of ``DataFrame`` objects. 
Parameters ---------- From 518d8aea8f1a7053b541fc6491a50fca30e6fb08 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 25 Aug 2019 08:54:40 -0700 Subject: [PATCH 09/95] Change trys to checks (#28121) --- pandas/core/internals/blocks.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index e24e6e088b92aa..f0ee56f403325a 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2830,9 +2830,9 @@ def _replace_single( regex = regex_re or to_rep_re # try to get the pattern attribute (compiled re) or it's a string - try: + if is_re(to_replace): pattern = to_replace.pattern - except AttributeError: + else: pattern = to_replace # if the pattern is not empty and to_replace is either a string or a @@ -2853,18 +2853,18 @@ def _replace_single( if isna(value) or not isinstance(value, str): def re_replacer(s): - try: + if is_re(rx) and isinstance(s, str): return value if rx.search(s) is not None else s - except TypeError: + else: return s else: # value is guaranteed to be a string here, s can be either a string # or null if it's null it gets returned def re_replacer(s): - try: + if is_re(rx) and isinstance(s, str): return rx.sub(value, s) - except TypeError: + else: return s f = np.vectorize(re_replacer, otypes=[self.dtype]) From 2165a6a64d4064af2bf79d7e6889bda2b6adb86f Mon Sep 17 00:00:00 2001 From: Bryant Moscon Date: Sun, 25 Aug 2019 11:56:15 -0400 Subject: [PATCH 10/95] Remove outdated docstring that no longer applies (#28137) --- pandas/compat/pickle_compat.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index bca33513b00698..87240a9f986c33 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -196,10 +196,6 @@ def load_newobj_ex(self): def load(fh, encoding=None, is_verbose=False): """load a pickle, with a provided encoding - if compat is True: - fake the old class hierarchy - if it works, then return the new type objects - Parameters ---------- fh : a filelike object From 09ab18f6dca48d4dde677ce9ed86444f8a937e32 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 25 Aug 2019 16:57:58 +0100 Subject: [PATCH 11/95] TYPING: _pytest.mark.structures.MarkDecorator -> Callable (#28134) --- pandas/util/_test_decorators.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 3de4e5d66d5774..627757aaa37412 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -25,9 +25,8 @@ def test_foo(): """ from distutils.version import LooseVersion import locale -from typing import Optional +from typing import Callable, Optional -from _pytest.mark.structures import MarkDecorator import pytest from pandas.compat import is_platform_32bit, is_platform_windows @@ -103,7 +102,7 @@ def _skip_if_no_scipy(): ) -def skip_if_installed(package: str,) -> MarkDecorator: +def skip_if_installed(package: str,) -> Callable: """ Skip a test if a package is installed. @@ -117,7 +116,7 @@ def skip_if_installed(package: str,) -> MarkDecorator: ) -def skip_if_no(package: str, min_version: Optional[str] = None) -> MarkDecorator: +def skip_if_no(package: str, min_version: Optional[str] = None) -> Callable: """ Generic function to help skip tests when required packages are not present on the testing system. 
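
In practice these helpers are consumed as pytest decorators; the returned
``pytest.mark.skipif`` mark is itself callable when applied to a test
function, which is what the broader ``Callable`` annotation above reflects.
A minimal sketch of that usage (the module alias, package name and version
below are illustrative, not taken from the patch):

    import pandas.util._test_decorators as td

    # The test is collected but skipped when scipy is missing or older
    # than the requested version.
    @td.skip_if_no("scipy", min_version="0.19.0")
    def test_requires_scipy():
        import scipy  # importable here, otherwise the test was skipped

        assert scipy.__version__ is not None
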
From 97f9bbf6d4b8af8691fabb7014b7e5aa006e1cf2 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sun, 25 Aug 2019 09:04:59 -0700 Subject: [PATCH 12/95] Contributing Guide for Type Hints (#27050) --- doc/source/development/contributing.rst | 130 ++++++++++++++++++++++++ 1 file changed, 130 insertions(+) diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index b38f7767ae0733..be6555b2ab9368 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -699,6 +699,136 @@ You'll also need to See :ref:`contributing.warnings` for more. +.. _contributing.type_hints: + +Type Hints +---------- + +*pandas* strongly encourages the use of :pep:`484` style type hints. New development should contain type hints and pull requests to annotate existing code are accepted as well! + +Style Guidelines +~~~~~~~~~~~~~~~~ + +Types imports should follow the ``from typing import ...`` convention. So rather than + +.. code-block:: python + + import typing + + primes = [] # type: typing.List[int] + +You should write + +.. code-block:: python + + from typing import List, Optional, Union + + primes = [] # type: List[int] + +``Optional`` should be used where applicable, so instead of + +.. code-block:: python + + maybe_primes = [] # type: List[Union[int, None]] + +You should write + +.. code-block:: python + + maybe_primes = [] # type: List[Optional[int]] + +In some cases in the code base classes may define class variables that shadow builtins. This causes an issue as described in `Mypy 1775 `_. The defensive solution here is to create an unambiguous alias of the builtin and use that without your annotation. For example, if you come across a definition like + +.. code-block:: python + + class SomeClass1: + str = None + +The appropriate way to annotate this would be as follows + +.. code-block:: python + + str_type = str + + class SomeClass2: + str = None # type: str_type + +In some cases you may be tempted to use ``cast`` from the typing module when you know better than the analyzer. This occurs particularly when using custom inference functions. For example + +.. code-block:: python + + from typing import cast + + from pandas.core.dtypes.common import is_number + + def cannot_infer_bad(obj: Union[str, int, float]): + + if is_number(obj): + ... + else: # Reasonably only str objects would reach this but... + obj = cast(str, obj) # Mypy complains without this! + return obj.upper() + +The limitation here is that while a human can reasonably understand that ``is_number`` would catch the ``int`` and ``float`` types mypy cannot make that same inference just yet (see `mypy #5206 `_. While the above works, the use of ``cast`` is **strongly discouraged**. Where applicable a refactor of the code to appease static analysis is preferable + +.. code-block:: python + + def cannot_infer_good(obj: Union[str, int, float]): + + if isinstance(obj, str): + return obj.upper() + else: + ... + +With custom types and inference this is not always possible so exceptions are made, but every effort should be exhausted to avoid ``cast`` before going down such paths. + +Syntax Requirements +~~~~~~~~~~~~~~~~~~~ + +Because *pandas* still supports Python 3.5, :pep:`526` does not apply and variables **must** be annotated with type comments. Specifically, this is a valid annotation within pandas: + +.. code-block:: python + + primes = [] # type: List[int] + +Whereas this is **NOT** allowed: + +.. code-block:: python + + primes: List[int] = [] # not supported in Python 3.5! 
+ +Note that function signatures can always be annotated per :pep:`3107`: + +.. code-block:: python + + def sum_of_primes(primes: List[int] = []) -> int: + ... + + +Pandas-specific Types +~~~~~~~~~~~~~~~~~~~~~ + +Commonly used types specific to *pandas* will appear in `pandas._typing `_ and you should use these where applicable. This module is private for now but ultimately this should be exposed to third party libraries who want to implement type checking against pandas. + +For example, quite a few functions in *pandas* accept a ``dtype`` argument. This can be expressed as a string like ``"object"``, a ``numpy.dtype`` like ``np.int64`` or even a pandas ``ExtensionDtype`` like ``pd.CategoricalDtype``. Rather than burden the user with having to constantly annotate all of those options, this can simply be imported and reused from the pandas._typing module + +.. code-block:: python + + from pandas._typing import Dtype + + def as_type(dtype: Dtype) -> ...: + ... + +This module will ultimately house types for repeatedly used concepts like "path-like", "array-like", "numeric", etc... and can also hold aliases for commonly appearing parameters like `axis`. Development of this module is active so be sure to refer to the source for the most up to date list of available types. + +Validating Type Hints +~~~~~~~~~~~~~~~~~~~~~ + +*pandas* uses `mypy `_ to statically analyze the code base and type hints. After making any change you can ensure your type hints are correct by running + +.. code-block:: shell + + mypy pandas .. _contributing.ci: From 5d9fd7e3b226b68e695d87121f584202aa6d4abc Mon Sep 17 00:00:00 2001 From: John Ward Date: Sun, 25 Aug 2019 15:11:00 -0500 Subject: [PATCH 13/95] DOC: Fixes to docstrings formatting (#28096) --- pandas/core/generic.py | 2 +- pandas/io/clipboards.py | 9 ++-- pandas/io/excel/_base.py | 4 +- pandas/io/pytables.py | 93 ++++++++++++++++++++++----------------- pandas/tseries/offsets.py | 10 +++-- 5 files changed, 68 insertions(+), 50 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ba1c516b9b444e..90779baea32cbf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1875,7 +1875,7 @@ def __iter__(self): # can we get a better explanation of this? def keys(self): """ - Get the 'info axis' (see Indexing for more) + Get the 'info axis' (see Indexing for more). This is index for Series, columns for DataFrame. diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index d38221d7842739..76c01535a26e79 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -9,8 +9,7 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover r""" - Read text from clipboard and pass to read_csv. See read_csv for the - full argument list + Read text from clipboard and pass to read_csv. Parameters ---------- @@ -18,9 +17,13 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover A string or regex delimiter. The default of '\s+' denotes one or more whitespace characters. + **kwargs + See read_csv for the full argument list. + Returns ------- - parsed : DataFrame + DataFrame + A parsed DataFrame object. """ encoding = kwargs.pop("encoding", "utf-8") diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 154656fbb250b5..997edf49d9e8fc 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -837,10 +837,10 @@ def parse( **kwds ): """ - Parse specified sheet(s) into a DataFrame + Parse specified sheet(s) into a DataFrame. Equivalent to read_excel(ExcelFile, ...) 
See the read_excel - docstring for more info on accepted parameters + docstring for more info on accepted parameters. Returns ------- diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 6af5dd6f1bf372..576c45a2f8097e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -431,8 +431,9 @@ def _is_metadata_of(group, parent_group): class HDFStore: """ - Dict-like IO interface for storing pandas objects in PyTables - either Fixed or Table format. + Dict-like IO interface for storing pandas objects in PyTables. + + Either Fixed or Table format. Parameters ---------- @@ -564,13 +565,12 @@ def __exit__(self, exc_type, exc_value, traceback): def keys(self): """ - Return a (potentially unordered) list of the keys corresponding to the - objects stored in the HDFStore. These are ABSOLUTE path-names (e.g. - have the leading '/' + Return a list of keys corresponding to objects stored in HDFStore. Returns ------- list + List of ABSOLUTE path-names (e.g. have the leading '/'). """ return [n._v_pathname for n in self.groups()] @@ -703,7 +703,7 @@ def flush(self, fsync=False): def get(self, key): """ - Retrieve pandas object stored in file + Retrieve pandas object stored in file. Parameters ---------- @@ -711,7 +711,8 @@ def get(self, key): Returns ------- - obj : same type as object stored in file + object + Same type as object stored in file. """ group = self.get_node(key) if group is None: @@ -731,25 +732,31 @@ def select( **kwargs ): """ - Retrieve pandas object stored in file, optionally based on where - criteria + Retrieve pandas object stored in file, optionally based on where criteria. Parameters ---------- key : object - where : list of Term (or convertible) objects, optional - start : integer (defaults to None), row number to start selection - stop : integer (defaults to None), row number to stop selection - columns : a list of columns that if not None, will limit the return - columns - iterator : boolean, return an iterator, default False - chunksize : nrows to include in iteration, return an iterator - auto_close : boolean, should automatically close the store when - finished, default is False + Object being retrieved from file. + where : list, default None + List of Term (or convertible) objects, optional. + start : int, default None + Row number to start selection. + stop : int, default None + Row number to stop selection. + columns : list, default None + A list of columns that if not None, will limit the return columns. + iterator : bool, default False + Returns an iterator. + chunksize : int, default None + Number or rows to include in iteration, return an iterator. + auto_close : bool, default False + Should automatically close the store when finished. Returns ------- - The selected object + object + Retrieved object from file. """ group = self.get_node(key) if group is None: @@ -929,28 +936,30 @@ def func(_start, _stop, _where): def put(self, key, value, format=None, append=False, **kwargs): """ - Store object in HDFStore + Store object in HDFStore. Parameters ---------- - key : object - value : {Series, DataFrame} - format : 'fixed(f)|table(t)', default is 'fixed' + key : object + value : {Series, DataFrame} + format : 'fixed(f)|table(t)', default is 'fixed' fixed(f) : Fixed format - Fast writing/reading. Not-appendable, nor searchable + Fast writing/reading. Not-appendable, nor searchable. 
table(t) : Table format Write as a PyTables Table structure which may perform worse but allow more flexible operations like searching - / selecting subsets of the data - append : boolean, default False + / selecting subsets of the data. + append : bool, default False This will force Table format, append the input data to the existing. - data_columns : list of columns to create as data columns, or True to + data_columns : list, default None + List of columns to create as data columns, or True to use all columns. See `here `__. - encoding : default None, provide an encoding for strings - dropna : boolean, default False, do not write an ALL nan row to - the store settable by the option 'io.hdf.dropna_table' + encoding : str, default None + Provide an encoding for strings. + dropna : bool, default False, do not write an ALL nan row to + The store settable by the option 'io.hdf.dropna_table'. """ if format is None: format = get_option("io.hdf.default_format") or "fixed" @@ -1165,12 +1174,15 @@ def create_table_index(self, key, **kwargs): s.create_index(**kwargs) def groups(self): - """return a list of all the top-level nodes (that are not themselves a - pandas storage object) + """ + Return a list of all the top-level nodes. + + Each node returned is not a pandas storage object. Returns ------- list + List of objects. """ _tables() self._check_if_open() @@ -1188,10 +1200,12 @@ def groups(self): ] def walk(self, where="/"): - """ Walk the pytables group hierarchy for pandas objects + """ + Walk the pytables group hierarchy for pandas objects. This generator will yield the group path, subgroups and pandas object names for each group. + Any non-pandas PyTables objects that are not a group will be ignored. The `where` group itself is listed first (preorder), then each of its @@ -1202,18 +1216,17 @@ def walk(self, where="/"): Parameters ---------- - where : str, optional + where : str, default "/" Group where to start walking. - If not supplied, the root group is used. Yields ------ path : str - Full path to a group (without trailing '/') - groups : list of str - names of the groups contained in `path` - leaves : list of str - names of the pandas objects contained in `path` + Full path to a group (without trailing '/'). + groups : list + Names (strings) of the groups contained in `path`. + leaves : list + Names (strings) of the pandas objects contained in `path`. """ _tables() self._check_if_open() diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index a208d5ad2fea99..edf58ba3850a1c 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -204,8 +204,7 @@ def __add__(date): normalize : bool, default False Whether to round the result of a DateOffset addition down to the previous midnight. - **kwds - Temporal parameter that add to or replace the offset value. + **kwds : Temporal parameter that add to or replace the offset value. Parameters that **add** to the offset (like Timedelta): @@ -233,16 +232,19 @@ def __add__(date): See Also -------- - dateutil.relativedelta.relativedelta + dateutil.relativedelta.relativedelta : The relativedelta type is designed + to be applied to an existing datetime an can replace specific components of + that datetime, or represents an interval of time. 
Examples -------- + >>> from pandas.tseries.offsets import DateOffset >>> ts = pd.Timestamp('2017-01-01 09:10:11') >>> ts + DateOffset(months=3) Timestamp('2017-04-01 09:10:11') >>> ts = pd.Timestamp('2017-01-01 09:10:11') - >>> ts + DateOffset(month=3) + >>> ts + DateOffset(months=2) Timestamp('2017-03-01 09:10:11') """ From 2c9c4223442cd555a1fbc894eb5e89792c09ea63 Mon Sep 17 00:00:00 2001 From: Bhuvana KA Date: Mon, 26 Aug 2019 07:49:37 +0530 Subject: [PATCH 14/95] DOC: Fix RangeIndex and other docstrings for missing period in summary (#28123) --- pandas/core/arrays/categorical.py | 2 +- pandas/core/base.py | 2 +- pandas/core/groupby/grouper.py | 2 +- pandas/core/indexes/base.py | 4 ++-- pandas/core/indexes/datetimes.py | 2 +- pandas/core/indexes/interval.py | 2 +- pandas/core/indexes/period.py | 2 +- pandas/core/indexes/range.py | 12 ++++++------ pandas/core/indexes/timedeltas.py | 2 +- pandas/core/reshape/merge.py | 2 +- pandas/core/util/hashing.py | 2 +- 11 files changed, 17 insertions(+), 17 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a895da6184eeba..5929a8d51fe430 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -471,7 +471,7 @@ def ordered(self) -> Ordered: @property def dtype(self) -> CategoricalDtype: """ - The :class:`~pandas.api.types.CategoricalDtype` for this instance + The :class:`~pandas.api.types.CategoricalDtype` for this instance. """ return self._dtype diff --git a/pandas/core/base.py b/pandas/core/base.py index 7d2a62318232c3..767b5594450385 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1462,7 +1462,7 @@ def is_monotonic_decreasing(self): def memory_usage(self, deep=False): """ - Memory usage of the values + Memory usage of the values. Parameters ---------- diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 143755a47b97b3..3415c0e056a1ce 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -37,7 +37,7 @@ class Grouper: """ A Grouper allows the user to specify a groupby instruction for a target - object + object. This specification will select a column via the key parameter, or if the level and/or axis parameters are given, a level of the index of the target diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 415255cdbad06c..38c5e136d0e600 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2020,7 +2020,7 @@ def notna(self): _index_shared_docs[ "fillna" ] = """ - Fill NA/NaN values with the specified value + Fill NA/NaN values with the specified value. Parameters ---------- @@ -2051,7 +2051,7 @@ def fillna(self, value=None, downcast=None): _index_shared_docs[ "dropna" ] = """ - Return Index without NA/NaN values + Return Index without NA/NaN values. Parameters ---------- diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 272066d476ce34..cce390d98c0378 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1594,7 +1594,7 @@ def bdate_range( ): """ Return a fixed frequency DatetimeIndex, with business day as the default - frequency + frequency. 
Parameters ---------- diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 9361408290bb16..3874c6404565c7 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1310,7 +1310,7 @@ def interval_range( start=None, end=None, periods=None, freq=None, name=None, closed="right" ): """ - Return a fixed frequency IntervalIndex + Return a fixed frequency IntervalIndex. Parameters ---------- diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 5a2ca109597e85..f7bf77928bdc7c 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -994,7 +994,7 @@ def memory_usage(self, deep=False): def period_range(start=None, end=None, periods=None, freq=None, name=None): """ Return a fixed frequency PeriodIndex, with day (calendar) as the default - frequency + frequency. Parameters ---------- diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 43ed6e7b122eae..8783351cc74d1c 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -236,7 +236,7 @@ def _format_with_header(self, header, na_rep="NaN", **kwargs): @cache_readonly def start(self): """ - The value of the `start` parameter (``0`` if this was not supplied) + The value of the `start` parameter (``0`` if this was not supplied). """ # GH 25710 return self._range.start @@ -244,7 +244,7 @@ def start(self): @property def _start(self): """ - The value of the `start` parameter (``0`` if this was not supplied) + The value of the `start` parameter (``0`` if this was not supplied). .. deprecated:: 0.25.0 Use ``start`` instead. @@ -259,14 +259,14 @@ def _start(self): @cache_readonly def stop(self): """ - The value of the `stop` parameter + The value of the `stop` parameter. """ return self._range.stop @property def _stop(self): """ - The value of the `stop` parameter + The value of the `stop` parameter. .. deprecated:: 0.25.0 Use ``stop`` instead. @@ -282,7 +282,7 @@ def _stop(self): @cache_readonly def step(self): """ - The value of the `step` parameter (``1`` if this was not supplied) + The value of the `step` parameter (``1`` if this was not supplied). """ # GH 25710 return self._range.step @@ -290,7 +290,7 @@ def step(self): @property def _step(self): """ - The value of the `step` parameter (``1`` if this was not supplied) + The value of the `step` parameter (``1`` if this was not supplied). .. deprecated:: 0.25.0 Use ``step`` instead. diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 8cf14e2ca777e4..b03d60c7b5b371 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -713,7 +713,7 @@ def timedelta_range( ): """ Return a fixed frequency TimedeltaIndex, with day as the default - frequency + frequency. Parameters ---------- diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 225de3f11cf7d7..d7fbe464cb1e52 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -178,7 +178,7 @@ def merge_ordered( """ Perform merge with optional filling/interpolation designed for ordered data like time series data. Optionally perform group-wise merge (see - examples) + examples). 
Parameters ---------- diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 73e126cf230a5e..bcdbf0855cbb49 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -58,7 +58,7 @@ def hash_pandas_object( obj, index=True, encoding="utf8", hash_key=None, categorize=True ): """ - Return a data hash of the Index/Series/DataFrame + Return a data hash of the Index/Series/DataFrame. Parameters ---------- From ea60c1966bf7291829a1479512d7aa89d08bd6dd Mon Sep 17 00:00:00 2001 From: jalbritt Date: Sun, 25 Aug 2019 21:21:36 -0500 Subject: [PATCH 15/95] DOC: Added periods to end of docstrings in explode function (#27973) --- pandas/core/frame.py | 8 ++++---- pandas/core/series.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f2bb964f35dbd4..9da7999724a186 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6183,14 +6183,14 @@ def stack(self, level=-1, dropna=True): def explode(self, column: Union[str, Tuple]) -> "DataFrame": """ - Transform each element of a list-like to a row, replicating the - index values. + Transform each element of a list-like to a row, replicating index values. .. versionadded:: 0.25.0 Parameters ---------- column : str or tuple + Column to explode. Returns ------- @@ -6206,8 +6206,8 @@ def explode(self, column: Union[str, Tuple]) -> "DataFrame": See Also -------- DataFrame.unstack : Pivot a level of the (necessarily hierarchical) - index labels - DataFrame.melt : Unpivot a DataFrame from wide format to long format + index labels. + DataFrame.melt : Unpivot a DataFrame from wide format to long format. Series.explode : Explode a DataFrame from list-like columns to long format. Notes diff --git a/pandas/core/series.py b/pandas/core/series.py index 8b6c963e40e9d7..6fb39c422de932 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3620,7 +3620,7 @@ def explode(self) -> "Series": Series.str.split : Split string values on specified separator. Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame. - DataFrame.melt : Unpivot a DataFrame from wide format to long format + DataFrame.melt : Unpivot a DataFrame from wide format to long format. DataFrame.explode : Explode a DataFrame from list-like columns to long format. From 765eb8d8a02aed564bb9d3be93cf36e355ba0d64 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 26 Aug 2019 09:22:30 -0500 Subject: [PATCH 16/95] COMPAT: 3.8 compat for tests and DataFrame.query (#28101) * COMPAT: implement visit_Constant for 3.8 compat * Updated tests for new error messages. --- doc/source/whatsnew/v0.25.2.rst | 2 +- pandas/compat/__init__.py | 1 + pandas/core/computation/expr.py | 3 +++ pandas/tests/computation/test_eval.py | 27 +++++++++++++++++++++++++-- pandas/tests/io/parser/test_common.py | 5 ++++- pandas/tests/scalar/test_nat.py | 3 +++ 6 files changed, 37 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index 76473405374e84..403c02c3ff129d 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -99,7 +99,7 @@ Sparse Other ^^^^^ -- +- Compatibility with Python 3.8 in :meth:`DataFrame.query` (:issue:`27261`) - .. 
_whatsnew_0.252.contributors: diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index b32da8da3a1fbe..9c778f68727c6b 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -15,6 +15,7 @@ PY35 = sys.version_info[:2] == (3, 5) PY36 = sys.version_info >= (3, 6) PY37 = sys.version_info >= (3, 7) +PY38 = sys.version_info >= (3, 8) PYPY = platform.python_implementation() == "PyPy" diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index a58f256cf61d41..4c164968575a16 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -582,6 +582,9 @@ def visit_NameConstant(self, node, **kwargs): def visit_Num(self, node, **kwargs): return self.const_type(node.n, self.env) + def visit_Constant(self, node, **kwargs): + return self.const_type(node.n, self.env) + def visit_Str(self, node, **kwargs): name = self.env.add_tmp(node.s) return self.term_type(name, self.env) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index c500760fa1390a..b6ffd8a83e409d 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -14,7 +14,7 @@ from pandas.core.dtypes.common import is_bool, is_list_like, is_scalar import pandas as pd -from pandas import DataFrame, Series, date_range +from pandas import DataFrame, Series, compat, date_range from pandas.core.computation import pytables from pandas.core.computation.check import _NUMEXPR_VERSION from pandas.core.computation.engines import NumExprClobberingError, _engines @@ -1267,7 +1267,10 @@ def test_assignment_column(self): msg = "left hand side of an assignment must be a single name" with pytest.raises(SyntaxError, match=msg): df.eval("d,c = a + b") - msg = "can't assign to function call" + if compat.PY38: + msg = "cannot assign to function call" + else: + msg = "can't assign to function call" with pytest.raises(SyntaxError, match=msg): df.eval('Timestamp("20131001") = a + b') @@ -1967,6 +1970,26 @@ def test_bool_ops_fails_on_scalars(lhs, cmp, rhs, engine, parser): pd.eval(ex, engine=engine, parser=parser) +@pytest.mark.parametrize( + "other", + [ + "'x'", + pytest.param( + "...", marks=pytest.mark.xfail(not compat.PY38, reason="GH-28116") + ), + ], +) +def test_equals_various(other): + df = DataFrame({"A": ["a", "b", "c"]}) + result = df.eval("A == {}".format(other)) + expected = Series([False, False, False], name="A") + if _USE_NUMEXPR: + # https://github.com/pandas-dev/pandas/issues/10239 + # lose name with numexpr engine. Remove when that's fixed. 
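+        # (only the Series ``name`` attribute is affected; the boolean
+        # values themselves compare equal either way)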
+ expected.name = None + tm.assert_series_equal(result, expected) + + def test_inf(engine, parser): s = "inf + 1" expected = np.inf diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index e5366a8357adbc..e04535df56663c 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1898,7 +1898,10 @@ def test_null_byte_char(all_parsers): out = parser.read_csv(StringIO(data), names=names) tm.assert_frame_equal(out, expected) else: - msg = "NULL byte detected" + if compat.PY38: + msg = "line contains NUL" + else: + msg = "NULL byte detected" with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), names=names) diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 5b1c4f92bf3419..5eb69fb2952dcb 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -252,6 +252,7 @@ def _get_overlap_public_nat_methods(klass, as_tuple=False): "day_name", "dst", "floor", + "fromisocalendar", "fromisoformat", "fromordinal", "fromtimestamp", @@ -296,6 +297,8 @@ def test_overlap_public_nat_methods(klass, expected): # "fromisoformat" was introduced in 3.7 if klass is Timestamp and not compat.PY37: expected.remove("fromisoformat") + if klass is Timestamp and not compat.PY38: + expected.remove("fromisocalendar") assert _get_overlap_public_nat_methods(klass) == expected From cebc34327c74fed38ad8ee4cffb7b63999c83b9a Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 26 Aug 2019 15:26:25 +0100 Subject: [PATCH 17/95] TYPING: --check-untyped-defs for Index.__new__ (#28141) --- pandas/core/indexes/base.py | 56 ++++++++++--------------------------- 1 file changed, 14 insertions(+), 42 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 38c5e136d0e600..2dbd592fc67873 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -10,6 +10,7 @@ import pandas._libs.join as libjoin from pandas._libs.lib import is_datetime_array from pandas._libs.tslibs import OutOfBoundsDatetime, Timestamp +from pandas._libs.tslibs.period import IncompatibleFrequency from pandas._libs.tslibs.timezones import tz_compare from pandas.compat import set_function_name from pandas.compat.numpy import function as nv @@ -262,7 +263,13 @@ def __new__( fastpath=None, tupleize_cols=True, **kwargs - ): + ) -> "Index": + + from .range import RangeIndex + from pandas import PeriodIndex, DatetimeIndex, TimedeltaIndex + from .numeric import Float64Index, Int64Index, UInt64Index + from .interval import IntervalIndex + from .category import CategoricalIndex if name is None and hasattr(data, "name"): name = data.name @@ -277,8 +284,6 @@ def __new__( if fastpath: return cls._simple_new(data, name) - from .range import RangeIndex - if isinstance(data, ABCPandasArray): # ensure users don't accidentally put a PandasArray in an index. 
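            # (to_numpy() unwraps the PandasArray to its backing ndarray, so
            # the dtype-based dispatch below operates on plain NumPy data)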
data = data.to_numpy() @@ -291,16 +296,12 @@ def __new__( # categorical elif is_categorical_dtype(data) or is_categorical_dtype(dtype): - from .category import CategoricalIndex - return CategoricalIndex(data, dtype=dtype, copy=copy, name=name, **kwargs) # interval elif ( is_interval_dtype(data) or is_interval_dtype(dtype) ) and not is_object_dtype(dtype): - from .interval import IntervalIndex - closed = kwargs.get("closed", None) return IntervalIndex(data, dtype=dtype, name=name, copy=copy, closed=closed) @@ -309,8 +310,6 @@ def __new__( or is_datetime64_any_dtype(dtype) or "tz" in kwargs ): - from pandas import DatetimeIndex - if is_dtype_equal(_o_dtype, dtype): # GH#23524 passing `dtype=object` to DatetimeIndex is invalid, # will raise in the where `data` is already tz-aware. So @@ -318,33 +317,24 @@ def __new__( # the DatetimeIndex construction. # Note we can pass copy=False because the .astype below # will always make a copy - result = DatetimeIndex(data, copy=False, name=name, **kwargs) + result = DatetimeIndex( + data, copy=False, name=name, **kwargs + ) # type: "Index" return result.astype(object) else: - result = DatetimeIndex( - data, copy=copy, name=name, dtype=dtype, **kwargs - ) - return result + return DatetimeIndex(data, copy=copy, name=name, dtype=dtype, **kwargs) elif is_timedelta64_dtype(data) or is_timedelta64_dtype(dtype): - from pandas import TimedeltaIndex - if is_dtype_equal(_o_dtype, dtype): # Note we can pass copy=False because the .astype below # will always make a copy result = TimedeltaIndex(data, copy=False, name=name, **kwargs) return result.astype(object) else: - result = TimedeltaIndex( - data, copy=copy, name=name, dtype=dtype, **kwargs - ) - return result + return TimedeltaIndex(data, copy=copy, name=name, dtype=dtype, **kwargs) elif is_period_dtype(data) and not is_object_dtype(dtype): - from pandas import PeriodIndex - - result = PeriodIndex(data, copy=copy, name=name, **kwargs) - return result + return PeriodIndex(data, copy=copy, name=name, **kwargs) # extension dtype elif is_extension_array_dtype(data) or is_extension_array_dtype(dtype): @@ -387,8 +377,6 @@ def __new__( pass # Return an actual float index. 
- from .numeric import Float64Index - return Float64Index(data, copy=copy, dtype=dtype, name=name) elif inferred == "string": @@ -405,19 +393,11 @@ def __new__( data = np.array(data, dtype=dtype, copy=copy) # maybe coerce to a sub-class - from pandas.core.indexes.period import PeriodIndex, IncompatibleFrequency - if is_signed_integer_dtype(data.dtype): - from .numeric import Int64Index - return Int64Index(data, copy=copy, dtype=dtype, name=name) elif is_unsigned_integer_dtype(data.dtype): - from .numeric import UInt64Index - return UInt64Index(data, copy=copy, dtype=dtype, name=name) elif is_float_dtype(data.dtype): - from .numeric import Float64Index - return Float64Index(data, copy=copy, dtype=dtype, name=name) elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data): subarr = data.astype("object") @@ -440,12 +420,8 @@ def __new__( return Index(subarr, copy=copy, dtype=object, name=name) elif inferred in ["floating", "mixed-integer-float", "integer-na"]: # TODO: Returns IntegerArray for integer-na case in the future - from .numeric import Float64Index - return Float64Index(subarr, copy=copy, name=name) elif inferred == "interval": - from .interval import IntervalIndex - try: return IntervalIndex(subarr, name=name, copy=copy) except ValueError: @@ -456,8 +432,6 @@ def __new__( pass elif inferred != "string": if inferred.startswith("datetime"): - from pandas import DatetimeIndex - try: return DatetimeIndex(subarr, copy=copy, name=name, **kwargs) except (ValueError, OutOfBoundsDatetime): @@ -467,8 +441,6 @@ def __new__( pass elif inferred.startswith("timedelta"): - from pandas import TimedeltaIndex - return TimedeltaIndex(subarr, copy=copy, name=name, **kwargs) elif inferred == "period": try: From 0d0daa8466d257c3329c54633a9a98867c86d009 Mon Sep 17 00:00:00 2001 From: Drew Heenan Date: Mon, 26 Aug 2019 07:27:40 -0700 Subject: [PATCH 18/95] ENH: Allow compression in NDFrame.to_csv to be a dict with optional arguments (#26023) (#26024) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/generic.py | 91 ++++++++++++------- pandas/io/common.py | 115 +++++++++++++++++++------ pandas/io/formats/csvs.py | 10 ++- pandas/tests/io/formats/test_to_csv.py | 41 +++++++++ 5 files changed, 200 insertions(+), 58 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 8e25857e5ad693..2bfc09e52c68b5 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -206,6 +206,7 @@ ExtensionArray Other ^^^^^ - Trying to set the ``display.precision``, ``display.max_rows`` or ``display.max_columns`` using :meth:`set_option` to anything but a ``None`` or a positive int will raise a ``ValueError`` (:issue:`23348`) +- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support dicts as ``compression`` argument with key ``'method'`` being the compression method and others as additional compression options when the compression method is ``'zip'``. (:issue:`26023`) .. 
_whatsnew_1000.contributors:

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 90779baea32cbf..fac5e0f085fc62 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -7,7 +7,17 @@
 import pickle
 import re
 from textwrap import dedent
-from typing import Callable, Dict, FrozenSet, List, Optional, Set
+from typing import (
+    Callable,
+    Dict,
+    FrozenSet,
+    Hashable,
+    List,
+    Optional,
+    Sequence,
+    Set,
+    Union,
+)
 import warnings
 import weakref

@@ -50,7 +60,7 @@
 from pandas.core.dtypes.missing import isna, notna

 import pandas as pd
-from pandas._typing import Dtype
+from pandas._typing import Dtype, FilePathOrBuffer
 from pandas.core import missing, nanops
 import pandas.core.algorithms as algos
 from pandas.core.base import PandasObject, SelectionMixin
@@ -122,6 +132,9 @@ def _single_replace(self, to_replace, method, inplace, limit):
     return result


+bool_t = bool  # Need alias because NDFrame has def bool:
+
+
 class NDFrame(PandasObject, SelectionMixin):
     """
     N-dimensional analogue of DataFrame. Store multi-dimensional in a
@@ -3051,26 +3064,26 @@ def to_latex(

     def to_csv(
         self,
-        path_or_buf=None,
-        sep=",",
-        na_rep="",
-        float_format=None,
-        columns=None,
-        header=True,
-        index=True,
-        index_label=None,
-        mode="w",
-        encoding=None,
-        compression="infer",
-        quoting=None,
-        quotechar='"',
-        line_terminator=None,
-        chunksize=None,
-        date_format=None,
-        doublequote=True,
-        escapechar=None,
-        decimal=".",
-    ):
+        path_or_buf: Optional[FilePathOrBuffer] = None,
+        sep: str = ",",
+        na_rep: str = "",
+        float_format: Optional[str] = None,
+        columns: Optional[Sequence[Hashable]] = None,
+        header: Union[bool_t, List[str]] = True,
+        index: bool_t = True,
+        index_label: Optional[Union[bool_t, str, Sequence[Hashable]]] = None,
+        mode: str = "w",
+        encoding: Optional[str] = None,
+        compression: Optional[Union[str, Dict[str, str]]] = "infer",
+        quoting: Optional[int] = None,
+        quotechar: str = '"',
+        line_terminator: Optional[str] = None,
+        chunksize: Optional[int] = None,
+        date_format: Optional[str] = None,
+        doublequote: bool_t = True,
+        escapechar: Optional[str] = None,
+        decimal: Optional[str] = ".",
+    ) -> Optional[str]:
         r"""
         Write object to a comma-separated values (csv) file.

@@ -3117,16 +3130,21 @@ def to_csv(
         encoding : str, optional
             A string representing the encoding to use in the output file,
             defaults to 'utf-8'.
-        compression : str, default 'infer'
-            Compression mode among the following possible values: {'infer',
-            'gzip', 'bz2', 'zip', 'xz', None}. If 'infer' and `path_or_buf`
-            is path-like, then detect compression from the following
-            extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no
-            compression).
-
-            .. versionchanged:: 0.24.0
-
-               'infer' option added and set to default.
+        compression : str or dict, default 'infer'
+            If str, represents compression mode. If dict, value at 'method' is
+            the compression mode. Compression mode may be any of the following
+            possible values: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}. If
+            compression mode is 'infer' and `path_or_buf` is path-like, then
+            detect compression mode from the following extensions: '.gz',
+            '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given
+            and mode is 'zip' or inferred as 'zip', other entries passed as
+            additional compression options.
+
+            .. versionchanged:: 1.0.0
+
+               May now be a dict with key 'method' as compression mode
+               and other entries as additional compression options if
+               compression mode is 'zip'.
quoting : optional constant from csv module Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` @@ -3171,6 +3189,13 @@ def to_csv( ... 'weapon': ['sai', 'bo staff']}) >>> df.to_csv(index=False) 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n' + + # create 'out.zip' containing 'out.csv' + >>> compression_opts = dict(method='zip', + ... archive_name='out.csv') # doctest: +SKIP + + >>> df.to_csv('out.zip', index=False, + ... compression=compression_opts) # doctest: +SKIP """ df = self if isinstance(self, ABCDataFrame) else self.to_frame() @@ -3204,6 +3229,8 @@ def to_csv( if path_or_buf is None: return formatter.path_or_buf.getvalue() + return None + # ---------------------------------------------------------------------- # Fancy Indexing diff --git a/pandas/io/common.py b/pandas/io/common.py index 26b68dda7b464a..290022167e5205 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -9,7 +9,19 @@ import mmap import os import pathlib -from typing import IO, AnyStr, BinaryIO, Optional, TextIO, Type +from typing import ( + IO, + Any, + AnyStr, + BinaryIO, + Dict, + List, + Optional, + TextIO, + Tuple, + Type, + Union, +) from urllib.error import URLError # noqa from urllib.parse import ( # noqa urlencode, @@ -255,6 +267,40 @@ def file_path_to_url(path: str) -> str: _compression_to_extension = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": ".xz"} +def _get_compression_method( + compression: Optional[Union[str, Dict[str, str]]] +) -> Tuple[Optional[str], Dict[str, str]]: + """ + Simplifies a compression argument to a compression method string and + a dict containing additional arguments. + + Parameters + ---------- + compression : str or dict + If string, specifies the compression method. If dict, value at key + 'method' specifies compression method. + + Returns + ------- + tuple of ({compression method}, Optional[str] + {compression arguments}, Dict[str, str]) + + Raises + ------ + ValueError on dict missing 'method' key + """ + # Handle dict + if isinstance(compression, dict): + compression_args = compression.copy() + try: + compression = compression_args.pop("method") + except KeyError: + raise ValueError("If dict, compression must have key 'method'") + else: + compression_args = {} + return compression, compression_args + + def _infer_compression( filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] ) -> Optional[str]: @@ -266,8 +312,8 @@ def _infer_compression( Parameters ---------- - filepath_or_buffer : - a path (str) or buffer + filepath_or_buffer : str or file handle + File path or object. compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None} If 'infer' and `filepath_or_buffer` is path-like, then detect compression from the following extensions: '.gz', '.bz2', '.zip', @@ -275,12 +321,11 @@ def _infer_compression( Returns ------- - string or None : - compression method + string or None Raises ------ - ValueError on invalid compression specified + ValueError on invalid compression specified. """ # No compression has been explicitly specified @@ -312,32 +357,49 @@ def _infer_compression( def _get_handle( - path_or_buf, mode, encoding=None, compression=None, memory_map=False, is_text=True + path_or_buf, + mode: str, + encoding=None, + compression: Optional[Union[str, Dict[str, Any]]] = None, + memory_map: bool = False, + is_text: bool = True, ): """ Get file handle for given path/buffer and mode. Parameters ---------- - path_or_buf : - a path (str) or buffer + path_or_buf : str or file handle + File path or object. 
mode : str - mode to open path_or_buf with + Mode to open path_or_buf with. encoding : str or None - compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None - If 'infer' and `filepath_or_buffer` is path-like, then detect - compression from the following extensions: '.gz', '.bz2', '.zip', - or '.xz' (otherwise no compression). + Encoding to use. + compression : str or dict, default None + If string, specifies compression mode. If dict, value at key 'method' + specifies compression mode. Compression mode must be one of {'infer', + 'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer' + and `filepath_or_buffer` is path-like, then detect compression from + the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise + no compression). If dict and compression mode is 'zip' or inferred as + 'zip', other entries passed as additional compression options. + + .. versionchanged:: 1.0.0 + + May now be a dict with key 'method' as compression mode + and other keys as compression options if compression + mode is 'zip'. + memory_map : boolean, default False See parsers._parser_params for more information. is_text : boolean, default True whether file/buffer is in text format (csv, json, etc.), or in binary - mode (pickle, etc.) + mode (pickle, etc.). Returns ------- f : file-like - A file-like object + A file-like object. handles : list of file-like objects A list of file-like object that were opened in this function. """ @@ -346,15 +408,16 @@ def _get_handle( need_text_wrapping = (BufferedIOBase, S3File) except ImportError: - need_text_wrapping = BufferedIOBase + need_text_wrapping = BufferedIOBase # type: ignore - handles = list() + handles = list() # type: List[IO] f = path_or_buf # Convert pathlib.Path/py.path.local or string path_or_buf = _stringify_path(path_or_buf) is_path = isinstance(path_or_buf, str) + compression, compression_args = _get_compression_method(compression) if is_path: compression = _infer_compression(path_or_buf, compression) @@ -376,7 +439,7 @@ def _get_handle( # ZIP Compression elif compression == "zip": - zf = BytesZipFile(path_or_buf, mode) + zf = BytesZipFile(path_or_buf, mode, **compression_args) # Ensure the container is closed as well. 
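            # (the extra dict entries, e.g. ``archive_name``, were split out
            # by _get_compression_method and flow into BytesZipFile here)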
handles.append(zf) if zf.mode == "w": @@ -429,9 +492,9 @@ def _get_handle( if memory_map and hasattr(f, "fileno"): try: - g = MMapWrapper(f) + wrapped = MMapWrapper(f) f.close() - f = g + f = wrapped except Exception: # we catch any errors that may have occurred # because that is consistent with the lower-level @@ -456,15 +519,19 @@ def __init__( self, file: FilePathOrBuffer, mode: str, - compression: int = zipfile.ZIP_DEFLATED, + archive_name: Optional[str] = None, **kwargs ): if mode in ["wb", "rb"]: mode = mode.replace("b", "") - super().__init__(file, mode, compression, **kwargs) + self.archive_name = archive_name + super().__init__(file, mode, zipfile.ZIP_DEFLATED, **kwargs) def write(self, data): - super().writestr(self.filename, data) + archive_name = self.filename + if self.archive_name is not None: + archive_name = self.archive_name + super().writestr(archive_name, data) @property def closed(self): diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 60daf311397e80..e25862537cbfc5 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -22,6 +22,7 @@ from pandas.io.common import ( UnicodeWriter, + _get_compression_method, _get_handle, _infer_compression, get_filepath_or_buffer, @@ -58,6 +59,9 @@ def __init__( if path_or_buf is None: path_or_buf = StringIO() + # Extract compression mode as given, if dict + compression, self.compression_args = _get_compression_method(compression) + self.path_or_buf, _, _, _ = get_filepath_or_buffer( path_or_buf, encoding=encoding, compression=compression, mode=mode ) @@ -178,7 +182,7 @@ def save(self): self.path_or_buf, self.mode, encoding=self.encoding, - compression=self.compression, + compression=dict(self.compression_args, method=self.compression), ) close = True @@ -206,11 +210,13 @@ def save(self): if hasattr(self.path_or_buf, "write"): self.path_or_buf.write(buf) else: + compression = dict(self.compression_args, method=self.compression) + f, handles = _get_handle( self.path_or_buf, self.mode, encoding=self.encoding, - compression=self.compression, + compression=compression, ) f.write(buf) close = True diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index ee236a8253b01a..ab44b8b8059eb4 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -514,3 +514,44 @@ def test_to_csv_compression(self, compression_only, read_infer, to_infer): df.to_csv(path, compression=to_compression) result = pd.read_csv(path, index_col=0, compression=read_compression) tm.assert_frame_equal(result, df) + + def test_to_csv_compression_dict(self, compression_only): + # GH 26023 + method = compression_only + df = DataFrame({"ABC": [1]}) + filename = "to_csv_compress_as_dict." 
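+        # gzip files conventionally use the ".gz" suffix rather than ".gzip"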
+        filename += "gz" if method == "gzip" else method
+        with tm.ensure_clean(filename) as path:
+            df.to_csv(path, compression={"method": method})
+            read_df = pd.read_csv(path, index_col=0)
+            tm.assert_frame_equal(read_df, df)
+
+    def test_to_csv_compression_dict_no_method_raises(self):
+        # GH 26023
+        df = DataFrame({"ABC": [1]})
+        compression = {"some_option": True}
+        msg = "must have key 'method'"
+
+        with tm.ensure_clean("out.zip") as path:
+            with pytest.raises(ValueError, match=msg):
+                df.to_csv(path, compression=compression)
+
+    @pytest.mark.parametrize("compression", ["zip", "infer"])
+    @pytest.mark.parametrize(
+        "archive_name", [None, "test_to_csv.csv", "test_to_csv.zip"]
+    )
+    def test_to_csv_zip_arguments(self, compression, archive_name):
+        # GH 26023
+        from zipfile import ZipFile
+
+        df = DataFrame({"ABC": [1]})
+        with tm.ensure_clean("to_csv_archive_name.zip") as path:
+            df.to_csv(
+                path, compression={"method": compression, "archive_name": archive_name}
+            )
+            zp = ZipFile(path)
+            expected_arcname = path if archive_name is None else archive_name
+            expected_arcname = os.path.basename(expected_arcname)
+            assert len(zp.filelist) == 1
+            archived_file = os.path.basename(zp.filelist[0].filename)
+            assert archived_file == expected_arcname

From a1bdacfaf0693336b957b1bd3821f15c05120aff Mon Sep 17 00:00:00 2001
From: Katrin Leinweber <9948149+katrinleinweber@users.noreply.github.com>
Date: Mon, 26 Aug 2019 18:37:14 +0200
Subject: [PATCH 19/95] DOC: Harmonize column selection to bracket notation
 (#27562)

* Harmonize column selection to bracket notation

As suggested by
https://medium.com/dunder-data/minimally-sufficient-pandas-a8e67f2a2428#46f9
---
 doc/source/getting_started/10min.rst          |  2 +-
 doc/source/getting_started/basics.rst         | 12 +++---
 .../comparison/comparison_with_r.rst          |  8 ++--
 doc/source/user_guide/advanced.rst            |  2 +-
 doc/source/user_guide/cookbook.rst            |  6 +--
 doc/source/user_guide/enhancingperf.rst       | 12 +++---
 doc/source/user_guide/indexing.rst            | 39 ++++++++++---------
 doc/source/user_guide/reshaping.rst           | 10 ++---
 doc/source/user_guide/visualization.rst       | 14 +++----
 9 files changed, 54 insertions(+), 51 deletions(-)

diff --git a/doc/source/getting_started/10min.rst b/doc/source/getting_started/10min.rst
index 9045e5b32c29fe..41520795bde62e 100644
--- a/doc/source/getting_started/10min.rst
+++ b/doc/source/getting_started/10min.rst
@@ -278,7 +278,7 @@ Using a single column's values to select data.

 .. ipython:: python

-   df[df.A > 0]
+   df[df['A'] > 0]

 Selecting values from a DataFrame where a boolean condition is met.

diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst
index 3f6f56376861fd..802ffadf2a81ef 100644
--- a/doc/source/getting_started/basics.rst
+++ b/doc/source/getting_started/basics.rst
@@ -926,7 +926,7 @@ Single aggregations on a ``Series`` this will return a scalar value:

 .. ipython:: python

-   tsdf.A.agg('sum')
+   tsdf['A'].agg('sum')


 Aggregating with multiple functions
@@ -950,13 +950,13 @@ On a ``Series``, multiple functions return a ``Series``, indexed by the function

 .. ipython:: python

-   tsdf.A.agg(['sum', 'mean'])
+   tsdf['A'].agg(['sum', 'mean'])

 Passing a ``lambda`` function will yield a ``<lambda>`` named row:

 ..
ipython:: python - tsdf.A.agg(['sum', lambda x: x.mean()]) + tsdf['A'].agg(['sum', lambda x: x.mean()]) Passing a named function will yield that name for the row: @@ -965,7 +965,7 @@ Passing a named function will yield that name for the row: def mymean(x): return x.mean() - tsdf.A.agg(['sum', mymean]) + tsdf['A'].agg(['sum', mymean]) Aggregating with a dict +++++++++++++++++++++++ @@ -1065,7 +1065,7 @@ Passing a single function to ``.transform()`` with a ``Series`` will yield a sin .. ipython:: python - tsdf.A.transform(np.abs) + tsdf['A'].transform(np.abs) Transform with multiple functions @@ -1084,7 +1084,7 @@ resulting column names will be the transforming functions. .. ipython:: python - tsdf.A.transform([np.abs, lambda x: x + 1]) + tsdf['A'].transform([np.abs, lambda x: x + 1]) Transforming with a dict diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst index 444e886bc951d2..f67f46fc2b29ba 100644 --- a/doc/source/getting_started/comparison/comparison_with_r.rst +++ b/doc/source/getting_started/comparison/comparison_with_r.rst @@ -81,7 +81,7 @@ R pandas =========================================== =========================================== ``select(df, col_one = col1)`` ``df.rename(columns={'col1': 'col_one'})['col_one']`` ``rename(df, col_one = col1)`` ``df.rename(columns={'col1': 'col_one'})`` -``mutate(df, c=a-b)`` ``df.assign(c=df.a-df.b)`` +``mutate(df, c=a-b)`` ``df.assign(c=df['a']-df['b'])`` =========================================== =========================================== @@ -258,8 +258,8 @@ index/slice as well as standard boolean indexing: df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) df.query('a <= b') - df[df.a <= df.b] - df.loc[df.a <= df.b] + df[df['a'] <= df['b']] + df.loc[df['a'] <= df['b']] For more details and examples see :ref:`the query documentation `. @@ -284,7 +284,7 @@ In ``pandas`` the equivalent expression, using the df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) df.eval('a + b') - df.a + df.b # same as the previous expression + df['a'] + df['b'] # same as the previous expression In certain cases :meth:`~pandas.DataFrame.eval` will be much faster than evaluation in pure Python. For more details and examples see :ref:`the eval diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 22a9791ffde30e..62a9b6396404a7 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -738,7 +738,7 @@ and allows efficient indexing and storage of an index with a large number of dup df['B'] = df['B'].astype(CategoricalDtype(list('cab'))) df df.dtypes - df.B.cat.categories + df['B'].cat.categories Setting the index will create a ``CategoricalIndex``. diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 15af5208a4f1f3..c9d3bc3a28c704 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -592,8 +592,8 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. 
ipython:: python

     df = pd.DataFrame([0, 1, 0, 1, 1, 1, 0, 1, 1], columns=['A'])
-    df.A.groupby((df.A != df.A.shift()).cumsum()).groups
-    df.A.groupby((df.A != df.A.shift()).cumsum()).cumsum()
+    df['A'].groupby((df['A'] != df['A'].shift()).cumsum()).groups
+    df['A'].groupby((df['A'] != df['A'].shift()).cumsum()).cumsum()

 Expanding data
 **************
@@ -719,7 +719,7 @@ Rolling Apply to multiple columns where function calculates a Series before a Sc
     df

     def gm(df, const):
-        v = ((((df.A + df.B) + 1).cumprod()) - 1) * const
+        v = ((((df['A'] + df['B']) + 1).cumprod()) - 1) * const
         return v.iloc[-1]

     s = pd.Series({df.index[i]: gm(df.iloc[i:min(i + 51, len(df) - 1)], 5)
diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst
index a4eefadd54d8c4..2df5b9d82dcc37 100644
--- a/doc/source/user_guide/enhancingperf.rst
+++ b/doc/source/user_guide/enhancingperf.rst
@@ -393,15 +393,15 @@ Consider the following toy example of doubling each observation:
 .. code-block:: ipython

     # Custom function without numba
-    In [5]: %timeit df['col1_doubled'] = df.a.apply(double_every_value_nonumba)  # noqa E501
+    In [5]: %timeit df['col1_doubled'] = df['a'].apply(double_every_value_nonumba)  # noqa E501
     1000 loops, best of 3: 797 us per loop

     # Standard implementation (faster than a custom function)
-    In [6]: %timeit df['col1_doubled'] = df.a * 2
+    In [6]: %timeit df['col1_doubled'] = df['a'] * 2
     1000 loops, best of 3: 233 us per loop

     # Custom function with numba
-    In [7]: %timeit (df['col1_doubled'] = double_every_value_withnumba(df.a.to_numpy())
+    In [7]: %timeit df['col1_doubled'] = double_every_value_withnumba(df['a'].to_numpy())
     1000 loops, best of 3: 145 us per loop

 Caveats
@@ -643,8 +643,8 @@ The equivalent in standard Python would be

 .. ipython:: python

     df = pd.DataFrame(dict(a=range(5), b=range(5, 10)))
-    df['c'] = df.a + df.b
-    df['d'] = df.a + df.b + df.c
+    df['c'] = df['a'] + df['b']
+    df['d'] = df['a'] + df['b'] + df['c']
     df['a'] = 1
     df

@@ -688,7 +688,7 @@ name in an expression.

     a = np.random.randn()
     df.query('@a < a')
-    df.loc[a < df.a]  # same as the previous expression
+    df.loc[a < df['a']]  # same as the previous expression

 With :func:`pandas.eval` you cannot use the ``@`` prefix *at all*, because it
 isn't defined in that context. ``pandas`` will let you know this if you try to
diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst
index e3b75afcf945e2..cf55ce0c9a6d4e 100644
--- a/doc/source/user_guide/indexing.rst
+++ b/doc/source/user_guide/indexing.rst
@@ -210,7 +210,7 @@ as an attribute:

    See `here for an explanation of valid identifiers
    `__.

-   - The attribute will not be available if it conflicts with an existing method name, e.g. ``s.min`` is not allowed.
+   - The attribute will not be available if it conflicts with an existing method name, e.g. ``s.min`` is not allowed, but ``s['min']`` is possible.

    - Similarly, the attribute will not be available if it conflicts with any of the following list: ``index``,
      ``major_axis``, ``minor_axis``, ``items``.

@@ -540,7 +540,7 @@ The ``callable`` must be a function with one argument (the calling Series or Dat
                        columns=list('ABCD'))
     df1

-    df1.loc[lambda df: df.A > 0, :]
+    df1.loc[lambda df: df['A'] > 0, :]
     df1.loc[:, lambda df: ['A', 'B']]

     df1.iloc[:, lambda df: [0, 1]]

@@ -552,7 +552,7 @@ You can use callable indexing in ``Series``.

 ..
ipython:: python

-   df1.A.loc[lambda s: s > 0]
+   df1['A'].loc[lambda s: s > 0]

 Using these methods / indexers, you can chain data selection operations
 without using a temporary variable.

 .. ipython:: python

     bb = pd.read_csv('data/baseball.csv', index_col='id')
     (bb.groupby(['year', 'team']).sum()
-       .loc[lambda df: df.r > 100])
+       .loc[lambda df: df['r'] > 100])

 .. _indexing.deprecate_ix:

@@ -871,9 +871,9 @@ Boolean indexing
 Another common operation is the use of boolean vectors to filter the data.
 The operators are: ``|`` for ``or``, ``&`` for ``and``, and ``~`` for ``not``.
 These **must** be grouped by using parentheses, since by default Python will
-evaluate an expression such as ``df.A > 2 & df.B < 3`` as
-``df.A > (2 & df.B) < 3``, while the desired evaluation order is
-``(df.A > 2) & (df.B < 3)``.
+evaluate an expression such as ``df['A'] > 2 & df['B'] < 3`` as
+``df['A'] > (2 & df['B']) < 3``, while the desired evaluation order is
+``(df['A'] > 2) & (df['B'] < 3)``.

 Using a boolean vector to index a Series works exactly as in a NumPy ndarray:

@@ -1134,7 +1134,7 @@ between the values of columns ``a`` and ``c``. For example:

     df

     # pure python
-    df[(df.a < df.b) & (df.b < df.c)]
+    df[(df['a'] < df['b']) & (df['b'] < df['c'])]

     # query
     df.query('(a < b) & (b < c)')
@@ -1241,7 +1241,7 @@ Full numpy-like syntax:

     df = pd.DataFrame(np.random.randint(n, size=(n, 3)), columns=list('abc'))
     df
     df.query('(a < b) & (b < c)')
-    df[(df.a < df.b) & (df.b < df.c)]
+    df[(df['a'] < df['b']) & (df['b'] < df['c'])]

 Slightly nicer by removing the parentheses (by binding making comparison
 operators bind tighter than ``&`` and ``|``).
@@ -1279,12 +1279,12 @@ The ``in`` and ``not in`` operators

     df.query('a in b')

     # How you'd do it in pure Python
-    df[df.a.isin(df.b)]
+    df[df['a'].isin(df['b'])]

     df.query('a not in b')

     # pure Python
-    df[~df.a.isin(df.b)]
+    df[~df['a'].isin(df['b'])]

 You can combine this with other expressions for very succinct queries:

@@ -1297,7 +1297,7 @@ You can combine this with other expressions for very succinct queries:

     df.query('a in b and c < d')

     # pure Python
-    df[df.b.isin(df.a) & (df.c < df.d)]
+    df[df['b'].isin(df['a']) & (df['c'] < df['d'])]

 .. note::

@@ -1326,7 +1326,7 @@ to ``in``/``not in``.

     df.query('b == ["a", "b", "c"]')

     # pure Python
-    df[df.b.isin(["a", "b", "c"])]
+    df[df['b'].isin(["a", "b", "c"])]

     df.query('c == [1, 2]')

@@ -1338,7 +1338,7 @@ to ``in``/``not in``.

     df.query('[1, 2] not in c')

     # pure Python
-    df[df.c.isin([1, 2])]
+    df[df['c'].isin([1, 2])]


 Boolean operators
@@ -1352,7 +1352,7 @@ You can negate boolean expressions with the word ``not`` or the ``~`` operator.
df['bools'] = np.random.rand(len(df)) > 0.5 df.query('~bools') df.query('not bools') - df.query('not bools') == df[~df.bools] + df.query('not bools') == df[~df['bools']] Of course, expressions can be arbitrarily complex too: @@ -1362,7 +1362,10 @@ Of course, expressions can be arbitrarily complex too: shorter = df.query('a < b < c and (not bools) or bools > 2') # equivalent in pure Python - longer = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)] + longer = df[(df['a'] < df['b']) + & (df['b'] < df['c']) + & (~df['bools']) + | (df['bools'] > 2)] shorter longer @@ -1835,14 +1838,14 @@ chained indexing expression, you can set the :ref:`option ` # This will show the SettingWithCopyWarning # but the frame values will be set - dfb['c'][dfb.a.str.startswith('o')] = 42 + dfb['c'][dfb['a'].str.startswith('o')] = 42 This however is operating on a copy and will not work. :: >>> pd.set_option('mode.chained_assignment','warn') - >>> dfb[dfb.a.str.startswith('o')]['c'] = 42 + >>> dfb[dfb['a'].str.startswith('o')]['c'] = 42 Traceback (most recent call last) ... SettingWithCopyWarning: diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index f118fe84d523a6..dd6d3062a8f0ae 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -469,7 +469,7 @@ If ``crosstab`` receives only two Series, it will provide a frequency table. 'C': [1, 1, np.nan, 1, 1]}) df - pd.crosstab(df.A, df.B) + pd.crosstab(df['A'], df['B']) Any input passed containing ``Categorical`` data will have **all** of its categories included in the cross-tabulation, even if the actual data does @@ -489,13 +489,13 @@ using the ``normalize`` argument: .. ipython:: python - pd.crosstab(df.A, df.B, normalize=True) + pd.crosstab(df['A'], df['B'], normalize=True) ``normalize`` can also normalize values within each row or within each column: .. ipython:: python - pd.crosstab(df.A, df.B, normalize='columns') + pd.crosstab(df['A'], df['B'], normalize='columns') ``crosstab`` can also be passed a third ``Series`` and an aggregation function (``aggfunc``) that will be applied to the values of the third ``Series`` within @@ -503,7 +503,7 @@ each group defined by the first two ``Series``: .. ipython:: python - pd.crosstab(df.A, df.B, values=df.C, aggfunc=np.sum) + pd.crosstab(df['A'], df['B'], values=df['C'], aggfunc=np.sum) Adding margins ~~~~~~~~~~~~~~ @@ -512,7 +512,7 @@ Finally, one can also add margins or normalize this output. .. ipython:: python - pd.crosstab(df.A, df.B, values=df.C, aggfunc=np.sum, normalize=True, + pd.crosstab(df['A'], df['B'], values=df['C'], aggfunc=np.sum, normalize=True, margins=True) .. _reshaping.tile: diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index fdceaa5868cecd..fa16b2f2166105 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -1148,10 +1148,10 @@ To plot data on a secondary y-axis, use the ``secondary_y`` keyword: .. ipython:: python - df.A.plot() + df['A'].plot() @savefig series_plot_secondary_y.png - df.B.plot(secondary_y=True, style='g') + df['B'].plot(secondary_y=True, style='g') .. ipython:: python :suppress: @@ -1205,7 +1205,7 @@ Here is the default behavior, notice how the x-axis tick labeling is performed: plt.figure() @savefig ser_plot_suppress.png - df.A.plot() + df['A'].plot() .. 
ipython:: python :suppress: @@ -1219,7 +1219,7 @@ Using the ``x_compat`` parameter, you can suppress this behavior: plt.figure() @savefig ser_plot_suppress_parm.png - df.A.plot(x_compat=True) + df['A'].plot(x_compat=True) .. ipython:: python :suppress: @@ -1235,9 +1235,9 @@ in ``pandas.plotting.plot_params`` can be used in a `with statement`: @savefig ser_plot_suppress_context.png with pd.plotting.plot_params.use('x_compat', True): - df.A.plot(color='r') - df.B.plot(color='g') - df.C.plot(color='b') + df['A'].plot(color='r') + df['B'].plot(color='g') + df['C'].plot(color='b') .. ipython:: python :suppress: From 7528d088c9aa597174fbccbc1bddb9290ba2556e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 26 Aug 2019 18:10:26 +0100 Subject: [PATCH 20/95] TYPING: add stubs for _packer and _unpacker (#28135) --- pandas/io/msgpack/_packer.pyi | 22 ++++++++++++ pandas/io/msgpack/_unpacker.pyi | 59 +++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 pandas/io/msgpack/_packer.pyi create mode 100644 pandas/io/msgpack/_unpacker.pyi diff --git a/pandas/io/msgpack/_packer.pyi b/pandas/io/msgpack/_packer.pyi new file mode 100644 index 00000000000000..e95a1622c56153 --- /dev/null +++ b/pandas/io/msgpack/_packer.pyi @@ -0,0 +1,22 @@ +# flake8: noqa + +class Packer: + def __cinit__(self): ... + def __init__( + self, + default=..., + encoding=..., + unicode_errors=..., + use_single_float=..., + autoreset: int = ..., + use_bin_type: int = ..., + ): ... + def __dealloc__(self): ... + def _pack(self, o, nest_limit: int = ...) -> int: ... + def pack(self, obj): ... + def pack_ext_type(self, typecode, data): ... + def pack_array_header(self, size): ... + def pack_map_header(self, size): ... + def pack_map_pairs(self, pairs): ... + def reset(self) -> None: ... + def bytes(self): ... diff --git a/pandas/io/msgpack/_unpacker.pyi b/pandas/io/msgpack/_unpacker.pyi new file mode 100644 index 00000000000000..9910895947fb64 --- /dev/null +++ b/pandas/io/msgpack/_unpacker.pyi @@ -0,0 +1,59 @@ +# flake8: noqa + +def unpackb( + packed, + object_hook=..., + list_hook=..., + use_list=..., + encoding=..., + unicode_errors=..., + object_pairs_hook=..., + ext_hook=..., + max_str_len=..., + max_bin_len=..., + max_array_len=..., + max_map_len=..., + max_ext_len=..., +): ... +def unpack( + stream, + object_hook=..., + list_hook=..., + use_list=..., + encoding=..., + unicode_errors=..., + object_pairs_hook=..., +): ... + +class Unpacker: + def __cinit__(self): ... + def __dealloc__(self): ... + def __init__( + self, + file_like=..., + read_size=..., + use_list=..., + object_hook=..., + object_pairs_hook=..., + list_hook=..., + encoding=..., + unicode_errors=..., + max_buffer_size: int = ..., + ext_hook=..., + max_str_len=..., + max_bin_len=..., + max_array_len=..., + max_map_len=..., + max_ext_len=..., + ): ... + def feed(self, next_bytes): ... + def append_buffer(self, _buf, _buf_len): ... + def read_from_file(self): ... + def _unpack(self, execute, write_bytes, iter=...): ... + def read_bytes(self, nbytes): ... + def unpack(self, write_bytes=...): ... + def skip(self, write_bytes=...): ... + def read_array_header(self, write_bytes=...): ... + def read_map_header(self, write_bytes=...): ... + def __iter__(self): ... + def __next__(self): ... 
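The stubs above record signatures only; a minimal sketch of the API they describe, assuming ``pandas.io.msgpack`` re-exports ``Packer`` and ``Unpacker`` the way upstream msgpack-python does (method names and keyword arguments below are taken from the stubs themselves):

    from pandas.io.msgpack import Packer, Unpacker

    # Packer.pack() serializes one object; with the default ``autoreset``
    # the packed bytes are returned directly.
    payload = Packer(encoding="utf-8").pack({"a": 1, "b": [1, 2, 3]})

    # Unpacker is a streaming decoder: feed() buffers raw bytes, and
    # iterating (the stubbed __iter__/__next__) yields decoded objects.
    unpacker = Unpacker(encoding="utf-8", use_list=True)
    unpacker.feed(payload)
    for obj in unpacker:
        print(obj)  # {'a': 1, 'b': [1, 2, 3]}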
From bca39a72b073758d3cfa7afa470462255f1bc066 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 26 Aug 2019 10:53:59 -0700 Subject: [PATCH 21/95] Run clang-format on objToJSON (#28144) --- pandas/_libs/src/ujson/python/objToJSON.c | 381 +++++++++++----------- 1 file changed, 188 insertions(+), 193 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index de336fb3aa1dcb..4b612bb033761d 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -16,18 +16,19 @@ derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) https://github.com/client9/stringencoders -Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights +reserved. 
Numeric decoder derived from from TCL library http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms @@ -64,9 +65,9 @@ typedef void *(*PFN_PyTypeToJSON)(JSOBJ obj, JSONTypeContext *ti, typedef struct __NpyArrContext { PyObject *array; char *dataptr; - int curdim; // current dimension in array's order - int stridedim; // dimension we are striding over - int inc; // stride dimension increment (+/- 1) + int curdim; // current dimension in array's order + int stridedim; // dimension we are striding over + int inc; // stride dimension increment (+/- 1) npy_intp dim; npy_intp stride; npy_intp ndim; @@ -83,8 +84,8 @@ typedef struct __PdBlockContext { int ncols; int transpose; - int *cindices; // frame column -> block column map - NpyArrContext **npyCtxts; // NpyArrContext for each column + int *cindices; // frame column -> block column map + NpyArrContext **npyCtxts; // NpyArrContext for each column } PdBlockContext; typedef struct __TypeContext { @@ -148,13 +149,12 @@ enum PANDAS_FORMAT { SPLIT, RECORDS, INDEX, COLUMNS, VALUES }; int PdBlock_iterNext(JSOBJ, JSONTypeContext *); -void *initObjToJSON(void) -{ +void *initObjToJSON(void) { PyObject *mod_pandas; PyObject *mod_nattype; PyObject *mod_decimal = PyImport_ImportModule("decimal"); type_decimal = - (PyTypeObject *)PyObject_GetAttrString(mod_decimal, "Decimal"); + (PyTypeObject *)PyObject_GetAttrString(mod_decimal, "Decimal"); Py_DECREF(mod_decimal); PyDateTime_IMPORT; @@ -167,14 +167,14 @@ void *initObjToJSON(void) cls_series = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Series"); cls_timestamp = PyObject_GetAttrString(mod_pandas, "Timestamp"); - cls_timedelta = PyObject_GetAttrString(mod_pandas, "Timedelta"); + cls_timedelta = PyObject_GetAttrString(mod_pandas, "Timedelta"); Py_DECREF(mod_pandas); } mod_nattype = PyImport_ImportModule("pandas._libs.tslibs.nattype"); if (mod_nattype) { - cls_nat = (PyTypeObject *)PyObject_GetAttrString(mod_nattype, - "NaTType"); + cls_nat = + (PyTypeObject *)PyObject_GetAttrString(mod_nattype, "NaTType"); Py_DECREF(mod_nattype); } @@ -212,7 +212,6 @@ static TypeContext *createTypeContext(void) { return pc; } - static int is_sparse_array(PyObject *obj) { // TODO can be removed again once SparseArray.values is removed (GH26421) if (PyObject_HasAttrString(obj, "_subtyp")) { @@ -227,7 +226,6 @@ static int is_sparse_array(PyObject *obj) { return 0; } - static PyObject *get_values(PyObject *obj) { PyObject *values = NULL; @@ -242,7 +240,8 @@ static PyObject *get_values(PyObject *obj) { values = PyObject_CallMethod(values, "to_numpy", NULL); } - if (!is_sparse_array(values) && PyObject_HasAttrString(values, "values")) { + if (!is_sparse_array(values) && + PyObject_HasAttrString(values, "values")) { PyObject *subvals = get_values(values); PyErr_Clear(); PRINTMARK(); @@ -357,20 +356,20 @@ static Py_ssize_t get_attr_length(PyObject *obj, char *attr) { } static npy_int64 get_long_attr(PyObject *o, const char *attr) { - npy_int64 long_val; - PyObject *value = PyObject_GetAttrString(o, attr); - long_val = (PyLong_Check(value) ? - PyLong_AsLongLong(value) : PyLong_AsLong(value)); - Py_DECREF(value); - return long_val; + npy_int64 long_val; + PyObject *value = PyObject_GetAttrString(o, attr); + long_val = + (PyLong_Check(value) ? 
PyLong_AsLongLong(value) : PyLong_AsLong(value)); + Py_DECREF(value); + return long_val; } static npy_float64 total_seconds(PyObject *td) { - npy_float64 double_val; - PyObject *value = PyObject_CallMethod(td, "total_seconds", NULL); - double_val = PyFloat_AS_DOUBLE(value); - Py_DECREF(value); - return double_val; + npy_float64 double_val; + PyObject *value = PyObject_CallMethod(td, "total_seconds", NULL); + double_val = PyFloat_AS_DOUBLE(value); + Py_DECREF(value); + return double_val; } static PyObject *get_item(PyObject *obj, Py_ssize_t i) { @@ -450,7 +449,7 @@ static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, if (PyUnicode_IS_COMPACT_ASCII(obj)) { Py_ssize_t len; - char *data = (char*)PyUnicode_AsUTF8AndSize(obj, &len); + char *data = (char *)PyUnicode_AsUTF8AndSize(obj, &len); *_outLen = len; return data; } @@ -505,7 +504,7 @@ static void *NpyDateTimeScalarToJSON(JSOBJ _obj, JSONTypeContext *tc, // TODO(anyone): Does not appear to be reached in tests. pandas_datetime_to_datetimestruct(obj->obval, - (NPY_DATETIMEUNIT)obj->obmeta.base, &dts); + (NPY_DATETIMEUNIT)obj->obmeta.base, &dts); return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); } @@ -664,9 +663,9 @@ void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { GET_TC(tc)->npyarr = npyarr; if (!npyarr) { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; + PyErr_NoMemory(); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + return; } npyarr->array = (PyObject *)obj; @@ -677,17 +676,17 @@ void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { npyarr->type_num = PyArray_DESCR(obj)->type_num; if (GET_TC(tc)->transpose) { - npyarr->dim = PyArray_DIM(obj, npyarr->ndim); - npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); - npyarr->stridedim = npyarr->ndim; - npyarr->index[npyarr->ndim] = 0; - npyarr->inc = -1; + npyarr->dim = PyArray_DIM(obj, npyarr->ndim); + npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); + npyarr->stridedim = npyarr->ndim; + npyarr->index[npyarr->ndim] = 0; + npyarr->inc = -1; } else { - npyarr->dim = PyArray_DIM(obj, 0); - npyarr->stride = PyArray_STRIDE(obj, 0); - npyarr->stridedim = 0; - npyarr->index[0] = 0; - npyarr->inc = 1; + npyarr->dim = PyArray_DIM(obj, 0); + npyarr->stride = PyArray_STRIDE(obj, 0); + npyarr->stridedim = 0; + npyarr->index[0] = 0; + npyarr->inc = 1; } npyarr->columnLabels = GET_TC(tc)->columnLabels; @@ -735,8 +734,7 @@ int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { NpyArr_freeItemValue(obj, tc); - if (PyArray_ISDATETIME(npyarr->array)) - { + if (PyArray_ISDATETIME(npyarr->array)) { PRINTMARK(); GET_TC(tc)->itemValue = obj; Py_INCREF(obj); @@ -797,10 +795,10 @@ char *NpyArr_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { idx = npyarr->index[npyarr->stridedim] - 1; - cStr = npyarr->columnLabels[idx]; + cStr = npyarr->columnLabels[idx]; } else { idx = npyarr->index[npyarr->stridedim - npyarr->inc] - 1; - cStr = npyarr->rowLabels[idx]; + cStr = npyarr->rowLabels[idx]; } *outLen = strlen(cStr); @@ -852,13 +850,13 @@ char *PdBlock_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) { idx = blkCtxt->colIdx - 1; - cStr = npyarr->columnLabels[idx]; + cStr = npyarr->columnLabels[idx]; } else { idx = GET_TC(tc)->iterNext != PdBlock_iterNext ? 
npyarr->index[npyarr->stridedim - npyarr->inc] - 1 : npyarr->index[npyarr->stridedim]; - cStr = npyarr->rowLabels[idx]; + cStr = npyarr->rowLabels[idx]; } *outLen = strlen(cStr); @@ -875,10 +873,10 @@ char *PdBlock_iterGetName_Transpose(JSOBJ obj, JSONTypeContext *tc, if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { idx = npyarr->index[npyarr->stridedim] - 1; - cStr = npyarr->columnLabels[idx]; + cStr = npyarr->columnLabels[idx]; } else { idx = blkCtxt->colIdx; - cStr = npyarr->rowLabels[idx]; + cStr = npyarr->rowLabels[idx]; } *outLen = strlen(cStr); @@ -943,9 +941,9 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { dtype = PyArray_DescrFromType(NPY_INT64); obj = (PyObject *)_obj; - GET_TC(tc) - ->iterGetName = GET_TC(tc)->transpose ? PdBlock_iterGetName_Transpose - : PdBlock_iterGetName; + GET_TC(tc)->iterGetName = GET_TC(tc)->transpose + ? PdBlock_iterGetName_Transpose + : PdBlock_iterGetName; blkCtxt = PyObject_Malloc(sizeof(PdBlockContext)); if (!blkCtxt) { @@ -1396,7 +1394,7 @@ void Series_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); - enc->outputFormat = VALUES; // for contained series + enc->outputFormat = VALUES; // for contained series if (!GET_TC(tc)->cStr) { PyErr_NoMemory(); } @@ -1455,7 +1453,7 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); - enc->outputFormat = VALUES; // for contained series & index + enc->outputFormat = VALUES; // for contained series & index if (!GET_TC(tc)->cStr) { PyErr_NoMemory(); } @@ -1634,115 +1632,116 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, type_num = PyArray_TYPE(labels); for (i = 0; i < num; i++) { - item = PyArray_GETITEM(labels, dataptr); + item = PyArray_GETITEM(labels, dataptr); if (!item) { - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - // TODO: for any matches on type_num (date and timedeltas) should use a - // vectorized solution to convert to epoch or iso formats - if (enc->datetimeIso && (type_num == NPY_TIMEDELTA || PyDelta_Check(item))) { - PyObject *td = PyObject_CallFunction(cls_timedelta, "(O)", item); - if (td == NULL) { - Py_DECREF(item); NpyArr_freeLabels(ret, num); ret = 0; break; - } - - PyObject *iso = PyObject_CallMethod(td, "isoformat", NULL); - Py_DECREF(td); - if (iso == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - cLabel = (char *)PyUnicode_AsUTF8(iso); - Py_DECREF(iso); - len = strlen(cLabel); - } - else if (PyTypeNum_ISDATETIME(type_num) || - PyDateTime_Check(item) || PyDate_Check(item)) { - PyObject *ts = PyObject_CallFunction(cls_timestamp, "(O)", item); - if (ts == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - if (enc->datetimeIso) { - PyObject *iso = PyObject_CallMethod(ts, "isoformat", NULL); - Py_DECREF(ts); - if (iso == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - cLabel = (char *)PyUnicode_AsUTF8(iso); - Py_DECREF(iso); - len = strlen(cLabel); - } else { - npy_int64 value; - // TODO: refactor to not duplicate what goes on in beginTypeContext - if (PyObject_HasAttrString(ts, "value")) { - PRINTMARK(); - value = get_long_attr(ts, "value"); - } else { - PRINTMARK(); - value = - total_seconds(ts) * 1000000000LL; // nanoseconds per second - } - Py_DECREF(ts); - - 
switch (enc->datetimeUnit) { - case NPY_FR_ns: - break; - case NPY_FR_us: - value /= 1000LL; - break; - case NPY_FR_ms: - value /= 1000000LL; - break; - case NPY_FR_s: - value /= 1000000000LL; - break; - default: - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - char buf[21] = {0}; // 21 chars for 2**63 as string - cLabel = buf; - sprintf(buf, "%" NPY_INT64_FMT, value); - len = strlen(cLabel); - } - } else { // Fallack to string representation - PyObject *str = PyObject_Str(item); - if (str == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - cLabel = (char *)PyUnicode_AsUTF8(str); - Py_DECREF(str); - len = strlen(cLabel); - } - - Py_DECREF(item); - // Add 1 to include NULL terminator - ret[i] = PyObject_Malloc(len + 1); - memcpy(ret[i], cLabel, len + 1); + } + + // TODO: for any matches on type_num (date and timedeltas) should use a + // vectorized solution to convert to epoch or iso formats + if (enc->datetimeIso && + (type_num == NPY_TIMEDELTA || PyDelta_Check(item))) { + PyObject *td = PyObject_CallFunction(cls_timedelta, "(O)", item); + if (td == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + PyObject *iso = PyObject_CallMethod(td, "isoformat", NULL); + Py_DECREF(td); + if (iso == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + cLabel = (char *)PyUnicode_AsUTF8(iso); + Py_DECREF(iso); + len = strlen(cLabel); + } else if (PyTypeNum_ISDATETIME(type_num) || PyDateTime_Check(item) || + PyDate_Check(item)) { + PyObject *ts = PyObject_CallFunction(cls_timestamp, "(O)", item); + if (ts == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + if (enc->datetimeIso) { + PyObject *iso = PyObject_CallMethod(ts, "isoformat", NULL); + Py_DECREF(ts); + if (iso == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + cLabel = (char *)PyUnicode_AsUTF8(iso); + Py_DECREF(iso); + len = strlen(cLabel); + } else { + npy_int64 value; + // TODO: refactor to not duplicate what goes on in + // beginTypeContext + if (PyObject_HasAttrString(ts, "value")) { + PRINTMARK(); + value = get_long_attr(ts, "value"); + } else { + PRINTMARK(); + value = total_seconds(ts) * + 1000000000LL; // nanoseconds per second + } + Py_DECREF(ts); + + switch (enc->datetimeUnit) { + case NPY_FR_ns: + break; + case NPY_FR_us: + value /= 1000LL; + break; + case NPY_FR_ms: + value /= 1000000LL; + break; + case NPY_FR_s: + value /= 1000000000LL; + break; + default: + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + char buf[21] = {0}; // 21 chars for 2**63 as string + cLabel = buf; + sprintf(buf, "%" NPY_INT64_FMT, value); + len = strlen(cLabel); + } + } else { // Fallack to string representation + PyObject *str = PyObject_Str(item); + if (str == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + cLabel = (char *)PyUnicode_AsUTF8(str); + Py_DECREF(str); + len = strlen(cLabel); + } + + Py_DECREF(item); + // Add 1 to include NULL terminator + ret[i] = PyObject_Malloc(len + 1); + memcpy(ret[i], cLabel, len + 1); if (PyErr_Occurred()) { NpyArr_freeLabels(ret, num); @@ -1923,23 +1922,22 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { value = get_long_attr(obj, "value"); } else { PRINTMARK(); - value = - total_seconds(obj) * 1000000000LL; // nanoseconds per second + value = total_seconds(obj) * 1000000000LL; // nanoseconds per second } base = ((PyObjectEncoder 
*)tc->encoder)->datetimeUnit; switch (base) { - case NPY_FR_ns: - break; - case NPY_FR_us: - value /= 1000LL; - break; - case NPY_FR_ms: - value /= 1000000LL; - break; - case NPY_FR_s: - value /= 1000000000LL; - break; + case NPY_FR_ns: + break; + case NPY_FR_us: + value /= 1000LL; + break; + case NPY_FR_ms: + value /= 1000000LL; + break; + case NPY_FR_s: + value /= 1000000000LL; + break; } exc = PyErr_Occurred(); @@ -2054,8 +2052,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - enc, + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->columnLabelsLen); if (!pc->columnLabels) { goto INVALID; @@ -2157,8 +2154,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - enc, + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->columnLabelsLen); Py_DECREF(tmpObj); if (!pc->columnLabels) { @@ -2179,9 +2175,8 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } pc->rowLabelsLen = PyObject_Size(tmpObj); - pc->rowLabels = - NpyArr_encodeLabels((PyArrayObject *)values, - enc, pc->rowLabelsLen); + pc->rowLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, + pc->rowLabelsLen); Py_DECREF(tmpObj); tmpObj = (enc->outputFormat == INDEX ? PyObject_GetAttrString(obj, "columns") @@ -2199,8 +2194,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - enc, + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->columnLabelsLen); Py_DECREF(tmpObj); if (!pc->columnLabels) { @@ -2325,7 +2319,8 @@ void Object_endTypeContext(JSOBJ obj, JSONTypeContext *tc) { PyObject_Free(GET_TC(tc)->cStr); GET_TC(tc)->cStr = NULL; - if (tc->prv != &(((PyObjectEncoder *)tc->encoder)->basicTypeContext)) { // NOLINT + if (tc->prv != + &(((PyObjectEncoder *)tc->encoder)->basicTypeContext)) { // NOLINT PyObject_Free(tc->prv); } tc->prv = NULL; @@ -2388,7 +2383,7 @@ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) { PyObject *newobj; PyObject *oinput = NULL; PyObject *oensureAscii = NULL; - int idoublePrecision = 10; // default double precision setting + int idoublePrecision = 10; // default double precision setting PyObject *oencodeHTMLChars = NULL; char *sOrient = NULL; char *sdateFormat = NULL; @@ -2411,10 +2406,10 @@ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) { PyObject_Malloc, PyObject_Realloc, PyObject_Free, - -1, // recursionMax + -1, // recursionMax idoublePrecision, - 1, // forceAscii - 0, // encodeHTMLChars + 1, // forceAscii + 0, // encodeHTMLChars }}; JSONObjectEncoder *encoder = (JSONObjectEncoder *)&pyEncoder; From 87d26bafcdb2495f8a9e76489d3438b1571beb05 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 26 Aug 2019 13:14:06 -0700 Subject: [PATCH 22/95] PERF: replace with list, closes #28084 (#28099) --- asv_bench/benchmarks/replace.py | 17 +++++++++++++++++ pandas/core/internals/blocks.py | 22 +++++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py index 6137e944e6b9e3..f69ae150285255 100644 --- a/asv_bench/benchmarks/replace.py +++ 
b/asv_bench/benchmarks/replace.py @@ -36,6 +36,23 @@ def time_replace_series(self, inplace): self.s.replace(self.to_rep, inplace=inplace) +class ReplaceList: + # GH#28099 + + params = [(True, False)] + param_names = ["inplace"] + + def setup(self, inplace): + self.df = pd.DataFrame({"A": 0, "B": 0}, index=range(4 * 10 ** 7)) + + def time_replace_list(self, inplace): + self.df.replace([np.inf, -np.inf], np.nan, inplace=inplace) + + def time_replace_list_one_match(self, inplace): + # the 1 can be held in self._df.blocks[0], while the inf and -inf cant + self.df.replace([np.inf, -np.inf, 1], np.nan, inplace=inplace) + + class Convert: params = (["DataFrame", "Series"], ["Timestamp", "Timedelta"]) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f0ee56f403325a..a2a51881016a38 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -743,6 +743,26 @@ def replace( return [self] return [self.copy()] + to_replace = [x for x in to_replace if self._can_hold_element(x)] + if not len(to_replace): + # GH#28084 avoid costly checks since we can infer + # that there is nothing to replace in this block + if inplace: + return [self] + return [self.copy()] + + if len(to_replace) == 1: + # _can_hold_element checks have reduced this back to the + # scalar case and we can avoid a costly object cast + return self.replace( + to_replace[0], + value, + inplace=inplace, + filter=filter, + regex=regex, + convert=convert, + ) + # GH 22083, TypeError or ValueError occurred within error handling # causes infinite loop. Cast and retry only if not objectblock. if is_object_dtype(self): @@ -751,7 +771,7 @@ def replace( # try again with a compatible block block = self.astype(object) return block.replace( - to_replace=original_to_replace, + to_replace=to_replace, value=value, inplace=inplace, filter=filter, From 7deda218435e787275e5899162b482001df85684 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 26 Aug 2019 15:56:57 -0500 Subject: [PATCH 23/95] DOC: whatsnew for 28099 (#28154) --- doc/source/whatsnew/v1.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 2bfc09e52c68b5..7fe358d3820f23 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -76,6 +76,7 @@ Performance improvements - Performance improvement in indexing with a non-unique :class:`IntervalIndex` (:issue:`27489`) - Performance improvement in `MultiIndex.is_monotonic` (:issue:`27495`) - Performance improvement in :func:`cut` when ``bins`` is an :class:`IntervalIndex` (:issue:`27668`) +- Performance improvement in :meth:`DataFrame.replace` when provided a list of values to replace (:issue:`28099`) .. _whatsnew_1000.bug_fixes: From 9f48098a021c7b744ff4604b605de7b99c7e62f4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 26 Aug 2019 17:49:08 -0500 Subject: [PATCH 24/95] DOC: Set 1.0.0 in index.rst (#28149) --- doc/source/index.rst.template | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index b57ce83cfc33c9..f5669626aa2b31 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -39,7 +39,7 @@ See the :ref:`overview` for more detail about what's in the library. 
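A minimal sketch of the call pattern that benefits from the ``Block.replace`` fast path added two patches above, using only public pandas API: on integer blocks, ``_can_hold_element`` filters the ``inf`` targets out of ``to_replace``, so ``replace`` can return early instead of retrying with a costly object cast.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"A": 0, "B": 0}, index=range(10))
    # no int64 block can hold inf, so there is nothing to replace and
    # the object-dtype retry is skipped entirely
    result = df.replace([np.inf, -np.inf], np.nan)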
:hidden: {% endif %} {% if not single_doc %} - What's New in 0.25.0 + What's New in 1.0.0 install getting_started/index user_guide/index @@ -53,7 +53,7 @@ See the :ref:`overview` for more detail about what's in the library. whatsnew/index {% endif %} -* :doc:`whatsnew/v0.25.0` +* :doc:`whatsnew/v1.0.0` * :doc:`install` * :doc:`getting_started/index` From 294a22c0baa2e024d12f70705c4ec85f4c82b2b0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 26 Aug 2019 18:11:05 -0500 Subject: [PATCH 25/95] BUG: Fix groupby quantile array (#28113) --- doc/source/whatsnew/v0.25.2.rst | 3 +-- pandas/core/groupby/groupby.py | 4 ++-- pandas/tests/groupby/test_function.py | 18 ++++++++++++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index 403c02c3ff129d..6974c7521a2376 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -76,8 +76,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- -- +- Bug incorrectly raising an ``IndexError`` when passing a list of quantiles to :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` (:issue:`28113`). - - - diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 3eeecd9c149e1b..87047d21709927 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1947,8 +1947,8 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: arrays = [] for i in range(self.ngroups): - arr = arr + i - arrays.append(arr) + arr2 = arr + i + arrays.append(arr2) indices = np.concatenate(arrays) assert len(indices) == len(result) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 509d7c33b643b5..d89233f2fd603c 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1257,6 +1257,24 @@ def test_quantile_array(): tm.assert_frame_equal(result, expected) +def test_quantile_array2(): + # https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959 + df = pd.DataFrame( + np.random.RandomState(0).randint(0, 5, size=(10, 3)), columns=list("ABC") + ) + result = df.groupby("A").quantile([0.3, 0.7]) + expected = pd.DataFrame( + { + "B": [0.9, 2.1, 2.2, 3.4, 1.6, 2.4, 2.3, 2.7, 0.0, 0.0], + "C": [1.2, 2.8, 1.8, 3.0, 0.0, 0.0, 1.9, 3.1, 3.0, 3.0], + }, + index=pd.MultiIndex.from_product( + [[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None] + ), + ) + tm.assert_frame_equal(result, expected) + + def test_quantile_array_no_sort(): df = pd.DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]}) result = df.groupby([1, 0, 1], sort=False).quantile([0.25, 0.5, 0.75]) From ddfc9a232f605e935c06efebdc0830d2b14dfdd5 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 27 Aug 2019 00:16:13 +0100 Subject: [PATCH 26/95] TYPING: --disallow-any-expr for HTMLFormatter.__init__ (#28140) --- pandas/io/formats/format.py | 6 ++++-- pandas/io/formats/html.py | 8 ++++---- pandas/io/formats/latex.py | 5 +++-- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 61af935bd82276..8ff4b9bda0430a 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -549,7 +549,8 @@ def __init__( decimal: str = ".", table_id: Optional[str] = None, render_links: bool = False, - **kwds + bold_rows: bool = False, + escape: bool = True, ): self.frame = frame self.show_index_names = index_names @@ -580,7 +581,8 @@ def __init__( else: self.justify = justify 
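        # The removal just below drops the untyped ``**kwds`` catch-all in
        # favour of the explicit ``bold_rows``/``escape`` keywords added above,
        # so ``--disallow-any-expr`` can actually check them. A minimal sketch
        # of the pattern, with illustrative names only:
        #
        #     def fmt(frame, **kwds):                  # kwds is Dict[str, Any]
        #         bold = kwds.get("bold_rows", False)  # inferred as Any
        #
        #     def fmt(frame, bold_rows: bool = False):
        #         bold = bold_rows                     # checked as bool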
- self.kwds = kwds + self.bold_rows = bold_rows + self.escape = escape if columns is not None: self.columns = ensure_index(columns) diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 4b44893df70ed5..8c4a7f4a1213d9 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -37,7 +37,7 @@ class HTMLFormatter(TableFormatter): def __init__( self, formatter: DataFrameFormatter, - classes: Optional[Union[str, List, Tuple]] = None, + classes: Optional[Union[str, List[str], Tuple[str, ...]]] = None, border: Optional[int] = None, ) -> None: self.fmt = formatter @@ -46,11 +46,11 @@ def __init__( self.frame = self.fmt.frame self.columns = self.fmt.tr_frame.columns self.elements = [] # type: List[str] - self.bold_rows = self.fmt.kwds.get("bold_rows", False) - self.escape = self.fmt.kwds.get("escape", True) + self.bold_rows = self.fmt.bold_rows + self.escape = self.fmt.escape self.show_dimensions = self.fmt.show_dimensions if border is None: - border = get_option("display.html.border") + border = cast(int, get_option("display.html.border")) self.border = border self.table_id = self.fmt.table_id self.render_links = self.fmt.render_links diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index c60e15b733f0a9..4c4d5ec73269a5 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -39,12 +39,13 @@ def __init__( ): self.fmt = formatter self.frame = self.fmt.frame - self.bold_rows = self.fmt.kwds.get("bold_rows", False) + self.bold_rows = self.fmt.bold_rows self.column_format = column_format self.longtable = longtable self.multicolumn = multicolumn self.multicolumn_format = multicolumn_format self.multirow = multirow + self.escape = self.fmt.escape def write_result(self, buf: IO[str]) -> None: """ @@ -142,7 +143,7 @@ def pad_empties(x): buf.write("\\endfoot\n\n") buf.write("\\bottomrule\n") buf.write("\\endlastfoot\n") - if self.fmt.kwds.get("escape", True): + if self.escape: # escape backslashes first crow = [ ( From 357774695a4caf7b83506686f4c29cc38d2b9726 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 26 Aug 2019 16:39:12 -0700 Subject: [PATCH 27/95] CLN: small ops optimizations (#28036) --- pandas/core/frame.py | 28 +++++++++++++++++----------- pandas/core/ops/__init__.py | 12 ++++++------ pandas/core/ops/array_ops.py | 12 ++++++------ pandas/core/ops/missing.py | 4 ++-- pandas/core/sparse/frame.py | 2 +- 5 files changed, 32 insertions(+), 26 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9da7999724a186..f636bb6db74309 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5298,12 +5298,19 @@ def _combine_frame(self, other, func, fill_value=None, level=None): this, other = self.align(other, join="outer", level=level, copy=False) new_index, new_columns = this.index, this.columns - def _arith_op(left, right): - # for the mixed_type case where we iterate over columns, - # _arith_op(left, right) is equivalent to - # left._binop(right, func, fill_value=fill_value) - left, right = ops.fill_binop(left, right, fill_value) - return func(left, right) + if fill_value is None: + # since _arith_op may be called in a loop, avoid function call + # overhead if possible by doing this check once + _arith_op = func + + else: + + def _arith_op(left, right): + # for the mixed_type case where we iterate over columns, + # _arith_op(left, right) is equivalent to + # left._binop(right, func, fill_value=fill_value) + left, right = ops.fill_binop(left, right, fill_value) + return func(left, right) if 
ops.should_series_dispatch(this, other, func): # iterate over columns @@ -5318,7 +5325,7 @@ def _arith_op(left, right): def _combine_match_index(self, other, func, level=None): left, right = self.align(other, join="outer", axis=0, level=level, copy=False) - assert left.index.equals(right.index) + # at this point we have `left.index.equals(right.index)` if left._is_mixed_type or right._is_mixed_type: # operate column-wise; avoid costly object-casting in `.values` @@ -5331,14 +5338,13 @@ def _combine_match_index(self, other, func, level=None): new_data, index=left.index, columns=self.columns, copy=False ) - def _combine_match_columns(self, other, func, level=None): - assert isinstance(other, Series) + def _combine_match_columns(self, other: Series, func, level=None): left, right = self.align(other, join="outer", axis=1, level=level, copy=False) - assert left.columns.equals(right.index) + # at this point we have `left.columns.equals(right.index)` return ops.dispatch_to_series(left, right, func, axis="columns") def _combine_const(self, other, func): - assert lib.is_scalar(other) or np.ndim(other) == 0 + # scalar other or np.ndim(other) == 0 return ops.dispatch_to_series(self, other, func) def combine(self, other, func, fill_value=None, overwrite=True): diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 7e03b9544ee727..86cd6e878cde60 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -169,7 +169,7 @@ def maybe_upcast_for_op(obj, shape: Tuple[int, ...]): # np.timedelta64(3, 'D') / 2 == np.timedelta64(1, 'D') return Timedelta(obj) - elif isinstance(obj, np.ndarray) and is_timedelta64_dtype(obj): + elif isinstance(obj, np.ndarray) and is_timedelta64_dtype(obj.dtype): # GH#22390 Unfortunately we need to special-case right-hand # timedelta64 dtypes because numpy casts integer dtypes to # timedelta64 when operating with timedelta64 @@ -415,7 +415,7 @@ def should_extension_dispatch(left: ABCSeries, right: Any) -> bool: ): return True - if is_extension_array_dtype(right) and not is_scalar(right): + if not is_scalar(right) and is_extension_array_dtype(right): # GH#22378 disallow scalar to exclude e.g. "category", "Int64" return True @@ -755,7 +755,7 @@ def na_op(x, y): assert not isinstance(y, (list, ABCSeries, ABCIndexClass)) if isinstance(y, np.ndarray): # bool-bool dtype operations should be OK, should not get here - assert not (is_bool_dtype(x) and is_bool_dtype(y)) + assert not (is_bool_dtype(x.dtype) and is_bool_dtype(y.dtype)) x = ensure_object(x) y = ensure_object(y) result = libops.vec_binop(x, y, op) @@ -804,7 +804,7 @@ def wrapper(self, other): else: # scalars, list, tuple, np.array - is_other_int_dtype = is_integer_dtype(np.asarray(other)) + is_other_int_dtype = is_integer_dtype(np.asarray(other).dtype) if is_list_like(other) and not isinstance(other, np.ndarray): # TODO: Can we do this before the is_integer_dtype check? 
# could the is_integer_dtype check be checking the wrong @@ -988,10 +988,10 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): self, other, pass_op, fill_value=fill_value, axis=axis, level=level ) else: + # in this case we always have `np.ndim(other) == 0` if fill_value is not None: self = self.fillna(fill_value) - assert np.ndim(other) == 0 return self._combine_const(other, op) f.__name__ = op_name @@ -1032,7 +1032,7 @@ def f(self, other, axis=default_axis, level=None): self, other, na_op, fill_value=None, axis=axis, level=level ) else: - assert np.ndim(other) == 0, other + # in this case we always have `np.ndim(other) == 0` return self._combine_const(other, na_op) f.__name__ = op_name diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 523ba5d42a69cf..f5f6d77676f1f3 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -11,7 +11,7 @@ find_common_type, maybe_upcast_putmask, ) -from pandas.core.dtypes.common import is_object_dtype, is_period_dtype, is_scalar +from pandas.core.dtypes.common import is_object_dtype, is_scalar from pandas.core.dtypes.generic import ABCIndex, ABCSeries from pandas.core.dtypes.missing import notna @@ -57,9 +57,9 @@ def masked_arith_op(x, y, op): dtype = find_common_type([x.dtype, y.dtype]) result = np.empty(x.size, dtype=dtype) - # PeriodIndex.ravel() returns int64 dtype, so we have - # to work around that case. See GH#19956 - yrav = y if is_period_dtype(y) else y.ravel() + # NB: ravel() is only safe since y is ndarray; for e.g. PeriodIndex + # we would get int64 dtype, see GH#19956 + yrav = y.ravel() mask = notna(xrav) & notna(yrav) if yrav.shape != mask.shape: @@ -82,9 +82,9 @@ def masked_arith_op(x, y, op): mask = notna(xrav) # 1 ** np.nan is 1. So we have to unmask those. - if op == pow: + if op is pow: mask = np.where(x == 1, False, mask) - elif op == rpow: + elif op is rpow: mask = np.where(y == 1, False, mask) if mask.any(): diff --git a/pandas/core/ops/missing.py b/pandas/core/ops/missing.py index 01bc345a40b83c..45fa6a2830af64 100644 --- a/pandas/core/ops/missing.py +++ b/pandas/core/ops/missing.py @@ -40,7 +40,7 @@ def fill_zeros(result, x, y, name, fill): Mask the nan's from x. 
""" - if fill is None or is_float_dtype(result): + if fill is None or is_float_dtype(result.dtype): return result if name.startswith(("r", "__r")): @@ -55,7 +55,7 @@ def fill_zeros(result, x, y, name, fill): if is_scalar_type: y = np.array(y) - if is_integer_dtype(y): + if is_integer_dtype(y.dtype): if (y == 0).any(): diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index f5add426297a73..8fe6850c84b8b1 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -569,13 +569,13 @@ def _combine_frame(self, other, func, fill_value=None, level=None): ).__finalize__(self) def _combine_match_index(self, other, func, level=None): - new_data = {} if level is not None: raise NotImplementedError("'level' argument is not supported") this, other = self.align(other, join="outer", axis=0, level=level, copy=False) + new_data = {} for col, series in this.items(): new_data[col] = func(series.values, other.values) From 49d2019723b0089bd357adf6c936c5a82e0cc775 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 26 Aug 2019 16:52:44 -0700 Subject: [PATCH 28/95] CLN: internals.blocks cleanup, typing (#27941) --- pandas/core/internals/blocks.py | 90 ++++++++++----------------------- 1 file changed, 28 insertions(+), 62 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index a2a51881016a38..33698d245e9ffc 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -7,7 +7,7 @@ import numpy as np -from pandas._libs import NaT, Timestamp, lib, tslib, tslibs +from pandas._libs import NaT, Timestamp, lib, tslib import pandas._libs.internals as libinternals from pandas._libs.tslibs import Timedelta, conversion from pandas._libs.tslibs.timezones import tz_compare @@ -407,7 +407,7 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): return self.copy() if self._can_hold_element(value): - # equivalent: self._try_coerce_args(value) would not raise + # equivalent: _try_coerce_args(value) would not raise blocks = self.putmask(mask, value, inplace=inplace) return self._maybe_downcast(blocks, downcast) @@ -669,7 +669,7 @@ def convert( return self.copy() if copy else self - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: """ require the same dtype as ourselves """ dtype = self.values.dtype.type tipo = maybe_infer_dtype_type(element) @@ -857,12 +857,6 @@ def setitem(self, indexer, value): if self._can_hold_element(value): value = self._try_coerce_args(value) - # can keep its own dtype - if hasattr(value, "dtype") and is_dtype_equal(values.dtype, value.dtype): - dtype = self.dtype - else: - dtype = "infer" - else: # current dtype cannot store value, coerce to common dtype find_dtype = False @@ -871,15 +865,9 @@ def setitem(self, indexer, value): dtype = value.dtype find_dtype = True - elif lib.is_scalar(value): - if isna(value): - # NaN promotion is handled in latter path - dtype = False - else: - dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True) - find_dtype = True - else: - dtype = "infer" + elif lib.is_scalar(value) and not isna(value): + dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True) + find_dtype = True if find_dtype: dtype = find_common_type([values.dtype, dtype]) @@ -1088,7 +1076,7 @@ def coerce_to_target_dtype(self, other): mytz = getattr(self.dtype, "tz", None) othertz = getattr(dtype, "tz", None) - if str(mytz) != str(othertz): + if not tz_compare(mytz, othertz): return self.astype(object) raise AssertionError( @@ -1308,7 
+1296,7 @@ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): else: return self.make_block_same_class(new_values, new_mgr_locs) - def diff(self, n, axis=1): + def diff(self, n: int, axis: int = 1) -> List["Block"]: """ return block for the diff of the values """ new_values = algos.diff(self.values, n, axis=axis) return [self.make_block(values=new_values)] @@ -1397,7 +1385,7 @@ def func(cond, values, other): if not ( (self.is_integer or self.is_bool) - and lib.is_scalar(other) + and lib.is_float(other) and np.isnan(other) ): # np.where will cast integer array to floats in this case @@ -1450,7 +1438,7 @@ def func(cond, values, other): return result_blocks - def equals(self, other): + def equals(self, other) -> bool: if self.dtype != other.dtype or self.shape != other.shape: return False return array_equivalent(self.values, other.values) @@ -1830,7 +1818,7 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): return self.make_block_same_class(new_values, new_mgr_locs) - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: # XXX: We may need to think about pushing this onto the array. # We're doing the same as CategoricalBlock here. return True @@ -2000,7 +1988,7 @@ class NumericBlock(Block): class FloatOrComplexBlock(NumericBlock): __slots__ = () - def equals(self, other): + def equals(self, other) -> bool: if self.dtype != other.dtype or self.shape != other.shape: return False left, right = self.values, other.values @@ -2011,7 +1999,7 @@ class FloatBlock(FloatOrComplexBlock): __slots__ = () is_float = True - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, (np.floating, np.integer)) and not issubclass( @@ -2075,7 +2063,7 @@ class ComplexBlock(FloatOrComplexBlock): __slots__ = () is_complex = True - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, (np.floating, np.integer, np.complexfloating)) @@ -2092,7 +2080,7 @@ class IntBlock(NumericBlock): is_integer = True _can_hold_na = False - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return ( @@ -2182,7 +2170,7 @@ def _astype(self, dtype, **kwargs): # delegate return super()._astype(dtype=dtype, **kwargs) - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: if self.is_datetimetz: @@ -2372,41 +2360,19 @@ def _slice(self, slicer): return self.values[slicer] def _try_coerce_args(self, other): - """ - localize and return i8 for the values - - Parameters - ---------- - other : ndarray-like or scalar - - Returns - ------- - base-type other - """ - if is_valid_nat_for_dtype(other, self.dtype): - other = np.datetime64("NaT", "ns") - elif isinstance(other, self._holder): - if not tz_compare(other.tz, self.values.tz): - raise ValueError("incompatible or non tz-aware value") - - elif isinstance(other, (np.datetime64, datetime, date)): - other = tslibs.Timestamp(other) - - # test we can have an equal time zone - if not tz_compare(other.tz, self.values.tz): - raise ValueError("incompatible or non tz-aware value") - else: - raise TypeError(other) - + # DatetimeArray handles this for us return other - def diff(self, n, 
axis=0): - """1st discrete difference + def diff(self, n: int, axis: int = 0) -> List["Block"]: + """ + 1st discrete difference. Parameters ---------- - n : int, number of periods to diff - axis : int, axis to diff upon. default 0 + n : int + Number of periods to diff. + axis : int, default 0 + Axis to diff upon. Returns ------- @@ -2468,7 +2434,7 @@ def setitem(self, indexer, value): ) return newb.setitem(indexer, value) - def equals(self, other): + def equals(self, other) -> bool: # override for significant performance improvement if self.dtype != other.dtype or self.shape != other.shape: return False @@ -2507,7 +2473,7 @@ def __init__(self, values, placement, ndim=None): def _holder(self): return TimedeltaArray - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, np.timedelta64) @@ -2600,7 +2566,7 @@ class BoolBlock(NumericBlock): is_bool = True _can_hold_na = False - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, np.bool_) @@ -2694,7 +2660,7 @@ def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"] # split and convert the blocks return _extend_blocks([b.convert(datetime=True, numeric=False) for b in blocks]) - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: return True def _try_coerce_args(self, other): From 041b6b180f8175b642977852f01e9211983b46ce Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Tue, 27 Aug 2019 16:09:41 +0200 Subject: [PATCH 29/95] Replace with nested dict raises for overlapping keys (#27696) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/generic.py | 6 +----- pandas/tests/frame/test_replace.py | 18 ++++++++++++------ 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 7fe358d3820f23..7a10447e3ad402 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -207,6 +207,7 @@ ExtensionArray Other ^^^^^ - Trying to set the ``display.precision``, ``display.max_rows`` or ``display.max_columns`` using :meth:`set_option` to anything but a ``None`` or a positive int will raise a ``ValueError`` (:issue:`23348`) +- Using :meth:`DataFrame.replace` with overlapping keys in a nested dictionary will no longer raise, now matching the behavior of a flat dictionary (:issue:`27660`) - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support dicts as ``compression`` argument with key ``'method'`` being the compression method and others as additional compression options when the compression method is ``'zip'``. 
(:issue:`26023`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fac5e0f085fc62..6ade69fb4ca9d9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6669,11 +6669,7 @@ def replace( for k, v in items: keys, values = list(zip(*v.items())) or ([], []) - if set(keys) & set(values): - raise ValueError( - "Replacement not allowed with " - "overlapping keys and values" - ) + to_rep_dict[k] = list(keys) value_dict[k] = list(values) diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py index 2862615ef8585a..b341ed6a52ca57 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/test_replace.py @@ -1069,18 +1069,24 @@ def test_replace_truthy(self): e = df assert_frame_equal(r, e) - def test_replace_int_to_int_chain(self): + def test_nested_dict_overlapping_keys_replace_int(self): + # GH 27660 keep behaviour consistent for simple dictionary and + # nested dictionary replacement df = DataFrame({"a": list(range(1, 5))}) - with pytest.raises(ValueError, match="Replacement not allowed .+"): - df.replace({"a": dict(zip(range(1, 5), range(2, 6)))}) - def test_replace_str_to_str_chain(self): + result = df.replace({"a": dict(zip(range(1, 5), range(2, 6)))}) + expected = df.replace(dict(zip(range(1, 5), range(2, 6)))) + assert_frame_equal(result, expected) + + def test_nested_dict_overlapping_keys_replace_str(self): + # GH 27660 a = np.arange(1, 5) astr = a.astype(str) bstr = np.arange(2, 6).astype(str) df = DataFrame({"a": astr}) - with pytest.raises(ValueError, match="Replacement not allowed .+"): - df.replace({"a": dict(zip(astr, bstr))}) + result = df.replace(dict(zip(astr, bstr))) + expected = df.replace({"a": dict(zip(astr, bstr))}) + assert_frame_equal(result, expected) def test_replace_swapping_bug(self): df = pd.DataFrame({"a": [True, False, True]}) From bd8dbf906e4352567094637c9c824c350dae3ad2 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 27 Aug 2019 22:32:40 +0100 Subject: [PATCH 30/95] TYPING: --check-untyped-defs util._decorators (#28128) --- pandas/core/groupby/generic.py | 30 ++++++------ pandas/core/indexes/interval.py | 4 +- pandas/core/window/ewm.py | 4 +- pandas/core/window/expanding.py | 4 +- pandas/core/window/rolling.py | 10 ++-- pandas/util/_decorators.py | 82 +++++++++++++++++++-------------- 6 files changed, 74 insertions(+), 60 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ea2bd22cccc3d0..7d6690a0dfa5ac 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -833,45 +833,45 @@ def apply(self, func, *args, **kwargs): axis="", ) @Appender(_shared_docs["aggregate"]) - def aggregate(self, func_or_funcs=None, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): _level = kwargs.pop("_level", None) - relabeling = func_or_funcs is None + relabeling = func is None columns = None - no_arg_message = "Must provide 'func_or_funcs' or named aggregation **kwargs." + no_arg_message = "Must provide 'func' or named aggregation **kwargs." 
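        # A brief illustration of the named-aggregation path handled in the
        # branch just below (documented pandas 0.25+ API): a call such as
        #
        #     df.groupby("A")["B"].agg(minimum="min", maximum="max")
        #
        # enters with ``func=None``; the branch then sets ``func`` to
        # ``["min", "max"]`` and ``columns`` to ``["minimum", "maximum"]``
        # before aggregating, so the keyword names become the output labels.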
if relabeling: columns = list(kwargs) if not PY36: # sort for 3.5 and earlier columns = list(sorted(columns)) - func_or_funcs = [kwargs[col] for col in columns] + func = [kwargs[col] for col in columns] kwargs = {} if not columns: raise TypeError(no_arg_message) - if isinstance(func_or_funcs, str): - return getattr(self, func_or_funcs)(*args, **kwargs) + if isinstance(func, str): + return getattr(self, func)(*args, **kwargs) - if isinstance(func_or_funcs, abc.Iterable): + if isinstance(func, abc.Iterable): # Catch instances of lists / tuples # but not the class list / tuple itself. - func_or_funcs = _maybe_mangle_lambdas(func_or_funcs) - ret = self._aggregate_multiple_funcs(func_or_funcs, (_level or 0) + 1) + func = _maybe_mangle_lambdas(func) + ret = self._aggregate_multiple_funcs(func, (_level or 0) + 1) if relabeling: ret.columns = columns else: - cyfunc = self._get_cython_func(func_or_funcs) + cyfunc = self._get_cython_func(func) if cyfunc and not args and not kwargs: return getattr(self, cyfunc)() if self.grouper.nkeys > 1: - return self._python_agg_general(func_or_funcs, *args, **kwargs) + return self._python_agg_general(func, *args, **kwargs) try: - return self._python_agg_general(func_or_funcs, *args, **kwargs) + return self._python_agg_general(func, *args, **kwargs) except Exception: - result = self._aggregate_named(func_or_funcs, *args, **kwargs) + result = self._aggregate_named(func, *args, **kwargs) index = Index(sorted(result), name=self.grouper.names[0]) ret = Series(result, index=index) @@ -1464,8 +1464,8 @@ class DataFrameGroupBy(NDFrameGroupBy): axis="", ) @Appender(_shared_docs["aggregate"]) - def aggregate(self, arg=None, *args, **kwargs): - return super().aggregate(arg, *args, **kwargs) + def aggregate(self, func=None, *args, **kwargs): + return super().aggregate(func, *args, **kwargs) agg = aggregate diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 3874c6404565c7..021ff5fb462767 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -788,7 +788,7 @@ def _find_non_overlapping_monotonic_bounds(self, key): return start, stop def get_loc( - self, key: Any, method: Optional[str] = None + self, key: Any, method: Optional[str] = None, tolerance=None ) -> Union[int, slice, np.ndarray]: """ Get integer location, slice or boolean mask for requested label. @@ -982,7 +982,7 @@ def get_indexer_for(self, target: AnyArrayLike, **kwargs) -> np.ndarray: List of indices. 
""" if self.is_overlapping: - return self.get_indexer_non_unique(target, **kwargs)[0] + return self.get_indexer_non_unique(target)[0] return self.get_indexer(target, **kwargs) @Appender(_index_shared_docs["get_value"] % _index_doc_kwargs) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 0ce6d5ddec2ad7..40e6c679ba72d8 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -206,8 +206,8 @@ def _constructor(self): axis="", ) @Appender(_shared_docs["aggregate"]) - def aggregate(self, arg, *args, **kwargs): - return super().aggregate(arg, *args, **kwargs) + def aggregate(self, func, *args, **kwargs): + return super().aggregate(func, *args, **kwargs) agg = aggregate diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index c43ca6b0565f36..47bd8f2ec593b5 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -136,8 +136,8 @@ def _get_window(self, other=None, **kwargs): axis="", ) @Appender(_shared_docs["aggregate"]) - def aggregate(self, arg, *args, **kwargs): - return super().aggregate(arg, *args, **kwargs) + def aggregate(self, func, *args, **kwargs): + return super().aggregate(func, *args, **kwargs) agg = aggregate diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 323089b3fdf6b4..a7e122fa3528ff 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -901,12 +901,12 @@ def func(arg, window, min_periods=None, closed=None): axis="", ) @Appender(_shared_docs["aggregate"]) - def aggregate(self, arg, *args, **kwargs): - result, how = self._aggregate(arg, *args, **kwargs) + def aggregate(self, func, *args, **kwargs): + result, how = self._aggregate(func, *args, **kwargs) if result is None: # these must apply directly - result = arg(self) + result = func(self) return result @@ -1788,8 +1788,8 @@ def _validate_freq(self): axis="", ) @Appender(_shared_docs["aggregate"]) - def aggregate(self, arg, *args, **kwargs): - return super().aggregate(arg, *args, **kwargs) + def aggregate(self, func, *args, **kwargs): + return super().aggregate(func, *args, **kwargs) agg = aggregate diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index 5c7d481ff2586e..8a25e511b5fc4f 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -1,21 +1,35 @@ from functools import wraps import inspect from textwrap import dedent -from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Tuple, + Type, + TypeVar, + Union, + cast, +) import warnings from pandas._libs.properties import cache_readonly # noqa +FuncType = Callable[..., Any] +F = TypeVar("F", bound=FuncType) + def deprecate( name: str, - alternative: Callable, + alternative: Callable[..., Any], version: str, alt_name: Optional[str] = None, klass: Optional[Type[Warning]] = None, stacklevel: int = 2, msg: Optional[str] = None, -) -> Callable: +) -> Callable[..., Any]: """ Return a new function that emits a deprecation warning on use. 
@@ -47,7 +61,7 @@ def deprecate( warning_msg = msg or "{} is deprecated, use {} instead".format(name, alt_name) @wraps(alternative) - def wrapper(*args, **kwargs): + def wrapper(*args, **kwargs) -> Callable[..., Any]: warnings.warn(warning_msg, klass, stacklevel=stacklevel) return alternative(*args, **kwargs) @@ -90,9 +104,9 @@ def wrapper(*args, **kwargs): def deprecate_kwarg( old_arg_name: str, new_arg_name: Optional[str], - mapping: Optional[Union[Dict, Callable[[Any], Any]]] = None, + mapping: Optional[Union[Dict[Any, Any], Callable[[Any], Any]]] = None, stacklevel: int = 2, -) -> Callable: +) -> Callable[..., Any]: """ Decorator to deprecate a keyword argument of a function. @@ -160,27 +174,27 @@ def deprecate_kwarg( "mapping from old to new argument values " "must be dict or callable!" ) - def _deprecate_kwarg(func): + def _deprecate_kwarg(func: F) -> F: @wraps(func) - def wrapper(*args, **kwargs): + def wrapper(*args, **kwargs) -> Callable[..., Any]: old_arg_value = kwargs.pop(old_arg_name, None) - if new_arg_name is None and old_arg_value is not None: - msg = ( - "the '{old_name}' keyword is deprecated and will be " - "removed in a future version. " - "Please take steps to stop the use of '{old_name}'" - ).format(old_name=old_arg_name) - warnings.warn(msg, FutureWarning, stacklevel=stacklevel) - kwargs[old_arg_name] = old_arg_value - return func(*args, **kwargs) - if old_arg_value is not None: - if mapping is not None: - if hasattr(mapping, "get"): - new_arg_value = mapping.get(old_arg_value, old_arg_value) - else: + if new_arg_name is None: + msg = ( + "the '{old_name}' keyword is deprecated and will be " + "removed in a future version. " + "Please take steps to stop the use of '{old_name}'" + ).format(old_name=old_arg_name) + warnings.warn(msg, FutureWarning, stacklevel=stacklevel) + kwargs[old_arg_name] = old_arg_value + return func(*args, **kwargs) + + elif mapping is not None: + if callable(mapping): new_arg_value = mapping(old_arg_value) + else: + new_arg_value = mapping.get(old_arg_value, old_arg_value) msg = ( "the {old_name}={old_val!r} keyword is deprecated, " "use {new_name}={new_val!r} instead" @@ -198,7 +212,7 @@ def wrapper(*args, **kwargs): ).format(old_name=old_arg_name, new_name=new_arg_name) warnings.warn(msg, FutureWarning, stacklevel=stacklevel) - if kwargs.get(new_arg_name, None) is not None: + if kwargs.get(new_arg_name) is not None: msg = ( "Can only specify '{old_name}' or '{new_name}', " "not both" ).format(old_name=old_arg_name, new_name=new_arg_name) @@ -207,17 +221,17 @@ def wrapper(*args, **kwargs): kwargs[new_arg_name] = new_arg_value return func(*args, **kwargs) - return wrapper + return cast(F, wrapper) return _deprecate_kwarg def rewrite_axis_style_signature( name: str, extra_params: List[Tuple[str, Any]] -) -> Callable: - def decorate(func): +) -> Callable[..., Any]: + def decorate(func: F) -> F: @wraps(func) - def wrapper(*args, **kwargs): + def wrapper(*args, **kwargs) -> Callable[..., Any]: return func(*args, **kwargs) kind = inspect.Parameter.POSITIONAL_OR_KEYWORD @@ -234,8 +248,9 @@ def wrapper(*args, **kwargs): sig = inspect.Signature(params) - func.__signature__ = sig - return wrapper + # https://github.com/python/typing/issues/598 + func.__signature__ = sig # type: ignore + return cast(F, wrapper) return decorate @@ -279,18 +294,17 @@ def __init__(self, *args, **kwargs): self.params = args or kwargs - def __call__(self, func: Callable) -> Callable: + def __call__(self, func: F) -> F: func.__doc__ = func.__doc__ and func.__doc__ % 
self.params return func def update(self, *args, **kwargs) -> None: """ Update self.params with supplied args. - - If called, we assume self.params is a dict. """ - self.params.update(*args, **kwargs) + if isinstance(self.params, dict): + self.params.update(*args, **kwargs) class Appender: @@ -320,7 +334,7 @@ def __init__(self, addendum: Optional[str], join: str = "", indents: int = 0): self.addendum = addendum self.join = join - def __call__(self, func: Callable) -> Callable: + def __call__(self, func: F) -> F: func.__doc__ = func.__doc__ if func.__doc__ else "" self.addendum = self.addendum if self.addendum else "" docitems = [func.__doc__, self.addendum] From 080d57ee9fef9275518908cb7665ea062684c29b Mon Sep 17 00:00:00 2001 From: Addison Lynch Date: Tue, 27 Aug 2019 17:39:03 -0400 Subject: [PATCH 31/95] CLN: Use ABC classes for isinstance checks, remove unnecessary imports (#28158) * CLN: Use ABC classes for isinstance checks, remove unnecessary imports * Formatting repairs --- pandas/core/frame.py | 27 +++++++++++---------------- pandas/core/indexing.py | 40 +++++++++++++++++++++------------------- 2 files changed, 32 insertions(+), 35 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f636bb6db74309..3d1a39a86c784e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -86,12 +86,7 @@ from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.generic import NDFrame, _shared_docs -from pandas.core.index import ( - Index, - MultiIndex, - ensure_index, - ensure_index_from_sequences, -) +from pandas.core.index import Index, ensure_index, ensure_index_from_sequences from pandas.core.indexes import base as ibase from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.multi import maybe_droplevels @@ -1734,7 +1729,7 @@ def to_records( if is_datetime64_any_dtype(self.index) and convert_datetime64: ix_vals = [self.index.to_pydatetime()] else: - if isinstance(self.index, MultiIndex): + if isinstance(self.index, ABCMultiIndex): # array of tuples to numpy cols. copy copy copy ix_vals = list(map(np.array, zip(*self.index.values))) else: @@ -1745,7 +1740,7 @@ def to_records( count = 0 index_names = list(self.index.names) - if isinstance(self.index, MultiIndex): + if isinstance(self.index, ABCMultiIndex): for i, n in enumerate(index_names): if n is None: index_names[i] = "level_%d" % count @@ -2868,7 +2863,7 @@ def __getitem__(self, key): # The behavior is inconsistent. 
It returns a Series, except when # - the key itself is repeated (test on data.shape, #9519), or # - we have a MultiIndex on columns (test on self.columns, #21309) - if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex): + if data.shape[1] == 1 and not isinstance(self.columns, ABCMultiIndex): data = data[key] return data @@ -3657,7 +3652,7 @@ def reindexer(value): elif isinstance(value, DataFrame): # align right-hand-side columns if self.columns # is multi-index and self[key] is a sub-frame - if isinstance(self.columns, MultiIndex) and key in self.columns: + if isinstance(self.columns, ABCMultiIndex) and key in self.columns: loc = self.columns.get_loc(key) if isinstance(loc, (slice, Series, np.ndarray, Index)): cols = maybe_droplevels(self.columns[loc], key) @@ -3706,7 +3701,7 @@ def reindexer(value): # broadcast across multiple columns if necessary if broadcast and key in self.columns and value.ndim == 1: - if not self.columns.is_unique or isinstance(self.columns, MultiIndex): + if not self.columns.is_unique or isinstance(self.columns, ABCMultiIndex): existing_piece = self[key] if isinstance(existing_piece, DataFrame): value = np.tile(value, (len(existing_piece.columns), 1)) @@ -4601,7 +4596,7 @@ def _maybe_casted_values(index, labels=None): new_index = self.index.droplevel(level) if not drop: - if isinstance(self.index, MultiIndex): + if isinstance(self.index, ABCMultiIndex): names = [ n if n is not None else ("level_%d" % i) for (i, n) in enumerate(self.index.names) @@ -4612,7 +4607,7 @@ def _maybe_casted_values(index, labels=None): names = [default] if self.index.name is None else [self.index.name] to_insert = ((self.index, None),) - multi_col = isinstance(self.columns, MultiIndex) + multi_col = isinstance(self.columns, ABCMultiIndex) for i, (lev, lab) in reversed(list(enumerate(to_insert))): if not (level is None or i in level): continue @@ -4994,7 +4989,7 @@ def sort_index( level, ascending=ascending, sort_remaining=sort_remaining ) - elif isinstance(labels, MultiIndex): + elif isinstance(labels, ABCMultiIndex): from pandas.core.sorting import lexsort_indexer indexer = lexsort_indexer( @@ -5280,7 +5275,7 @@ def reorder_levels(self, order, axis=0): type of caller (new object) """ axis = self._get_axis_number(axis) - if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover + if not isinstance(self._get_axis(axis), ABCMultiIndex): # pragma: no cover raise TypeError("Can only reorder levels on a hierarchical axis.") result = self.copy() @@ -7784,7 +7779,7 @@ def _count_level(self, level, axis=0, numeric_only=False): count_axis = frame._get_axis(axis) agg_axis = frame._get_agg_axis(axis) - if not isinstance(count_axis, MultiIndex): + if not isinstance(count_axis, ABCMultiIndex): raise TypeError( "Can only count levels on hierarchical " "{ax}.".format(ax=self._get_axis_name(axis)) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index b8ca3419af4d7e..3d495eeb8c885b 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -22,11 +22,11 @@ is_sparse, ) from pandas.core.dtypes.concat import concat_compat -from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.dtypes.generic import ABCDataFrame, ABCMultiIndex, ABCSeries from pandas.core.dtypes.missing import _infer_fill_value, isna import pandas.core.common as com -from pandas.core.index import Index, InvalidIndexError, MultiIndex +from pandas.core.index import Index, InvalidIndexError from pandas.core.indexers import is_list_like_indexer, length_of_indexer @@ 
-172,7 +172,7 @@ def _get_setitem_indexer(self, key): ax = self.obj._get_axis(0) - if isinstance(ax, MultiIndex) and self.name != "iloc": + if isinstance(ax, ABCMultiIndex) and self.name != "iloc": try: return ax.get_loc(key) except Exception: @@ -241,7 +241,7 @@ def _has_valid_tuple(self, key: Tuple): ) def _is_nested_tuple_indexer(self, tup: Tuple): - if any(isinstance(ax, MultiIndex) for ax in self.obj.axes): + if any(isinstance(ax, ABCMultiIndex) for ax in self.obj.axes): return any(is_nested_tuple(tup, ax) for ax in self.obj.axes) return False @@ -329,7 +329,7 @@ def _setitem_with_indexer(self, indexer, value): # GH 10360, GH 27841 if isinstance(indexer, tuple) and len(indexer) == len(self.obj.axes): for i, ax in zip(indexer, self.obj.axes): - if isinstance(ax, MultiIndex) and not ( + if isinstance(ax, ABCMultiIndex) and not ( is_integer(i) or com.is_null_slice(i) ): take_split_path = True @@ -422,7 +422,9 @@ def _setitem_with_indexer(self, indexer, value): # if we have a partial multiindex, then need to adjust the plane # indexer here - if len(labels) == 1 and isinstance(self.obj[labels[0]].axes[0], MultiIndex): + if len(labels) == 1 and isinstance( + self.obj[labels[0]].axes[0], ABCMultiIndex + ): item = labels[0] obj = self.obj[item] index = obj.index @@ -495,7 +497,7 @@ def setter(item, v): # we have an equal len Frame if isinstance(value, ABCDataFrame): sub_indexer = list(indexer) - multiindex_indexer = isinstance(labels, MultiIndex) + multiindex_indexer = isinstance(labels, ABCMultiIndex) for item in labels: if item in value: @@ -777,8 +779,8 @@ def _align_frame(self, indexer, df: ABCDataFrame): # we have a multi-index and are trying to align # with a particular, level GH3738 if ( - isinstance(ax, MultiIndex) - and isinstance(df.index, MultiIndex) + isinstance(ax, ABCMultiIndex) + and isinstance(df.index, ABCMultiIndex) and ax.nlevels != df.index.nlevels ): raise TypeError( @@ -904,7 +906,7 @@ def _getitem_lowerdim(self, tup: Tuple): ax0 = self.obj._get_axis(0) # ...but iloc should handle the tuple as simple integer-location # instead of checking it as multiindex representation (GH 13797) - if isinstance(ax0, MultiIndex) and self.name != "iloc": + if isinstance(ax0, ABCMultiIndex) and self.name != "iloc": result = self._handle_lowerdim_multi_index_axis0(tup) if result is not None: return result @@ -1004,7 +1006,7 @@ def _getitem_axis(self, key, axis: int): if isinstance(key, slice): return self._get_slice_axis(key, axis=axis) elif is_list_like_indexer(key) and not ( - isinstance(key, tuple) and isinstance(labels, MultiIndex) + isinstance(key, tuple) and isinstance(labels, ABCMultiIndex) ): if hasattr(key, "ndim") and key.ndim > 1: @@ -1017,7 +1019,7 @@ def _getitem_axis(self, key, axis: int): key = labels._maybe_cast_indexer(key) if is_integer(key): - if axis == 0 and isinstance(labels, MultiIndex): + if axis == 0 and isinstance(labels, ABCMultiIndex): try: return self._get_label(key, axis=axis) except (KeyError, TypeError): @@ -1228,7 +1230,7 @@ def _convert_to_indexer(self, obj, axis: int, raise_missing: bool = False): try: return labels.get_loc(obj) except LookupError: - if isinstance(obj, tuple) and isinstance(labels, MultiIndex): + if isinstance(obj, tuple) and isinstance(labels, ABCMultiIndex): if len(obj) == labels.nlevels: return {"key": obj} raise @@ -1248,7 +1250,7 @@ def _convert_to_indexer(self, obj, axis: int, raise_missing: bool = False): # always valid return {"key": obj} - if obj >= self.obj.shape[axis] and not isinstance(labels, MultiIndex): + if obj >= 
self.obj.shape[axis] and not isinstance(labels, ABCMultiIndex): # a positional raise ValueError("cannot set by positional indexing with enlargement") @@ -1715,7 +1717,7 @@ def _is_scalar_access(self, key: Tuple): return False ax = self.obj.axes[i] - if isinstance(ax, MultiIndex): + if isinstance(ax, ABCMultiIndex): return False if isinstance(k, str) and ax._supports_partial_string_indexing: @@ -1737,7 +1739,7 @@ def _getitem_scalar(self, key): def _get_partial_string_timestamp_match_key(self, key, labels): """Translate any partial string timestamp matches in key, returning the new key (GH 10331)""" - if isinstance(labels, MultiIndex): + if isinstance(labels, ABCMultiIndex): if ( isinstance(key, str) and labels.levels[0]._supports_partial_string_indexing @@ -1781,7 +1783,7 @@ def _getitem_axis(self, key, axis: int): # to a list of keys # we will use the *values* of the object # and NOT the index if its a PandasObject - if isinstance(labels, MultiIndex): + if isinstance(labels, ABCMultiIndex): if isinstance(key, (ABCSeries, np.ndarray)) and key.ndim <= 1: # Series, or 0,1 ndim ndarray @@ -1809,7 +1811,7 @@ def _getitem_axis(self, key, axis: int): key = tuple([key]) # an iterable multi-selection - if not (isinstance(key, tuple) and isinstance(labels, MultiIndex)): + if not (isinstance(key, tuple) and isinstance(labels, ABCMultiIndex)): if hasattr(key, "ndim") and key.ndim > 1: raise ValueError("Cannot index with multidimensional key") @@ -2474,7 +2476,7 @@ def is_nested_tuple(tup, labels): for i, k in enumerate(tup): if is_list_like(k) or isinstance(k, slice): - return isinstance(labels, MultiIndex) + return isinstance(labels, ABCMultiIndex) return False From d91ffa6407c1baf6afe7d0a1b9655f44da77ac24 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 27 Aug 2019 22:50:22 +0100 Subject: [PATCH 32/95] TYPING: change to FrameOrSeries Alias in pandas._typing (#28173) --- pandas/_typing.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 837a7a89e0b839..37a5d7945955de 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -11,9 +11,9 @@ from pandas.core.arrays.base import ExtensionArray # noqa: F401 from pandas.core.dtypes.dtypes import ExtensionDtype # noqa: F401 from pandas.core.indexes.base import Index # noqa: F401 - from pandas.core.frame import DataFrame # noqa: F401 from pandas.core.series import Series # noqa: F401 from pandas.core.sparse.series import SparseSeries # noqa: F401 + from pandas.core.generic import NDFrame # noqa: F401 AnyArrayLike = TypeVar( @@ -24,7 +24,10 @@ Dtype = Union[str, np.dtype, "ExtensionDtype"] FilePathOrBuffer = Union[str, Path, IO[AnyStr]] -FrameOrSeries = TypeVar("FrameOrSeries", "Series", "DataFrame") +FrameOrSeries = TypeVar("FrameOrSeries", bound="NDFrame") Scalar = Union[str, int, float] Axis = Union[str, int] Ordered = Optional[bool] + +# to maintain type information across generic functions and parametrization +_T = TypeVar("_T") From 612d3b23da5b99f6c5642be574fb08713a45d7d1 Mon Sep 17 00:00:00 2001 From: killerontherun1 Date: Thu, 29 Aug 2019 02:04:56 +0530 Subject: [PATCH 33/95] Solving GL01,GL02 in pandas.Interval and a few mentioned in the comments (#28197) --- pandas/core/indexes/interval.py | 3 ++- pandas/io/sql.py | 3 ++- pandas/io/stata.py | 2 +- pandas/plotting/_misc.py | 6 ++++-- pandas/util/testing.py | 6 ++++-- 5 files changed, 13 insertions(+), 7 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 
021ff5fb462767..6b0081c6a2ff51 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -331,7 +331,8 @@ def __contains__(self, key): >>> idx.to_tuples() Index([(0.0, 1.0), (nan, nan), (2.0, 3.0)], dtype='object') >>> idx.to_tuples(na_tuple=False) - Index([(0.0, 1.0), nan, (2.0, 3.0)], dtype='object')""", + Index([(0.0, 1.0), nan, (2.0, 3.0)], dtype='object') + """, ) ) def to_tuples(self, na_tuple=True): diff --git a/pandas/io/sql.py b/pandas/io/sql.py index f1f52a9198d29d..72df00fd4c5a19 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -269,7 +269,8 @@ def read_sql_query( parse_dates=None, chunksize=None, ): - """Read SQL query into a DataFrame. + """ + Read SQL query into a DataFrame. Returns a DataFrame corresponding to the result set of the query string. Optionally provide an `index_col` parameter to use one of the diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 69bafc77492587..31fdaa5cc67359 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -138,7 +138,7 @@ _iterator_params, ) -_data_method_doc = """\ +_data_method_doc = """ Read observations from Stata file, converting them into a dataframe .. deprecated:: diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 1cba0e73541826..7ed0ffc6d0115e 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -329,7 +329,8 @@ def parallel_coordinates( sort_labels=False, **kwds ): - """Parallel coordinates plotting. + """ + Parallel coordinates plotting. Parameters ---------- @@ -392,7 +393,8 @@ def parallel_coordinates( def lag_plot(series, lag=1, ax=None, **kwds): - """Lag plot for time series. + """ + Lag plot for time series. Parameters ---------- diff --git a/pandas/util/testing.py b/pandas/util/testing.py index a8f0d0da52e1f4..0d543f891a5f63 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -580,7 +580,8 @@ def assert_index_equal( check_categorical: bool = True, obj: str = "Index", ) -> None: - """Check that left and right Index are equal. + """ + Check that left and right Index are equal. Parameters ---------- @@ -1081,7 +1082,8 @@ def assert_series_equal( check_categorical=True, obj="Series", ): - """Check that left and right Series are equal. + """ + Check that left and right Series are equal. 
Parameters ---------- From bc65fe6c12dc78679ba8584eee83c6e3e243b5b9 Mon Sep 17 00:00:00 2001 From: "Roei.r" Date: Thu, 29 Aug 2019 02:01:46 +0300 Subject: [PATCH 34/95] Fix slicer assignment bug (#28131) --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/indexers.py | 1 + pandas/tests/indexing/test_loc.py | 10 ++++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 7a10447e3ad402..050a26cc86d429 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -141,7 +141,7 @@ Interval Indexing ^^^^^^^^ -- +- Bug in assignment using a reverse slicer (:issue:`26939`) - Missing diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 70c48e969172f5..433bca940c0285 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -226,6 +226,7 @@ def length_of_indexer(indexer, target=None) -> int: if step is None: step = 1 elif step < 0: + start, stop = stop + 1, start + 1 step = -step return (stop - start + step - 1) // step elif isinstance(indexer, (ABCSeries, ABCIndexClass, np.ndarray, list)): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index abe0cd86c90d7d..9845b1ac3a4b9a 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1070,6 +1070,16 @@ def test_series_indexing_zerodim_np_array(self): result = s.loc[np.array(0)] assert result == 1 + def test_loc_reverse_assignment(self): + # GH26939 + data = [1, 2, 3, 4, 5, 6] + [None] * 4 + expected = Series(data, index=range(2010, 2020)) + + result = pd.Series(index=range(2010, 2020)) + result.loc[2015:2010:-1] = [6, 5, 4, 3, 2, 1] + + tm.assert_series_equal(result, expected) + def test_series_loc_getitem_label_list_missing_values(): # gh-11428 From 2518040894ef00d9ce427539937a86b2328a9e50 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 29 Aug 2019 05:17:03 -0700 Subject: [PATCH 35/95] STY: whitespace before class docstringsd (#28209) --- pandas/core/base.py | 1 - pandas/core/computation/expr.py | 8 ++++---- pandas/core/computation/pytables.py | 2 -- pandas/core/groupby/groupby.py | 1 - pandas/core/groupby/grouper.py | 1 - pandas/core/groupby/ops.py | 1 - pandas/core/indexes/frozen.py | 1 - pandas/core/sorting.py | 1 - pandas/io/common.py | 1 - pandas/io/packers.py | 1 - pandas/io/pytables.py | 14 -------------- pandas/tests/io/test_sql.py | 1 - pandas/tests/reshape/test_concat.py | 1 - 13 files changed, 4 insertions(+), 30 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 767b5594450385..2d5ffb5e913923 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -47,7 +47,6 @@ class PandasObject(DirNamesMixin): - """baseclass for various pandas objects""" @property diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 4c164968575a16..45319a4d63d948 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -367,8 +367,8 @@ def f(cls): @disallow(_unsupported_nodes) @add_ops(_op_classes) class BaseExprVisitor(ast.NodeVisitor): - - """Custom ast walker. Parsers of other engines should subclass this class + """ + Custom ast walker. Parsers of other engines should subclass this class if necessary. Parameters @@ -803,8 +803,8 @@ def __init__(self, env, engine, parser, preparser=lambda x: x): class Expr: - - """Object encapsulating an expression. + """ + Object encapsulating an expression. 
Parameters ---------- diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 1523eb05ac41dd..81658ab23ba466 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -478,7 +478,6 @@ def _validate_where(w): class Expr(expr.Expr): - """ hold a pytables like expression, comprised of possibly multiple 'terms' Parameters @@ -573,7 +572,6 @@ def evaluate(self): class TermValue: - """ hold a term value the we use to construct a condition/filter """ def __init__(self, value, converted, kind): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 87047d21709927..4d21b5810470a7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1011,7 +1011,6 @@ def _apply_filter(self, indices, dropna): class GroupBy(_GroupBy): - """ Class for grouping and aggregating relational data. diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 3415c0e056a1ce..31623171e9e631 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -217,7 +217,6 @@ def __repr__(self): class Grouping: - """ Holds the grouping information for a single key diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b0c629f017dd34..5ad48fa675dd92 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -706,7 +706,6 @@ def _aggregate_series_pure_python(self, obj, func): class BinGrouper(BaseGrouper): - """ This is an internal Grouper class diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py index 2e5b3ff8ef502d..329456e25bdedc 100644 --- a/pandas/core/indexes/frozen.py +++ b/pandas/core/indexes/frozen.py @@ -22,7 +22,6 @@ class FrozenList(PandasObject, list): - """ Container that doesn't allow setting item *but* because it's technically non-hashable, will be used diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 5db31fe6664eaf..e6edad656d430e 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -271,7 +271,6 @@ def nargsort(items, kind="quicksort", ascending=True, na_position="last"): class _KeyMapper: - """ Ease my suffering. Map compressed group id -> key tuple """ diff --git a/pandas/io/common.py b/pandas/io/common.py index 290022167e5205..30228d660e8167 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -576,7 +576,6 @@ def __next__(self) -> str: class UTF8Recoder(BaseIterator): - """ Iterator that reads an encoded stream and re-encodes the input to UTF-8 """ diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 04e49708ff082b..ad47ba23b9221d 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -846,7 +846,6 @@ def __init__( class Iterator: - """ manage the unpacking iteration, close the file on completion """ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 576c45a2f8097e..fbe413f820c901 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -429,7 +429,6 @@ def _is_metadata_of(group, parent_group): class HDFStore: - """ Dict-like IO interface for storing pandas objects in PyTables. 
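[Every hunk in this style patch is the same mechanical edit, so one before/after sketch covers them all; the class name below is just a stand-in, not code from the patch:

    # before: blank line between the class statement and its docstring
    class SomeTable:

        """ represent a table """

    # after: the docstring opens on the first line of the class body
    class SomeTable:
        """ represent a table """
]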
@@ -1546,7 +1545,6 @@ def _read_group(self, group, **kwargs): class TableIterator: - """ define the iteration interface on a table Parameters @@ -1654,7 +1652,6 @@ def get_result(self, coordinates=False): class IndexCol: - """ an index column description class Parameters @@ -1968,7 +1965,6 @@ def write_metadata(self, handler): class GenericIndexCol(IndexCol): - """ an index which is not represented in the data of the table """ @property @@ -2006,7 +2002,6 @@ def set_attr(self): class DataCol(IndexCol): - """ a data holding column, by definition this is not indexable Parameters @@ -2456,7 +2451,6 @@ def set_attr(self): class DataIndexableCol(DataCol): - """ represent a data column that can be indexed """ is_data_indexable = True @@ -2479,7 +2473,6 @@ def get_atom_timedelta64(self, block): class GenericDataIndexableCol(DataIndexableCol): - """ represent a generic pytables data column """ def get_attr(self): @@ -2487,7 +2480,6 @@ def get_attr(self): class Fixed: - """ represent an object in my store facilitate read/write of various types of objects this is an abstract base class @@ -2655,7 +2647,6 @@ def delete(self, where=None, start=None, stop=None, **kwargs): class GenericFixed(Fixed): - """ a generified fixed version """ _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"} @@ -3252,7 +3243,6 @@ class FrameFixed(BlockManagerFixed): class Table(Fixed): - """ represent a table: facilitate read/write of various types of tables @@ -4127,7 +4117,6 @@ def read_column(self, column, where=None, start=None, stop=None): class WORMTable(Table): - """ a write-once read-many table: this format DOES NOT ALLOW appending to a table. writing is a one-time operation the data are stored in a format that allows for searching the data on disk @@ -4149,7 +4138,6 @@ def write(self, **kwargs): class LegacyTable(Table): - """ an appendable table: allow append/query/delete operations to a (possibly) already existing appendable table this table ALLOWS append (but doesn't require them), and stores the data in a format @@ -4603,7 +4591,6 @@ def write(self, **kwargs): class AppendableMultiFrameTable(AppendableFrameTable): - """ a frame with a multi-index """ table_type = "appendable_multiframe" @@ -4962,7 +4949,6 @@ def _need_convert(kind): class Selection: - """ Carries out a selection operation on a tables.Table object. diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index d8465a427eaea5..25727447b4c6fb 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -565,7 +565,6 @@ def _transaction_test(self): class _TestSQLApi(PandasSQLTest): - """ Base class to test the public API. diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 6366bf0521fbc2..13f0f14014a314 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -50,7 +50,6 @@ def sort_with_none(request): class TestConcatAppendCommon: - """ Test common dtype coercion rules between concat and append. 
""" From 5f34933848d7daa129651a53158cb94367bacbcd Mon Sep 17 00:00:00 2001 From: DavidRosen Date: Thu, 29 Aug 2019 08:31:31 -0400 Subject: [PATCH 36/95] DOC: Example for adding a calculated column in SQL and Pandas (#28182) * Add example for adding a calculated column --- .../comparison/comparison_with_sql.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst index 366fdd546f58b5..6a03c06de3699a 100644 --- a/doc/source/getting_started/comparison/comparison_with_sql.rst +++ b/doc/source/getting_started/comparison/comparison_with_sql.rst @@ -49,6 +49,20 @@ With pandas, column selection is done by passing a list of column names to your Calling the DataFrame without the list of column names would display all columns (akin to SQL's ``*``). +In SQL, you can add a calculated column: + +.. code-block:: sql + + SELECT *, tip/total_bill as tip_rate + FROM tips + LIMIT 5; + +With pandas, you can use the :meth:`DataFrame.assign` method of a DataFrame to append a new column: + +.. ipython:: python + + tips.assign(tip_rate=tips['tip'] / tips['total_bill']).head(5) + WHERE ----- Filtering in SQL is done via a WHERE clause. From 03b3c8fc82b3a18a3ddcad1b3b26d601467fc74c Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 29 Aug 2019 20:28:54 +0100 Subject: [PATCH 37/95] CLN: minor typos MutliIndex -> MultiIndex (#28223) --- doc/source/whatsnew/v0.20.0.rst | 2 +- pandas/tests/frame/test_reshape.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index ef6108ae3ec909..62604dd3edd2dd 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -495,7 +495,7 @@ Other enhancements - :func:`pandas.util.hash_pandas_object` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`) - ``Series/DataFrame.squeeze()`` have gained the ``axis`` parameter. (:issue:`15339`) - ``DataFrame.to_excel()`` has a new ``freeze_panes`` parameter to turn on Freeze Panes when exporting to Excel (:issue:`15160`) -- ``pd.read_html()`` will parse multiple header rows, creating a MutliIndex header. (:issue:`13434`). +- ``pd.read_html()`` will parse multiple header rows, creating a MultiIndex header. (:issue:`13434`). - HTML table output skips ``colspan`` or ``rowspan`` attribute if equal to 1. (:issue:`15403`) - :class:`pandas.io.formats.style.Styler` template now has blocks for easier extension, see the :ref:`example notebook ` (:issue:`15649`) - :meth:`Styler.render() ` now accepts ``**kwargs`` to allow user-defined variables in the template (:issue:`15649`) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index f3452e9a85fb3e..84e343f07f990d 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -984,7 +984,7 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels): df = DataFrame([[10, 11, 12]], columns=cidx) result = df.stack() - # `MutliIndex.from_product` preserves categorical dtype - + # `MultiIndex.from_product` preserves categorical dtype - # it's tested elsewhere. 
midx = pd.MultiIndex.from_product([df.index, cidx]) expected = Series([10, 11, 12], index=midx) From d9b3993cc3722ddd01367089d374652c0b5ce0ce Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 30 Aug 2019 07:28:37 -0700 Subject: [PATCH 38/95] reduction-> libreduction for grepability (#28184) --- pandas/core/apply.py | 6 +++--- pandas/core/groupby/ops.py | 10 +++++----- pandas/tests/groupby/test_bin_groupby.py | 16 ++++++++-------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 5c8599dbb054b6..b96b3c75720315 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -3,7 +3,7 @@ import numpy as np -from pandas._libs import reduction +from pandas._libs import reduction as libreduction from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( @@ -221,7 +221,7 @@ def apply_raw(self): """ apply to the values as a numpy array """ try: - result = reduction.compute_reduction(self.values, self.f, axis=self.axis) + result = libreduction.compute_reduction(self.values, self.f, axis=self.axis) except Exception: result = np.apply_along_axis(self.f, self.axis, self.values) @@ -281,7 +281,7 @@ def apply_standard(self): dummy = Series(empty_arr, index=index, dtype=values.dtype) try: - result = reduction.compute_reduction( + result = libreduction.compute_reduction( values, self.f, axis=self.axis, dummy=dummy, labels=labels ) return self.obj._constructor_sliced(result, index=labels) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 5ad48fa675dd92..7afb0a28f943ee 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -12,7 +12,7 @@ from pandas._libs import NaT, iNaT, lib import pandas._libs.groupby as libgroupby -import pandas._libs.reduction as reduction +import pandas._libs.reduction as libreduction from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly @@ -207,7 +207,7 @@ def apply(self, f, data, axis=0): if len(result_values) == len(group_keys): return group_keys, result_values, mutated - except reduction.InvalidApply: + except libreduction.InvalidApply: # Cannot fast apply on MultiIndex (_has_complex_internals). # This Exception is also raised if `f` triggers an exception # but it is preferable to raise the exception in Python. 
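[The call sites renamed in this patch share one shape: try the compiled reducer first, and fall back to a pure-Python loop when it raises InvalidApply, so that any exception from the user function surfaces with an ordinary traceback. A self-contained sketch of that shape, with hypothetical stand-ins for the Cython pieces:

    class InvalidApply(Exception):
        # stand-in for pandas._libs.reduction.InvalidApply
        pass

    def fast_apply(chunks, func):
        # stand-in for libreduction.apply_frame_axis0: pretend the fast
        # path refuses anything it considers too complex to handle
        if any(isinstance(chunk, dict) for chunk in chunks):
            raise InvalidApply
        return [func(chunk) for chunk in chunks]

    def apply_groups(chunks, func):
        try:
            return fast_apply(chunks, func)
        except InvalidApply:
            # fall back so errors raised by func itself come from Python
            return [func(chunk) for chunk in chunks]

    print(apply_groups([[1, 2], [3]], len))  # fast path taken -> [2, 1]
]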
@@ -678,7 +678,7 @@ def _aggregate_series_fast(self, obj, func): indexer = get_group_index_sorter(group_index, ngroups) obj = obj.take(indexer) group_index = algorithms.take_nd(group_index, indexer, allow_fill=False) - grouper = reduction.SeriesGrouper(obj, func, group_index, ngroups, dummy) + grouper = libreduction.SeriesGrouper(obj, func, group_index, ngroups, dummy) result, counts = grouper.get_result() return result, counts @@ -851,7 +851,7 @@ def groupings(self): def agg_series(self, obj, func): dummy = obj[:0] - grouper = reduction.SeriesBinGrouper(obj, func, self.bins, dummy) + grouper = libreduction.SeriesBinGrouper(obj, func, self.bins, dummy) return grouper.get_result() @@ -939,7 +939,7 @@ def fast_apply(self, f, names): return [], True sdata = self._get_sorted_data() - return reduction.apply_frame_axis0(sdata, f, names, starts, ends) + return libreduction.apply_frame_axis0(sdata, f, names, starts, ends) def _chop(self, sdata, slice_obj): if self.axis == 0: diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index 2195686ee9c7f6..b8f9ecd42bae3c 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -2,7 +2,7 @@ from numpy import nan import pytest -from pandas._libs import groupby, lib, reduction +from pandas._libs import groupby, lib, reduction as libreduction from pandas.core.dtypes.common import ensure_int64 @@ -18,7 +18,7 @@ def test_series_grouper(): labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64) - grouper = reduction.SeriesGrouper(obj, np.mean, labels, 2, dummy) + grouper = libreduction.SeriesGrouper(obj, np.mean, labels, 2, dummy) result, counts = grouper.get_result() expected = np.array([obj[3:6].mean(), obj[6:].mean()]) @@ -34,7 +34,7 @@ def test_series_bin_grouper(): bins = np.array([3, 6]) - grouper = reduction.SeriesBinGrouper(obj, np.mean, bins, dummy) + grouper = libreduction.SeriesBinGrouper(obj, np.mean, bins, dummy) result, counts = grouper.get_result() expected = np.array([obj[:3].mean(), obj[3:6].mean(), obj[6:].mean()]) @@ -120,31 +120,31 @@ class TestMoments: class TestReducer: def test_int_index(self): arr = np.random.randn(100, 4) - result = reduction.compute_reduction(arr, np.sum, labels=Index(np.arange(4))) + result = libreduction.compute_reduction(arr, np.sum, labels=Index(np.arange(4))) expected = arr.sum(0) assert_almost_equal(result, expected) - result = reduction.compute_reduction( + result = libreduction.compute_reduction( arr, np.sum, axis=1, labels=Index(np.arange(100)) ) expected = arr.sum(1) assert_almost_equal(result, expected) dummy = Series(0.0, index=np.arange(100)) - result = reduction.compute_reduction( + result = libreduction.compute_reduction( arr, np.sum, dummy=dummy, labels=Index(np.arange(4)) ) expected = arr.sum(0) assert_almost_equal(result, expected) dummy = Series(0.0, index=np.arange(4)) - result = reduction.compute_reduction( + result = libreduction.compute_reduction( arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100)) ) expected = arr.sum(1) assert_almost_equal(result, expected) - result = reduction.compute_reduction( + result = libreduction.compute_reduction( arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100)) ) assert_almost_equal(result, expected) From 82a7455f8a69b99e9508e6f69bae943072d12a1b Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Fri, 30 Aug 2019 08:32:27 -0600 Subject: [PATCH 39/95] REGR: Fix to_csv with IntervalIndex (#28229) * REGR: Fix to_csv with IntervalIndex --- 
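[The regression is easy to reproduce; before this fix the snippet below raised a TypeError while formatting the interval index, and with it the intervals are written as their string representation, mirroring the tests added further down:

    import pandas as pd

    df = pd.DataFrame({"A": list("abc"), "B": range(3)},
                      index=pd.interval_range(0, 3))
    print(df.to_csv())
    # ,A,B
    # "(0, 1]",a,0
    # "(1, 2]",b,1
    # "(2, 3]",c,2
]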
doc/source/whatsnew/v0.25.2.rst | 2 +- pandas/core/indexes/interval.py | 8 +--- pandas/tests/frame/test_to_csv.py | 14 +++++++ .../tests/indexes/interval/test_interval.py | 40 +++++++++++++++++++ pandas/tests/series/test_io.py | 14 +++++++ 5 files changed, 71 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index 6974c7521a2376..8d8a39139cf84c 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -62,7 +62,7 @@ Missing I/O ^^^ -- +- Regression in :meth:`~DataFrame.to_csv` where writing a :class:`Series` or :class:`DataFrame` indexed by an :class:`IntervalIndex` would incorrectly raise a ``TypeError`` (:issue:`28210`) - - diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 6b0081c6a2ff51..7c581a12764b1e 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1096,12 +1096,8 @@ def _format_with_header(self, header, **kwargs): return header + list(self._format_native_types(**kwargs)) def _format_native_types(self, na_rep="NaN", quoting=None, **kwargs): - """ actually format my specific types """ - from pandas.io.formats.format import ExtensionArrayFormatter - - return ExtensionArrayFormatter( - values=self, na_rep=na_rep, justify="all", leading_space=False - ).get_result() + # GH 28210: use base method but with different default na_rep + return super()._format_native_types(na_rep=na_rep, quoting=quoting, **kwargs) def _format_data(self, name=None): diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index e2e4a82ff581cf..8fb028a0f0326d 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -695,6 +695,20 @@ def _make_frame(names=None): tm.assert_index_equal(recons.columns, exp.columns) assert len(recons) == 0 + def test_to_csv_interval_index(self): + # GH 28210 + df = DataFrame({"A": list("abc"), "B": range(3)}, index=pd.interval_range(0, 3)) + + with ensure_clean("__tmp_to_csv_interval_index__.csv") as path: + df.to_csv(path) + result = self.read_csv(path, index_col=0) + + # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) + expected = df.copy() + expected.index = expected.index.astype(str) + + assert_frame_equal(result, expected) + def test_to_csv_float32_nanrep(self): df = DataFrame(np.random.randn(1, 4).astype(np.float32)) df[1] = np.nan diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index c1a21e6a7f1527..eeb0f43f4b9003 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -417,6 +417,46 @@ def test_repr_missing(self, constructor, expected): result = repr(obj) assert result == expected + @pytest.mark.parametrize( + "tuples, closed, expected_data", + [ + ([(0, 1), (1, 2), (2, 3)], "left", ["[0, 1)", "[1, 2)", "[2, 3)"]), + ( + [(0.5, 1.0), np.nan, (2.0, 3.0)], + "right", + ["(0.5, 1.0]", "NaN", "(2.0, 3.0]"], + ), + ( + [ + (Timestamp("20180101"), Timestamp("20180102")), + np.nan, + ((Timestamp("20180102"), Timestamp("20180103"))), + ], + "both", + ["[2018-01-01, 2018-01-02]", "NaN", "[2018-01-02, 2018-01-03]"], + ), + ( + [ + (Timedelta("0 days"), Timedelta("1 days")), + (Timedelta("1 days"), Timedelta("2 days")), + np.nan, + ], + "neither", + [ + "(0 days 00:00:00, 1 days 00:00:00)", + "(1 days 00:00:00, 2 days 00:00:00)", + "NaN", + ], + ), + ], + ) + def test_to_native_types(self, tuples, closed, expected_data): + # GH 
28210 + index = IntervalIndex.from_tuples(tuples, closed=closed) + result = index.to_native_types() + expected = np.array(expected_data) + tm.assert_numpy_array_equal(result, expected) + def test_get_item(self, closed): i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), closed=closed) assert i[0] == Interval(0.0, 1.0, closed=closed) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 0686b397cbd811..0ddf1dfcabb59b 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -191,6 +191,20 @@ def test_to_csv_compression(self, s, encoding, compression): s, pd.read_csv(fh, index_col=0, squeeze=True, encoding=encoding) ) + def test_to_csv_interval_index(self): + # GH 28210 + s = Series(["foo", "bar", "baz"], index=pd.interval_range(0, 3)) + + with ensure_clean("__tmp_to_csv_interval_index__.csv") as path: + s.to_csv(path, header=False) + result = self.read_csv(path, index_col=0, squeeze=True) + + # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) + expected = s.copy() + expected.index = expected.index.astype(str) + + assert_series_equal(result, expected) + class TestSeriesIO: def test_to_frame(self, datetime_series): From 7b25463abeeea197f55ff2d5187938dd4cba08ce Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Fri, 30 Aug 2019 16:47:16 +0200 Subject: [PATCH 40/95] BUG: Multiple lambdas in named aggregation (#27921) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/groupby/generic.py | 42 ++++- .../tests/groupby/aggregate/test_aggregate.py | 149 +++++++++++++++++- 3 files changed, 187 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 050a26cc86d429..83beec5607986f 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -178,6 +178,7 @@ Groupby/resample/rolling - - - Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`) +- Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 7d6690a0dfa5ac..b0bcd1cc1e27c4 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -268,7 +268,9 @@ def aggregate(self, func, *args, **kwargs): result.index = np.arange(len(result)) if relabeling: - result = result[order] + + # used reordered index of columns + result = result.iloc[:, order] result.columns = columns return result._convert(datetime=True) @@ -1731,8 +1733,8 @@ def _normalize_keyword_aggregation(kwargs): The transformed kwargs. columns : List[str] The user-provided keys. - order : List[Tuple[str, str]] - Pairs of the input and output column names. + col_idx_order : List[int] + List of columns indices. 
Examples -------- @@ -1759,7 +1761,39 @@ def _normalize_keyword_aggregation(kwargs): else: aggspec[column] = [aggfunc] order.append((column, com.get_callable_name(aggfunc) or aggfunc)) - return aggspec, columns, order + + # uniquify aggfunc name if duplicated in order list + uniquified_order = _make_unique(order) + + # GH 25719, due to aggspec will change the order of assigned columns in aggregation + # uniquified_aggspec will store uniquified order list and will compare it with order + # based on index + aggspec_order = [ + (column, com.get_callable_name(aggfunc) or aggfunc) + for column, aggfuncs in aggspec.items() + for aggfunc in aggfuncs + ] + uniquified_aggspec = _make_unique(aggspec_order) + + # get the new indice of columns by comparison + col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order) + return aggspec, columns, col_idx_order + + +def _make_unique(seq): + """Uniquify aggfunc name of the pairs in the order list + + Examples: + -------- + >>> _make_unique([('a', ''), ('a', ''), ('b', '')]) + [('a', '_0'), ('a', '_1'), ('b', '')] + """ + return [ + (pair[0], "_".join([pair[1], str(seq[:i].count(pair))])) + if seq.count(pair) > 1 + else pair + for i, pair in enumerate(seq) + ] # TODO: Can't use, because mypy doesn't like us setting __name__ diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 52d4fa76bf8794..aa80c461a00e79 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -10,7 +10,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, compat, concat from pandas.core.base import SpecificationError -from pandas.core.groupby.generic import _maybe_mangle_lambdas +from pandas.core.groupby.generic import _make_unique, _maybe_mangle_lambdas from pandas.core.groupby.grouper import Grouping import pandas.util.testing as tm @@ -560,3 +560,150 @@ def test_with_kwargs(self): result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10) expected = pd.DataFrame({"": [13], "": [30]}) tm.assert_frame_equal(result, expected) + + def test_agg_with_one_lambda(self): + # GH 25719, write tests for DataFrameGroupby.agg with only one lambda + df = pd.DataFrame( + { + "kind": ["cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0], + "weight": [7.9, 7.5, 9.9, 198.0], + } + ) + + # sort for 35 and earlier + columns = ["height_sqr_min", "height_max", "weight_max"] + if compat.PY35: + columns = ["height_max", "height_sqr_min", "weight_max"] + expected = pd.DataFrame( + { + "height_sqr_min": [82.81, 36.00], + "height_max": [9.5, 34.0], + "weight_max": [9.9, 198.0], + }, + index=pd.Index(["cat", "dog"], name="kind"), + columns=columns, + ) + + # check pd.NameAgg case + result1 = df.groupby(by="kind").agg( + height_sqr_min=pd.NamedAgg( + column="height", aggfunc=lambda x: np.min(x ** 2) + ), + height_max=pd.NamedAgg(column="height", aggfunc="max"), + weight_max=pd.NamedAgg(column="weight", aggfunc="max"), + ) + tm.assert_frame_equal(result1, expected) + + # check agg(key=(col, aggfunc)) case + result2 = df.groupby(by="kind").agg( + height_sqr_min=("height", lambda x: np.min(x ** 2)), + height_max=("height", "max"), + weight_max=("weight", "max"), + ) + tm.assert_frame_equal(result2, expected) + + def test_agg_multiple_lambda(self): + # GH25719, test for DataFrameGroupby.agg with multiple lambdas + # with mixed aggfunc + df = pd.DataFrame( + { + "kind": ["cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0], + 
"weight": [7.9, 7.5, 9.9, 198.0], + } + ) + # sort for 35 and earlier + columns = [ + "height_sqr_min", + "height_max", + "weight_max", + "height_max_2", + "weight_min", + ] + if compat.PY35: + columns = [ + "height_max", + "height_max_2", + "height_sqr_min", + "weight_max", + "weight_min", + ] + expected = pd.DataFrame( + { + "height_sqr_min": [82.81, 36.00], + "height_max": [9.5, 34.0], + "weight_max": [9.9, 198.0], + "height_max_2": [9.5, 34.0], + "weight_min": [7.9, 7.5], + }, + index=pd.Index(["cat", "dog"], name="kind"), + columns=columns, + ) + + # check agg(key=(col, aggfunc)) case + result1 = df.groupby(by="kind").agg( + height_sqr_min=("height", lambda x: np.min(x ** 2)), + height_max=("height", "max"), + weight_max=("weight", "max"), + height_max_2=("height", lambda x: np.max(x)), + weight_min=("weight", lambda x: np.min(x)), + ) + tm.assert_frame_equal(result1, expected) + + # check pd.NamedAgg case + result2 = df.groupby(by="kind").agg( + height_sqr_min=pd.NamedAgg( + column="height", aggfunc=lambda x: np.min(x ** 2) + ), + height_max=pd.NamedAgg(column="height", aggfunc="max"), + weight_max=pd.NamedAgg(column="weight", aggfunc="max"), + height_max_2=pd.NamedAgg(column="height", aggfunc=lambda x: np.max(x)), + weight_min=pd.NamedAgg(column="weight", aggfunc=lambda x: np.min(x)), + ) + tm.assert_frame_equal(result2, expected) + + @pytest.mark.parametrize( + "order, expected_reorder", + [ + ( + [ + ("height", ""), + ("height", "max"), + ("weight", "max"), + ("height", ""), + ("weight", ""), + ], + [ + ("height", "_0"), + ("height", "max"), + ("weight", "max"), + ("height", "_1"), + ("weight", ""), + ], + ), + ( + [ + ("col2", "min"), + ("col1", ""), + ("col1", ""), + ("col1", ""), + ], + [ + ("col2", "min"), + ("col1", "_0"), + ("col1", "_1"), + ("col1", "_2"), + ], + ), + ( + [("col", ""), ("col", ""), ("col", "")], + [("col", "_0"), ("col", "_1"), ("col", "_2")], + ), + ], + ) + def test_make_unique(self, order, expected_reorder): + # GH 27519, test if make_unique function reorders correctly + result = _make_unique(order) + + assert result == expected_reorder From 51db82d9cc1abcec6c912d83e714811005471379 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 30 Aug 2019 09:38:39 -0700 Subject: [PATCH 41/95] PERF: lazify pytz seqToRE call, trims 35ms from import (#28228) --- pandas/_libs/tslibs/strptime.pyx | 44 +++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index d93858cff5e053..fbda5f178e1647 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -341,7 +341,8 @@ def array_strptime(object[:] values, object fmt, return result, result_timezone.base -"""_getlang, LocaleTime, TimeRE, _calc_julian_from_U_or_W are vendored +""" +_getlang, LocaleTime, TimeRE, _calc_julian_from_U_or_W are vendored from the standard library, see https://github.com/python/cpython/blob/master/Lib/_strptime.py The original module-level docstring follows. @@ -363,7 +364,8 @@ def _getlang(): class LocaleTime: - """Stores and handles locale-specific information related to time. + """ + Stores and handles locale-specific information related to time. ATTRIBUTES: f_weekday -- full weekday names (7-item list) @@ -382,7 +384,8 @@ class LocaleTime: """ def __init__(self): - """Set all attributes. + """ + Set all attributes. Order of methods called matters for dependency reasons. 
@@ -399,7 +402,6 @@ class LocaleTime: Only other possible issue is if someone changed the timezone and did not call tz.tzset . That is an issue for the programmer, though, since changing the timezone is worthless without that call. - """ self.lang = _getlang() self.__calc_weekday() @@ -518,15 +520,16 @@ class TimeRE(dict): """ def __init__(self, locale_time=None): - """Create keys/values. + """ + Create keys/values. Order of execution is important for dependency reasons. - """ if locale_time: self.locale_time = locale_time else: self.locale_time = LocaleTime() + self._Z = None base = super() base.__init__({ # The " \d" part of the regex is to make %c from ANSI C work @@ -555,21 +558,29 @@ class TimeRE(dict): 'B': self.__seqToRE(self.locale_time.f_month[1:], 'B'), 'b': self.__seqToRE(self.locale_time.a_month[1:], 'b'), 'p': self.__seqToRE(self.locale_time.am_pm, 'p'), - 'Z': self.__seqToRE(pytz.all_timezones, 'Z'), + # 'Z' key is generated lazily via __getitem__ '%': '%'}) base.__setitem__('W', base.__getitem__('U').replace('U', 'W')) base.__setitem__('c', self.pattern(self.locale_time.LC_date_time)) base.__setitem__('x', self.pattern(self.locale_time.LC_date)) base.__setitem__('X', self.pattern(self.locale_time.LC_time)) + def __getitem__(self, key): + if key == "Z": + # lazy computation + if self._Z is None: + self._Z = self.__seqToRE(pytz.all_timezones, 'Z') + return self._Z + return super().__getitem__(key) + def __seqToRE(self, to_convert, directive): - """Convert a list to a regex string for matching a directive. + """ + Convert a list to a regex string for matching a directive. Want possible matching values to be from longest to shortest. This prevents the possibility of a match occurring for a value that also a substring of a larger value that should have matched (e.g., 'abc' matching when 'abcdef' should have been the match). - """ to_convert = sorted(to_convert, key=len, reverse=True) for value in to_convert: @@ -582,11 +593,11 @@ class TimeRE(dict): return '%s)' % regex def pattern(self, format): - """Return regex pattern for the format string. + """ + Return regex pattern for the format string. Need to make sure that any characters that might be interpreted as regex syntax are escaped. - """ processed_format = '' # The sub() call escapes all characters that might be misconstrued @@ -619,7 +630,8 @@ _regex_cache = {} cdef int _calc_julian_from_U_or_W(int year, int week_of_year, int day_of_week, int week_starts_Mon): - """Calculate the Julian day based on the year, week of the year, and day of + """ + Calculate the Julian day based on the year, week of the year, and day of the week, with week_start_day representing whether the week of the year assumes the week starts on Sunday or Monday (6 or 0). @@ -660,8 +672,10 @@ cdef int _calc_julian_from_U_or_W(int year, int week_of_year, return 1 + days_to_week + day_of_week -cdef object _calc_julian_from_V(int iso_year, int iso_week, int iso_weekday): - """Calculate the Julian day based on the ISO 8601 year, week, and weekday. +cdef (int, int) _calc_julian_from_V(int iso_year, int iso_week, int iso_weekday): + """ + Calculate the Julian day based on the ISO 8601 year, week, and weekday. + ISO weeks start on Mondays, with week 01 being the week containing 4 Jan. ISO week days range from 1 (Monday) to 7 (Sunday). 
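[As a cross-check on this week-date arithmetic: from Python 3.8 the standard library exposes the same ISO-calendar mapping, which is convenient for validating the tricky cases where week 01 begins in the previous calendar year:

    from datetime import date

    d = date.fromisocalendar(2020, 1, 1)  # ISO year 2020, week 01, Monday
    print(d)                              # 2019-12-30
    assert d.isocalendar()[:3] == (2020, 1, 1)
]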
@@ -694,7 +708,7 @@ cdef object _calc_julian_from_V(int iso_year, int iso_week, int iso_weekday): return iso_year, ordinal -cdef parse_timezone_directive(object z): +cdef parse_timezone_directive(str z): """ Parse the '%z' directive and return a pytz.FixedOffset From 75c9783d4924c98d84e9722060686fc7b4643259 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Fri, 30 Aug 2019 19:05:31 +0200 Subject: [PATCH 42/95] STYLE: run pre-commit filters on the repo (#27915) * add isort:skip to "from .pandas_vb_common import setup" * add isort:skip to noqa: E402 marked lines * run black * add noqa: E402 isort:skip where needed * run pre-commit filters on asv_bench/benchmarks/ * parse the isort config when using pre-commit * run isort on pandas/core/api.py * run pre-commit filters and commit trivial import sorting changes * specify flake8 errors in pandas/io/msgpack/__init__.py * fix imports for doc/source/conf.py * fix the [isort] skip entry in setup.cfg Also I removed the files for which I have fixed the problems. --- .pre-commit-config.yaml | 36 ++++++----- asv_bench/benchmarks/attrs_caching.py | 3 +- asv_bench/benchmarks/binary_ops.py | 3 +- asv_bench/benchmarks/categoricals.py | 6 +- asv_bench/benchmarks/ctors.py | 5 +- asv_bench/benchmarks/dtypes.py | 10 +-- asv_bench/benchmarks/eval.py | 3 +- asv_bench/benchmarks/frame_ctor.py | 5 +- asv_bench/benchmarks/frame_methods.py | 4 +- asv_bench/benchmarks/gil.py | 9 +-- asv_bench/benchmarks/groupby.py | 3 +- asv_bench/benchmarks/index_object.py | 14 +++-- asv_bench/benchmarks/indexing.py | 17 ++--- asv_bench/benchmarks/inference.py | 7 ++- asv_bench/benchmarks/io/csv.py | 7 ++- asv_bench/benchmarks/io/excel.py | 6 +- asv_bench/benchmarks/io/hdf.py | 5 +- asv_bench/benchmarks/io/json.py | 5 +- asv_bench/benchmarks/io/msgpack.py | 4 +- asv_bench/benchmarks/io/pickle.py | 3 +- asv_bench/benchmarks/io/sql.py | 7 ++- asv_bench/benchmarks/io/stata.py | 3 +- asv_bench/benchmarks/join_merge.py | 5 +- asv_bench/benchmarks/multiindex_object.py | 5 +- asv_bench/benchmarks/offset.py | 3 +- asv_bench/benchmarks/pandas_vb_common.py | 3 +- asv_bench/benchmarks/period.py | 1 + asv_bench/benchmarks/plotting.py | 7 ++- asv_bench/benchmarks/reindex.py | 6 +- asv_bench/benchmarks/replace.py | 3 +- asv_bench/benchmarks/reshape.py | 7 ++- asv_bench/benchmarks/rolling.py | 5 +- asv_bench/benchmarks/series_methods.py | 5 +- asv_bench/benchmarks/sparse.py | 2 +- asv_bench/benchmarks/stat_ops.py | 4 +- asv_bench/benchmarks/strings.py | 3 +- asv_bench/benchmarks/timeseries.py | 6 +- ci/print_skipped.py | 2 +- doc/logo/pandas_logo.py | 3 +- doc/make.py | 8 +-- doc/source/conf.py | 24 +++++--- doc/source/user_guide/io.rst | 2 +- doc/sphinxext/contributors.py | 3 +- pandas/core/api.py | 61 +++++++++--------- pandas/io/msgpack/__init__.py | 14 +++-- pandas/tests/io/pytables/test_pytables.py | 5 +- pandas/tests/io/test_feather.py | 2 +- scripts/find_commits_touching_func.py | 6 +- scripts/generate_pip_deps_from_conda.py | 2 +- scripts/merge-pr.py | 7 ++- scripts/tests/test_validate_docstrings.py | 7 ++- scripts/validate_docstrings.py | 32 +++++----- setup.cfg | 75 +++++------------------ setup.py | 20 +++--- 54 files changed, 255 insertions(+), 248 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 32ffb3330564c3..5cc22c638c9b13 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,17 +1,21 @@ repos: - - repo: https://github.com/python/black - rev: stable - hooks: - - id: black - language_version: python3.7 - - repo: 
https://gitlab.com/pycqa/flake8 - rev: 3.7.7 - hooks: - - id: flake8 - language: python_venv - additional_dependencies: [flake8-comprehensions] - - repo: https://github.com/pre-commit/mirrors-isort - rev: v4.3.20 - hooks: - - id: isort - language: python_venv +- repo: https://github.com/python/black + rev: stable + hooks: + - id: black + language_version: python3.7 +- repo: https://gitlab.com/pycqa/flake8 + rev: 3.7.7 + hooks: + - id: flake8 + language: python_venv + additional_dependencies: [flake8-comprehensions] +- repo: https://github.com/pre-commit/mirrors-isort + rev: v4.3.20 + hooks: + - id: isort + language: python_venv +- repo: https://github.com/asottile/seed-isort-config + rev: v1.9.2 + hooks: + - id: seed-isort-config diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py index c43e5dfd729aad..501e27b9078ec6 100644 --- a/asv_bench/benchmarks/attrs_caching.py +++ b/asv_bench/benchmarks/attrs_caching.py @@ -1,4 +1,5 @@ import numpy as np + from pandas import DataFrame try: @@ -32,4 +33,4 @@ def time_cache_readonly(self): self.obj.prop -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index fd3324b78f1c3d..58e0db67d60254 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -1,4 +1,5 @@ import numpy as np + from pandas import DataFrame, Series, date_range from pandas.core.algorithms import checked_add_with_arr @@ -155,4 +156,4 @@ def time_add_overflow_both_arg_nan(self): ) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 8097118a79d20d..559aa7050a6407 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,7 +1,9 @@ +import warnings + import numpy as np + import pandas as pd import pandas.util.testing as tm -import warnings try: from pandas.api.types import union_categoricals @@ -280,4 +282,4 @@ def time_sort_values(self): self.index.sort_values(ascending=False) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index 654075292cdf62..ec3dd7a48a89f4 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -1,6 +1,7 @@ import numpy as np + +from pandas import DatetimeIndex, Index, MultiIndex, Series, Timestamp import pandas.util.testing as tm -from pandas import Series, Index, DatetimeIndex, Timestamp, MultiIndex def no_change(arr): @@ -113,4 +114,4 @@ def time_multiindex_from_iterables(self): MultiIndex.from_product(self.iterables) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py index 60800b1f9cae71..24cc1c6f9fa701 100644 --- a/asv_bench/benchmarks/dtypes.py +++ b/asv_bench/benchmarks/dtypes.py @@ -1,14 +1,14 @@ +import numpy as np + from pandas.api.types import pandas_dtype -import numpy as np from .pandas_vb_common import ( - numeric_dtypes, datetime_dtypes, - string_dtypes, extension_dtypes, + numeric_dtypes, + string_dtypes, ) - _numpy_dtypes = [ np.dtype(dtype) for dtype in (numeric_dtypes + datetime_dtypes + string_dtypes) ] @@ -40,4 
+40,4 @@ def time_pandas_dtype_invalid(self, dtype): pass -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py index 84e94315cc28b0..06a181875aaa85 100644 --- a/asv_bench/benchmarks/eval.py +++ b/asv_bench/benchmarks/eval.py @@ -1,4 +1,5 @@ import numpy as np + import pandas as pd try: @@ -62,4 +63,4 @@ def time_query_with_boolean_selection(self): self.df.query("(a >= @self.min_val) & (a <= @self.max_val)") -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index acfb26bcf5d7ca..3944e0bc523d84 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -1,6 +1,7 @@ import numpy as np + +from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range import pandas.util.testing as tm -from pandas import DataFrame, Series, MultiIndex, Timestamp, date_range try: from pandas.tseries.offsets import Nano, Hour @@ -104,4 +105,4 @@ def time_frame_from_lists(self): self.df = DataFrame(self.data) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index e2f6764c76eef8..05f98c66faa2b8 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -1,5 +1,5 @@ -import warnings import string +import warnings import numpy as np @@ -609,4 +609,4 @@ def time_dataframe_describe(self): self.df.describe() -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 0d0b75561d057a..d57492dd372680 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -1,7 +1,8 @@ import numpy as np -import pandas.util.testing as tm -from pandas import DataFrame, Series, read_csv, factorize, date_range + +from pandas import DataFrame, Series, date_range, factorize, read_csv from pandas.core.algorithms import take_1d +import pandas.util.testing as tm try: from pandas import ( @@ -36,7 +37,7 @@ def wrapper(fname): return wrapper -from .pandas_vb_common import BaseIO +from .pandas_vb_common import BaseIO # noqa: E402 isort:skip class ParallelGroupbyMethods: @@ -301,4 +302,4 @@ def time_loop(self, threads): self.loop() -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 39b07d4734399e..d51c53e2264f1a 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -15,7 +15,6 @@ ) import pandas.util.testing as tm - method_blacklist = { "object": { "median", @@ -626,4 +625,4 @@ def time_first(self): self.df_nans.groupby("key").transform("first") -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 49834ae94cc387..a94960d4947077 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -1,15 +1,17 @@ import gc + import numpy as np -import pandas.util.testing as tm + from pandas import ( - Series, - date_range, DatetimeIndex, - Index, - 
RangeIndex, Float64Index, + Index, IntervalIndex, + RangeIndex, + Series, + date_range, ) +import pandas.util.testing as tm class SetOperations: @@ -243,4 +245,4 @@ def peakmem_gc_instances(self, N): gc.enable() -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 84604b8196536b..ac35139c1954ab 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -1,22 +1,23 @@ import warnings import numpy as np -import pandas.util.testing as tm + from pandas import ( - Series, + CategoricalIndex, DataFrame, - MultiIndex, - Int64Index, - UInt64Index, Float64Index, - IntervalIndex, - CategoricalIndex, IndexSlice, + Int64Index, + IntervalIndex, + MultiIndex, + Series, + UInt64Index, concat, date_range, option_context, period_range, ) +import pandas.util.testing as tm class NumericSeriesIndexing: @@ -371,4 +372,4 @@ def time_chained_indexing(self, mode): df2["C"] = 1.0 -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 66ef4f2aec380c..e85b3bd2c76879 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -1,8 +1,9 @@ import numpy as np -import pandas.util.testing as tm + from pandas import DataFrame, Series, to_numeric +import pandas.util.testing as tm -from .pandas_vb_common import numeric_dtypes, lib +from .pandas_vb_common import lib, numeric_dtypes class NumericInferOps: @@ -120,4 +121,4 @@ def time_convert(self, data): lib.maybe_convert_numeric(data, set(), coerce_numeric=False) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 4525e504fc4dd5..9b8599b0a1b64a 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -1,10 +1,11 @@ +from io import StringIO import random import string import numpy as np + +from pandas import Categorical, DataFrame, date_range, read_csv, to_datetime import pandas.util.testing as tm -from pandas import DataFrame, Categorical, date_range, read_csv, to_datetime -from io import StringIO from ..pandas_vb_common import BaseIO @@ -406,4 +407,4 @@ def time_to_datetime_format_DD_MM_YYYY(self, cache_dates): to_datetime(df["date"], cache=cache_dates, format="%d-%m-%Y") -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 12e70f84e52038..9aa5cbd5b6f7c3 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -1,6 +1,8 @@ from io import BytesIO + import numpy as np -from pandas import DataFrame, date_range, ExcelWriter, read_excel + +from pandas import DataFrame, ExcelWriter, date_range, read_excel import pandas.util.testing as tm @@ -35,4 +37,4 @@ def time_write_excel(self, engine): writer_write.save() -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py index 2874a7889156bf..8ec04a2087f1b7 100644 --- a/asv_bench/benchmarks/io/hdf.py +++ b/asv_bench/benchmarks/io/hdf.py @@ -1,5 +1,6 @@ import numpy as np -from pandas import DataFrame, date_range, 
HDFStore, read_hdf + +from pandas import DataFrame, HDFStore, date_range, read_hdf import pandas.util.testing as tm from ..pandas_vb_common import BaseIO @@ -127,4 +128,4 @@ def time_write_hdf(self, format): self.df.to_hdf(self.fname, "df", format=format) -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index fc07f2a4841025..b249c92b53e93e 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -1,6 +1,7 @@ import numpy as np + +from pandas import DataFrame, concat, date_range, read_json, timedelta_range import pandas.util.testing as tm -from pandas import DataFrame, date_range, timedelta_range, concat, read_json from ..pandas_vb_common import BaseIO @@ -214,4 +215,4 @@ def peakmem_float(self, frames): df.to_json() -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/msgpack.py b/asv_bench/benchmarks/io/msgpack.py index d97b4ae13f0bd5..f5038602539ab6 100644 --- a/asv_bench/benchmarks/io/msgpack.py +++ b/asv_bench/benchmarks/io/msgpack.py @@ -1,5 +1,7 @@ import warnings + import numpy as np + from pandas import DataFrame, date_range, read_msgpack import pandas.util.testing as tm @@ -27,4 +29,4 @@ def time_write_msgpack(self): self.df.to_msgpack(self.fname) -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py index 286ac767c02e7e..647e9d27dec9d3 100644 --- a/asv_bench/benchmarks/io/pickle.py +++ b/asv_bench/benchmarks/io/pickle.py @@ -1,4 +1,5 @@ import numpy as np + from pandas import DataFrame, date_range, read_pickle import pandas.util.testing as tm @@ -25,4 +26,4 @@ def time_write_pickle(self): self.df.to_pickle(self.fname) -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py index b80872b17a9e4a..fe84c869717e38 100644 --- a/asv_bench/benchmarks/io/sql.py +++ b/asv_bench/benchmarks/io/sql.py @@ -1,10 +1,11 @@ import sqlite3 import numpy as np -import pandas.util.testing as tm -from pandas import DataFrame, date_range, read_sql_query, read_sql_table from sqlalchemy import create_engine +from pandas import DataFrame, date_range, read_sql_query, read_sql_table +import pandas.util.testing as tm + class SQL: @@ -141,4 +142,4 @@ def time_read_sql_table_column(self, dtype): read_sql_table(self.table_name, self.con, columns=[dtype]) -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/stata.py b/asv_bench/benchmarks/io/stata.py index b3ed71af47dc8b..28829785d72e92 100644 --- a/asv_bench/benchmarks/io/stata.py +++ b/asv_bench/benchmarks/io/stata.py @@ -1,4 +1,5 @@ import numpy as np + from pandas import DataFrame, date_range, read_stata import pandas.util.testing as tm @@ -50,4 +51,4 @@ def setup(self, convert_dates): self.df.to_stata(self.fname, self.convert_dates) -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 7c899e3dc6ac8a..6aa82a43a4d6a0 100644 --- a/asv_bench/benchmarks/join_merge.py +++ 
b/asv_bench/benchmarks/join_merge.py @@ -1,8 +1,9 @@ import string import numpy as np + +from pandas import DataFrame, MultiIndex, Series, concat, date_range, merge, merge_asof import pandas.util.testing as tm -from pandas import DataFrame, Series, MultiIndex, date_range, concat, merge, merge_asof try: from pandas import merge_ordered @@ -348,4 +349,4 @@ def time_series_align_left_monotonic(self): self.ts1.align(self.ts2, join="left") -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index eda059a68e8a58..3f4fd7ad911c1e 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -1,8 +1,9 @@ import string import numpy as np + +from pandas import DataFrame, MultiIndex, date_range import pandas.util.testing as tm -from pandas import date_range, MultiIndex, DataFrame class GetLoc: @@ -146,4 +147,4 @@ def time_categorical_level(self): self.df.set_index(["a", "b"]) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/offset.py b/asv_bench/benchmarks/offset.py index 31c3b6fb6cb60a..d822646e712ae5 100644 --- a/asv_bench/benchmarks/offset.py +++ b/asv_bench/benchmarks/offset.py @@ -1,7 +1,8 @@ -import warnings from datetime import datetime +import warnings import numpy as np + import pandas as pd try: diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index fdc8207021c0f3..1faf13329110d6 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -1,7 +1,8 @@ -import os from importlib import import_module +import os import numpy as np + import pandas as pd # Compatibility import for lib diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index 2f8ae0650ab751..7303240a25f292 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -1,4 +1,5 @@ from pandas import DataFrame, Period, PeriodIndex, Series, date_range, period_range + from pandas.tseries.frequencies import to_offset diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py index 4fb0876f05a0a0..5c718516360ed2 100644 --- a/asv_bench/benchmarks/plotting.py +++ b/asv_bench/benchmarks/plotting.py @@ -1,11 +1,12 @@ +import matplotlib import numpy as np -from pandas import DataFrame, Series, DatetimeIndex, date_range + +from pandas import DataFrame, DatetimeIndex, Series, date_range try: from pandas.plotting import andrews_curves except ImportError: from pandas.tools.plotting import andrews_curves -import matplotlib matplotlib.use("Agg") @@ -93,4 +94,4 @@ def time_plot_andrews_curves(self): andrews_curves(self.df, "Name") -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index 8d4c9ebaf3e891..cd450f801c8052 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -1,6 +1,8 @@ import numpy as np + +from pandas import DataFrame, Index, MultiIndex, Series, date_range, period_range import pandas.util.testing as tm -from pandas import DataFrame, Series, MultiIndex, Index, date_range, period_range + from .pandas_vb_common import lib @@ -159,4 +161,4 @@ def time_lib_fast_zip(self): lib.fast_zip(self.col_array_list) -from 
.pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py index f69ae150285255..2a115fb0b4fe33 100644 --- a/asv_bench/benchmarks/replace.py +++ b/asv_bench/benchmarks/replace.py @@ -1,4 +1,5 @@ import numpy as np + import pandas as pd @@ -73,4 +74,4 @@ def time_replace(self, constructor, replace_data): self.data.replace(self.to_replace) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index cc373f413fb885..441f4b380656ec 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -1,9 +1,10 @@ -import string from itertools import product +import string import numpy as np -from pandas import DataFrame, MultiIndex, date_range, melt, wide_to_long + import pandas as pd +from pandas import DataFrame, MultiIndex, date_range, melt, wide_to_long class Melt: @@ -262,4 +263,4 @@ def time_explode(self, n_rows, max_list_length): self.series.explode() -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index a70977fcf539f7..3640513d31be26 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -1,6 +1,7 @@ -import pandas as pd import numpy as np +import pandas as pd + class Methods: @@ -121,4 +122,4 @@ def peakmem_fixed(self): self.roll.max() -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 6038a2ab4bd9f9..a3f1d92545c3f2 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -1,8 +1,9 @@ from datetime import datetime import numpy as np + +from pandas import NaT, Series, date_range import pandas.util.testing as tm -from pandas import Series, date_range, NaT class SeriesConstructor: @@ -275,4 +276,4 @@ def time_func(self, func, N, dtype): self.func() -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index 19d08c086a508a..ac78ca53679fd6 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -136,4 +136,4 @@ def time_division(self, fill_value): self.arr1 / self.arr2 -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 620a6de0f5f341..6032bee41958e1 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -1,6 +1,6 @@ import numpy as np -import pandas as pd +import pandas as pd ops = ["mean", "sum", "median", "std", "skew", "kurt", "mad", "prod", "sem", "var"] @@ -148,4 +148,4 @@ def time_cov_series(self, use_bottleneck): self.s.cov(self.s2) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 6be2fa92d9eac3..f30b2482615bd2 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -1,7 +1,8 @@ import warnings import numpy as np -from pandas import 
Series, DataFrame + +from pandas import DataFrame, Series import pandas.util.testing as tm diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 1020b773f8acbb..498774034d6422 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -2,7 +2,9 @@ import dateutil import numpy as np -from pandas import to_datetime, date_range, Series, DataFrame, period_range + +from pandas import DataFrame, Series, date_range, period_range, to_datetime + from pandas.tseries.frequencies import infer_freq try: @@ -426,4 +428,4 @@ def time_dt_accessor_year(self, tz): self.series.dt.year -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/ci/print_skipped.py b/ci/print_skipped.py index a44281044e11d0..6bc1dcfcd320dd 100755 --- a/ci/print_skipped.py +++ b/ci/print_skipped.py @@ -1,8 +1,8 @@ #!/usr/bin/env python +import math import os import sys -import math import xml.etree.ElementTree as et diff --git a/doc/logo/pandas_logo.py b/doc/logo/pandas_logo.py index 5a07b094e6ad35..89410e3847bef9 100644 --- a/doc/logo/pandas_logo.py +++ b/doc/logo/pandas_logo.py @@ -1,7 +1,6 @@ # script to generate the pandas logo -from matplotlib import pyplot as plt -from matplotlib import rcParams +from matplotlib import pyplot as plt, rcParams import numpy as np rcParams["mathtext.fontset"] = "cm" diff --git a/doc/make.py b/doc/make.py index 48febef20fbe66..cbb1fa6a5324aa 100755 --- a/doc/make.py +++ b/doc/make.py @@ -11,18 +11,18 @@ $ python make.py html $ python make.py latex """ +import argparse +import csv import importlib -import sys import os import shutil -import csv import subprocess -import argparse +import sys import webbrowser + import docutils import docutils.parsers.rst - DOC_PATH = os.path.dirname(os.path.abspath(__file__)) SOURCE_PATH = os.path.join(DOC_PATH, "source") BUILD_PATH = os.path.join(DOC_PATH, "build") diff --git a/doc/source/conf.py b/doc/source/conf.py index a4b7d97c2cf5e2..1da1948e452688 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -10,15 +10,15 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys -import os -import inspect import importlib +import inspect import logging +import os +import sys + import jinja2 -from sphinx.ext.autosummary import _import_by_name from numpydoc.docscrape import NumpyDocString - +from sphinx.ext.autosummary import _import_by_name logger = logging.getLogger(__name__) @@ -141,7 +141,7 @@ # built documents. # # The short X.Y version. 
-import pandas +import pandas # noqa: E402 isort:skip # version = '%s r%s' % (pandas.__version__, svn_version()) version = str(pandas.__version__) @@ -432,10 +432,14 @@ # Add custom Documenter to handle attributes/methods of an AccessorProperty # eg pandas.Series.str and pandas.Series.dt (see GH9322) -import sphinx -from sphinx.util import rpartition -from sphinx.ext.autodoc import Documenter, MethodDocumenter, AttributeDocumenter -from sphinx.ext.autosummary import Autosummary +import sphinx # noqa: E402 isort:skip +from sphinx.util import rpartition # noqa: E402 isort:skip +from sphinx.ext.autodoc import ( # noqa: E402 isort:skip + AttributeDocumenter, + Documenter, + MethodDocumenter, +) +from sphinx.ext.autosummary import Autosummary # noqa: E402 isort:skip class AccessorDocumenter(MethodDocumenter): diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 1d49dbdee9c03a..338c890ce317c5 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3206,7 +3206,7 @@ argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are: writer = pd.ExcelWriter('path_to_file.xlsx', engine='xlsxwriter') # Or via pandas configuration. - from pandas import options # noqa: E402 + from pandas import options # noqa: E402 options.io.excel.xlsx.writer = 'xlsxwriter' df.to_excel('path_to_file.xlsx', sheet_name='Sheet1') diff --git a/doc/sphinxext/contributors.py b/doc/sphinxext/contributors.py index 4256e4659715d2..1a064f71792e96 100644 --- a/doc/sphinxext/contributors.py +++ b/doc/sphinxext/contributors.py @@ -8,12 +8,11 @@ code contributors and commits, and then list each contributor individually. """ +from announce import build_components from docutils import nodes from docutils.parsers.rst import Directive import git -from announce import build_components - class ContributorsDirective(Directive): required_arguments = 1 diff --git a/pandas/core/api.py b/pandas/core/api.py index 73323d93b8215a..bd2a57a15bdd2b 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -2,6 +2,16 @@ import numpy as np +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, +) +from pandas.core.dtypes.missing import isna, isnull, notna, notnull + +from pandas.core.algorithms import factorize, unique, value_counts +from pandas.core.arrays import Categorical from pandas.core.arrays.integer import ( Int8Dtype, Int16Dtype, @@ -12,45 +22,38 @@ UInt32Dtype, UInt64Dtype, ) -from pandas.core.algorithms import factorize, unique, value_counts -from pandas.core.dtypes.missing import isna, isnull, notna, notnull -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, - PeriodDtype, - IntervalDtype, - DatetimeTZDtype, -) -from pandas.core.arrays import Categorical from pandas.core.construction import array + from pandas.core.groupby import Grouper, NamedAgg -from pandas.io.formats.format import set_eng_float_format + +# DataFrame needs to be imported after NamedAgg to avoid a circular import +from pandas.core.frame import DataFrame # isort:skip from pandas.core.index import ( - Index, CategoricalIndex, - Int64Index, - UInt64Index, - RangeIndex, + DatetimeIndex, Float64Index, - MultiIndex, + Index, + Int64Index, IntervalIndex, - TimedeltaIndex, - DatetimeIndex, - PeriodIndex, + MultiIndex, NaT, + PeriodIndex, + RangeIndex, + TimedeltaIndex, + UInt64Index, ) +from pandas.core.indexes.datetimes import Timestamp, bdate_range, date_range +from pandas.core.indexes.interval import Interval, interval_range from 
pandas.core.indexes.period import Period, period_range from pandas.core.indexes.timedeltas import Timedelta, timedelta_range -from pandas.core.indexes.datetimes import Timestamp, date_range, bdate_range -from pandas.core.indexes.interval import Interval, interval_range - -from pandas.core.series import Series -from pandas.core.frame import DataFrame - -# TODO: Remove import when statsmodels updates #18264 -from pandas.core.reshape.reshape import get_dummies - from pandas.core.indexing import IndexSlice -from pandas.core.tools.numeric import to_numeric -from pandas.tseries.offsets import DateOffset +from pandas.core.reshape.reshape import ( + get_dummies, +) # TODO: Remove get_dummies import when statsmodels updates #18264 +from pandas.core.series import Series from pandas.core.tools.datetimes import to_datetime +from pandas.core.tools.numeric import to_numeric from pandas.core.tools.timedeltas import to_timedelta + +from pandas.io.formats.format import set_eng_float_format +from pandas.tseries.offsets import DateOffset diff --git a/pandas/io/msgpack/__init__.py b/pandas/io/msgpack/__init__.py index 9b09cffd83f755..7107263c180cb1 100644 --- a/pandas/io/msgpack/__init__.py +++ b/pandas/io/msgpack/__init__.py @@ -2,8 +2,8 @@ from collections import namedtuple -from pandas.io.msgpack.exceptions import * # noqa -from pandas.io.msgpack._version import version # noqa +from pandas.io.msgpack.exceptions import * # noqa: F401,F403 isort:skip +from pandas.io.msgpack._version import version # noqa: F401 isort:skip class ExtType(namedtuple("ExtType", "code data")): @@ -19,10 +19,14 @@ def __new__(cls, code, data): return super().__new__(cls, code, data) -import os # noqa +import os # noqa: F401,E402 isort:skip -from pandas.io.msgpack._packer import Packer # noqa -from pandas.io.msgpack._unpacker import unpack, unpackb, Unpacker # noqa +from pandas.io.msgpack._unpacker import ( # noqa: F401,E402 isort:skip + Unpacker, + unpack, + unpackb, +) +from pandas.io.msgpack._packer import Packer # noqa: E402 isort:skip def pack(o, stream, **kwargs): diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index d67f2c3b7bd66e..7306393a1339ee 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -37,7 +37,6 @@ import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal, set_timezone -from pandas.io import pytables as pytables # noqa:E402 from pandas.io.formats.printing import pprint_thing from pandas.io.pytables import ( ClosedFileError, @@ -46,7 +45,9 @@ Term, read_hdf, ) -from pandas.io.pytables import TableIterator # noqa:E402 + +from pandas.io import pytables as pytables # noqa: E402 isort:skip +from pandas.io.pytables import TableIterator # noqa: E402 isort:skip tables = pytest.importorskip("tables") diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 87a2405a10dd5c..ee668d6890756f 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -8,7 +8,7 @@ import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, ensure_clean -from pandas.io.feather_format import read_feather, to_feather # noqa:E402 +from pandas.io.feather_format import read_feather, to_feather # noqa: E402 isort:skip pyarrow = pytest.importorskip("pyarrow") diff --git a/scripts/find_commits_touching_func.py b/scripts/find_commits_touching_func.py index 1075a257d42705..95a892b822cff7 100755 --- 
a/scripts/find_commits_touching_func.py +++ b/scripts/find_commits_touching_func.py @@ -10,11 +10,11 @@ Usage:: $ ./find_commits_touching_func.py (see arguments below) """ -import logging -import re -import os import argparse from collections import namedtuple +import logging +import os +import re from dateutil.parser import parse diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index 6ae10c2cb07d29..29fe8bf84c12b0 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -16,8 +16,8 @@ import os import re import sys -import yaml +import yaml EXCLUDE = {"python=3"} RENAME = {"pytables": "tables", "pyqt": "pyqt5"} diff --git a/scripts/merge-pr.py b/scripts/merge-pr.py index 95352751a23c6b..300cb149f387fc 100755 --- a/scripts/merge-pr.py +++ b/scripts/merge-pr.py @@ -22,14 +22,15 @@ # usage: ./apache-pr-merge.py (see config env vars below) # # Lightly modified from version of this script in incubator-parquet-format -from subprocess import check_output -from requests.auth import HTTPBasicAuth -import requests import os +from subprocess import check_output import sys import textwrap +import requests +from requests.auth import HTTPBasicAuth + PANDAS_HOME = "." PROJECT_NAME = "pandas" print("PANDAS_HOME = " + PANDAS_HOME) diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index 35aaf10458f449..85e5bf239cbfa8 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -2,12 +2,13 @@ import random import string import textwrap -import pytest -import numpy as np -import pandas as pd +import numpy as np +import pytest import validate_docstrings +import pandas as pd + validate_one = validate_docstrings.validate_one diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index bf5d861281a36b..401eaf8ff5ed5c 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -13,20 +13,20 @@ $ ./validate_docstrings.py $ ./validate_docstrings.py pandas.DataFrame.head """ -import os -import sys -import json -import re -import glob -import functools -import collections import argparse -import pydoc -import inspect -import importlib +import ast +import collections import doctest +import functools +import glob +import importlib +import inspect +import json +import os +import pydoc +import re +import sys import tempfile -import ast import textwrap import flake8.main.application @@ -41,20 +41,20 @@ # script. Setting here before matplotlib is loaded. 
# We don't warn for the number of open plots, as none is actually being opened os.environ["MPLBACKEND"] = "Template" -import matplotlib +import matplotlib # noqa: E402 isort:skip matplotlib.rc("figure", max_open_warning=10000) -import numpy +import numpy # noqa: E402 isort:skip BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, os.path.join(BASE_PATH)) -import pandas +import pandas # noqa: E402 isort:skip sys.path.insert(1, os.path.join(BASE_PATH, "doc", "sphinxext")) -from numpydoc.docscrape import NumpyDocString -from pandas.io.formats.printing import pprint_thing +from numpydoc.docscrape import NumpyDocString # noqa: E402 isort:skip +from pandas.io.formats.printing import pprint_thing # noqa: E402 isort:skip PRIVATE_CLASSES = ["NDFrame", "IndexOpsMixin"] diff --git a/setup.cfg b/setup.cfg index 716ff5d9d8853f..43dbac15f5cfe7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -110,68 +110,25 @@ directory = coverage_html_report # To be kept consistent with "Import Formatting" section in contributing.rst [isort] -known_pre_libs=pandas._config -known_pre_core=pandas._libs,pandas.util._*,pandas.compat,pandas.errors -known_dtypes=pandas.core.dtypes -known_post_core=pandas.tseries,pandas.io,pandas.plotting -sections=FUTURE,STDLIB,THIRDPARTY,PRE_LIBS,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE,LOCALFOLDER - -known_first_party=pandas -known_third_party=Cython,numpy,dateutil,matplotlib,python-dateutil,pytz,pyarrow,pytest - -multi_line_output=3 -include_trailing_comma=True -force_grid_wrap=0 -combine_as_imports=True -line_length=88 -force_sort_within_sections=True -skip_glob=env, -skip= - pandas/__init__.py - pandas/core/api.py, - pandas/io/msgpack/__init__.py - asv_bench/benchmarks/attrs_caching.py, - asv_bench/benchmarks/binary_ops.py, - asv_bench/benchmarks/categoricals.py, - asv_bench/benchmarks/ctors.py, - asv_bench/benchmarks/eval.py, - asv_bench/benchmarks/frame_ctor.py, - asv_bench/benchmarks/frame_methods.py, - asv_bench/benchmarks/gil.py, - asv_bench/benchmarks/groupby.py, - asv_bench/benchmarks/index_object.py, - asv_bench/benchmarks/indexing.py, - asv_bench/benchmarks/inference.py, - asv_bench/benchmarks/io/csv.py, - asv_bench/benchmarks/io/excel.py, - asv_bench/benchmarks/io/hdf.py, - asv_bench/benchmarks/io/json.py, - asv_bench/benchmarks/io/msgpack.py, - asv_bench/benchmarks/io/pickle.py, - asv_bench/benchmarks/io/sql.py, - asv_bench/benchmarks/io/stata.py, - asv_bench/benchmarks/join_merge.py, - asv_bench/benchmarks/multiindex_object.py, - asv_bench/benchmarks/panel_ctor.py, - asv_bench/benchmarks/panel_methods.py, - asv_bench/benchmarks/plotting.py, - asv_bench/benchmarks/reindex.py, - asv_bench/benchmarks/replace.py, - asv_bench/benchmarks/reshape.py, - asv_bench/benchmarks/rolling.py, - asv_bench/benchmarks/series_methods.py, - asv_bench/benchmarks/sparse.py, - asv_bench/benchmarks/stat_ops.py, - asv_bench/benchmarks/timeseries.py - asv_bench/benchmarks/pandas_vb_common.py - asv_bench/benchmarks/offset.py - asv_bench/benchmarks/dtypes.py - asv_bench/benchmarks/strings.py - asv_bench/benchmarks/period.py +known_pre_libs = pandas._config +known_pre_core = pandas._libs,pandas.util._*,pandas.compat,pandas.errors +known_dtypes = pandas.core.dtypes +known_post_core = pandas.tseries,pandas.io,pandas.plotting +sections = FUTURE,STDLIB,THIRDPARTY,PRE_LIBS,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE,LOCALFOLDER +known_first_party = pandas +known_third_party = 
_pytest,announce,dateutil,docutils,flake8,git,hypothesis,jinja2,lxml,matplotlib,numpy,numpydoc,pkg_resources,pyarrow,pytest,pytz,requests,scipy,setuptools,sphinx,sqlalchemy,validate_docstrings,yaml +multi_line_output = 3 +include_trailing_comma = True +force_grid_wrap = 0 +combine_as_imports = True +line_length = 88 +force_sort_within_sections = True +skip_glob = env, +skip = pandas/__init__.py,pandas/core/api.py [mypy] ignore_missing_imports=True no_implicit_optional=True [mypy-pandas.conftest,pandas.tests.*] -ignore_errors=True \ No newline at end of file +ignore_errors=True diff --git a/setup.py b/setup.py index d2c6b18b892cda..a86527ace092b3 100755 --- a/setup.py +++ b/setup.py @@ -6,16 +6,16 @@ BSD license. Parts are from lxml (https://github.com/lxml/lxml) """ +from distutils.sysconfig import get_config_vars +from distutils.version import LooseVersion import os from os.path import join as pjoin - -import pkg_resources import platform -from distutils.sysconfig import get_config_vars -import sys import shutil -from distutils.version import LooseVersion -from setuptools import setup, Command, find_packages +import sys + +import pkg_resources +from setuptools import Command, find_packages, setup # versioning import versioneer @@ -58,8 +58,8 @@ def is_platform_mac(): # The import of Extension must be after the import of Cython, otherwise # we do not get the appropriately patched class. # See https://cython.readthedocs.io/en/latest/src/reference/compilation.html -from distutils.extension import Extension # noqa:E402 -from distutils.command.build import build # noqa:E402 +from distutils.extension import Extension # noqa: E402 isort:skip +from distutils.command.build import build # noqa: E402 isort:skip try: if not _CYTHON_INSTALLED: @@ -831,9 +831,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): ] }, entry_points={ - "pandas_plotting_backends": [ - "matplotlib = pandas:plotting._matplotlib", - ], + "pandas_plotting_backends": ["matplotlib = pandas:plotting._matplotlib"] }, **setuptools_kwargs ) From fadb27138a97eb96b619111f906b8921d2290d26 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 30 Aug 2019 18:06:49 +0100 Subject: [PATCH 43/95] REGR: tags for notebook display closes #28204 (#28216) * REGR: tags for notebook display closes #28204 --- doc/source/whatsnew/v0.25.2.rst | 1 + pandas/core/frame.py | 15 + .../html_repr_max_rows_10_min_rows_12.html | 70 +++++ .../html_repr_max_rows_10_min_rows_4.html | 46 +++ .../html_repr_max_rows_12_min_rows_None.html | 78 +++++ .../html_repr_max_rows_None_min_rows_12.html | 269 ++++++++++++++++++ ...l_repr_min_rows_default_no_truncation.html | 105 +++++++ .../html_repr_min_rows_default_truncated.html | 70 +++++ pandas/tests/io/formats/test_to_html.py | 39 +++ 9 files changed, 693 insertions(+) create mode 100644 pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_12.html create mode 100644 pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_4.html create mode 100644 pandas/tests/io/formats/data/html/html_repr_max_rows_12_min_rows_None.html create mode 100644 pandas/tests/io/formats/data/html/html_repr_max_rows_None_min_rows_12.html create mode 100644 pandas/tests/io/formats/data/html/html_repr_min_rows_default_no_truncation.html create mode 100644 pandas/tests/io/formats/data/html/html_repr_min_rows_default_truncated.html diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index 8d8a39139cf84c..1cdf213d81a74b 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ 
b/doc/source/whatsnew/v0.25.2.rst @@ -62,6 +62,7 @@ Missing I/O ^^^ +- Fix regression in notebook display where ``<th>`` tags not used for :attr:`DataFrame.index` (:issue:`28204`). - Regression in :meth:`~DataFrame.to_csv` where writing a :class:`Series` or :class:`DataFrame` indexed by an :class:`IntervalIndex` would incorrectly raise a ``TypeError`` (:issue:`28210`) - - diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3d1a39a86c784e..16fece1c7eb8ba 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -671,10 +671,25 @@ def _repr_html_(self): formatter = fmt.DataFrameFormatter( self, + columns=None, + col_space=None, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + justify=None, + index_names=True, + header=True, + index=True, + bold_rows=True, + escape=True, max_rows=max_rows, min_rows=min_rows, max_cols=max_cols, show_dimensions=show_dimensions, + decimal=".", + table_id=None, + render_links=False, ) return formatter.to_html(notebook=True) else: diff --git a/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_12.html b/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_12.html new file mode 100644 index 00000000000000..4eb3f5319749d9 --- /dev/null +++ b/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_12.html @@ -0,0 +1,70 @@ +
[new-file contents: notebook HTML repr of a 61-row, one-column DataFrame "a"; rows 0-4 and 56-60 shown around a "..." truncation row, followed by a "61 rows × 1 columns" footer; the table markup was lost in extraction]
diff --git a/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_4.html b/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_4.html new file mode 100644 index 00000000000000..2b1d97aec517c5 --- /dev/null +++ b/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_4.html @@ -0,0 +1,46 @@ +
[new-file contents: notebook HTML repr of the same 61-row DataFrame "a"; only rows 0-1 and 59-60 shown around a "..." truncation row, followed by a "61 rows × 1 columns" footer; the table markup was lost in extraction]
diff --git a/pandas/tests/io/formats/data/html/html_repr_max_rows_12_min_rows_None.html b/pandas/tests/io/formats/data/html/html_repr_max_rows_12_min_rows_None.html new file mode 100644 index 00000000000000..a539e5a4884a12 --- /dev/null +++ b/pandas/tests/io/formats/data/html/html_repr_max_rows_12_min_rows_None.html @@ -0,0 +1,78 @@ +
[new-file contents: notebook HTML repr of the 61-row DataFrame "a"; rows 0-5 and 55-60 shown around a "..." truncation row, followed by a "61 rows × 1 columns" footer; the table markup was lost in extraction]
diff --git a/pandas/tests/io/formats/data/html/html_repr_max_rows_None_min_rows_12.html b/pandas/tests/io/formats/data/html/html_repr_max_rows_None_min_rows_12.html new file mode 100644 index 00000000000000..3e680a505c6d68 --- /dev/null +++ b/pandas/tests/io/formats/data/html/html_repr_max_rows_None_min_rows_12.html @@ -0,0 +1,269 @@ +
[new-file contents: untruncated notebook HTML repr listing all 61 rows (0-60) of the one-column DataFrame "a", with no truncation row and no dimensions footer; the table markup was lost in extraction]
diff --git a/pandas/tests/io/formats/data/html/html_repr_min_rows_default_no_truncation.html b/pandas/tests/io/formats/data/html/html_repr_min_rows_default_no_truncation.html new file mode 100644 index 00000000000000..10f6247e37deff --- /dev/null +++ b/pandas/tests/io/formats/data/html/html_repr_min_rows_default_no_truncation.html @@ -0,0 +1,105 @@ +
[new-file contents: untruncated notebook HTML repr of a 20-row, one-column DataFrame "a" (rows 0-19), with no truncation row and no dimensions footer; the table markup was lost in extraction]
diff --git a/pandas/tests/io/formats/data/html/html_repr_min_rows_default_truncated.html b/pandas/tests/io/formats/data/html/html_repr_min_rows_default_truncated.html new file mode 100644 index 00000000000000..4eb3f5319749d9 --- /dev/null +++ b/pandas/tests/io/formats/data/html/html_repr_min_rows_default_truncated.html @@ -0,0 +1,70 @@ +
[new-file contents: identical to html_repr_max_rows_10_min_rows_12.html above (same blob hash 4eb3f5319749d9): rows 0-4 and 56-60 shown around a "..." truncation row, followed by the "61 rows × 1 columns" footer; the table markup was lost in extraction]
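For context, the fixtures above pair with the ``display.max_rows``/``display.min_rows`` options exercised by the tests below. The following minimal sketch (not part of the patch) shows how such output is produced; it assumes only the public options and the ``_repr_html_`` hook that the tests themselves use:

    # Minimal sketch of how the fixture HTML is produced; df and the option
    # values mirror the parametrized cases in test_to_html.py below.
    import pandas as pd

    df = pd.DataFrame({"a": range(61)})

    # max_rows=10 with min_rows=4: keep the first and last two rows around a
    # "..." marker, as in html_repr_max_rows_10_min_rows_4.html.
    with pd.option_context("display.max_rows", 10, "display.min_rows", 4):
        truncated = df._repr_html_()

    # max_rows=None: never truncate, so all 61 rows appear and no
    # "61 rows × 1 columns" footer is emitted, as in
    # html_repr_max_rows_None_min_rows_12.html.
    with pd.option_context("display.max_rows", None, "display.min_rows", 12):
        full = df._repr_html_()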
diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 448e869df950dd..52c7b89220f06b 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -713,3 +713,42 @@ def test_to_html_with_col_space_units(unit): for h in hdrs: expected = ''.format(unit=unit) assert expected in h + + +def test_html_repr_min_rows_default(datapath): + # gh-27991 + + # default setting no truncation even if above min_rows + df = pd.DataFrame({"a": range(20)}) + result = df._repr_html_() + expected = expected_html(datapath, "html_repr_min_rows_default_no_truncation") + assert result == expected + + # default of max_rows 60 triggers truncation if above + df = pd.DataFrame({"a": range(61)}) + result = df._repr_html_() + expected = expected_html(datapath, "html_repr_min_rows_default_truncated") + assert result == expected + + +@pytest.mark.parametrize( + "max_rows,min_rows,expected", + [ + # truncated after first two rows + (10, 4, "html_repr_max_rows_10_min_rows_4"), + # when set to None, follow value of max_rows + (12, None, "html_repr_max_rows_12_min_rows_None"), + # when set value higher as max_rows, use the minimum + (10, 12, "html_repr_max_rows_10_min_rows_12"), + # max_rows of None -> never truncate + (None, 12, "html_repr_max_rows_None_min_rows_12"), + ], +) +def test_html_repr_min_rows(datapath, max_rows, min_rows, expected): + # gh-27991 + + df = pd.DataFrame({"a": range(61)}) + expected = expected_html(datapath, expected) + with option_context("display.max_rows", max_rows, "display.min_rows", min_rows): + result = df._repr_html_() + assert result == expected From cad39188c64bb844d6e915a97d1b88c6b4337723 Mon Sep 17 00:00:00 2001 From: John G Evans Date: Fri, 30 Aug 2019 13:08:33 -0400 Subject: [PATCH 44/95] Fix read of py27 pytables tz attribute, gh#26443 (#28221) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/io/pytables.py | 7 ++++++- pandas/tests/io/data/legacy_hdf/gh26443.h5 | Bin 0 -> 7168 bytes pandas/tests/io/pytables/test_pytables.py | 13 +++++++++++++ 4 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 pandas/tests/io/data/legacy_hdf/gh26443.h5 diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 83beec5607986f..3b6288146bdf2e 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -97,6 +97,7 @@ Datetimelike ^^^^^^^^^^^^ - Bug in :meth:`Series.__setitem__` incorrectly casting ``np.timedelta64("NaT")`` to ``np.datetime64("NaT")`` when inserting into a :class:`Series` with datetime64 dtype (:issue:`27311`) - Bug in :meth:`Series.dt` property lookups when the underlying data is read-only (:issue:`27529`) +- Bug in ``HDFStore.__getitem__`` incorrectly reading tz attribute created in Python 2 (:issue:`26443`) - diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index fbe413f820c901..1ff3400323e54a 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2902,7 +2902,12 @@ def read_index_node(self, node, start=None, stop=None): kwargs["freq"] = node._v_attrs["freq"] if "tz" in node._v_attrs: - kwargs["tz"] = node._v_attrs["tz"] + if isinstance(node._v_attrs["tz"], bytes): + # created by python2 + kwargs["tz"] = node._v_attrs["tz"].decode("utf-8") + else: + # created by python3 + kwargs["tz"] = node._v_attrs["tz"] if kind in ("date", "datetime"): index = factory( diff --git a/pandas/tests/io/data/legacy_hdf/gh26443.h5 b/pandas/tests/io/data/legacy_hdf/gh26443.h5 new file mode 100644 index 
0000000000000000000000000000000000000000..45aa64324530f943b48fa5c63390392af1110c6b GIT binary patch literal 7168 [base85-encoded payload for gh26443.h5 omitted; the encoded data was garbled in extraction] literal 0 HcmV?d00001 diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index 7306393a1339ee..77cac00882771f 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -5447,3 +5447,16 @@ def test_read_with_where_tz_aware_index(self): store.append(key, expected, format="table", append=True) result = pd.read_hdf(path, key, where="DATE > 20151130") assert_frame_equal(result, expected) + + def test_py2_created_with_datetimez(self, datapath): + # The test HDF5 file was created in Python 2, but could not be read in + # Python 3. + # + # GH26443 + index = [pd.Timestamp("2019-01-01T18:00").tz_localize("America/New_York")] + expected = DataFrame({"data": 123}, index=index) + with ensure_clean_store( + datapath("io", "data", "legacy_hdf", "gh26443.h5"), mode="r" + ) as store: + result = store["key"] + assert_frame_equal(result, expected) From 621ad9df37911ea577029d8cac5de0920f07f33e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuzhan=20=C3=96=C4=9Freden?= Date: Fri, 30 Aug 2019 19:09:03 +0200 Subject: [PATCH 45/95] DOC: Document existing functionality of pandas.DataFrame.to_sql() #11886 (#26795) * DOC: add single dtype to NDFrame.to_sql --- pandas/core/generic.py | 15 ++++++++------- pandas/io/sql.py | 23 ++++++++++++----------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6ade69fb4ca9d9..1a5b36b07e93ca 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2594,13 +2594,14 @@ def to_sql( `index` is True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex. chunksize : int, optional - Rows will be written in batches of this size at a time. By default, - all rows will be written at once. - dtype : dict, optional - Specifying the datatype for columns. The keys should be the column - names and the values should be the SQLAlchemy types or strings for - the sqlite3 legacy mode. - method : {None, 'multi', callable}, default None + Specify the number of rows in each batch to be written at a time. + By default, all rows will be written at once. + dtype : dict or scalar, optional + Specifying the datatype for columns. If a dictionary is used, the + keys should be the column names and the values should be the + SQLAlchemy types or strings for the sqlite3 legacy mode. If a + scalar is provided, it will be applied to all columns.
+ method : {None, 'multi', callable}, optional Controls the SQL insertion clause used: * None : Uses standard SQL ``INSERT`` clause (one per row). diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 72df00fd4c5a19..44cb399336d62f 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -456,14 +456,14 @@ def to_sql( Parameters ---------- frame : DataFrame, Series - name : string + name : str Name of SQL table. con : SQLAlchemy connectable(engine/connection) or database string URI or sqlite3 DBAPI2 connection Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. - schema : string, default None + schema : str, optional Name of SQL schema in database to write to (if database flavor supports this). If None, use default schema (default). if_exists : {'fail', 'replace', 'append'}, default 'fail' @@ -472,18 +472,19 @@ def to_sql( - append: If table exists, insert data. Create if does not exist. index : boolean, default True Write DataFrame index as a column. - index_label : string or sequence, default None + index_label : str or sequence, optional Column label for index column(s). If None is given (default) and `index` is True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex. - chunksize : int, default None - If not None, then rows will be written in batches of this size at a - time. If None, all rows will be written at once. - dtype : single SQLtype or dict of column name to SQL type, default None - Optional specifying the datatype for columns. The SQL type should - be a SQLAlchemy type, or a string for sqlite3 fallback connection. - If all columns are of the same type, one single value can be used. - method : {None, 'multi', callable}, default None + chunksize : int, optional + Specify the number of rows in each batch to be written at a time. + By default, all rows will be written at once. + dtype : dict or scalar, optional + Specifying the datatype for columns. If a dictionary is used, the + keys should be the column names and the values should be the + SQLAlchemy types or strings for the sqlite3 fallback mode. If a + scalar is provided, it will be applied to all columns. + method : {None, 'multi', callable}, optional Controls the SQL insertion clause used: - None : Uses standard SQL ``INSERT`` clause (one per row). 
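To make the reworded ``dtype``/``chunksize``/``method`` behaviour concrete, a hedged usage sketch follows (illustrative only, not part of the patch; the engine URL and table name are made up for the example):

    # Illustrative sketch of the documented behaviour: a scalar dtype is
    # applied to every column, and chunksize batches the INSERT statements.
    import pandas as pd
    import sqlalchemy

    engine = sqlalchemy.create_engine("sqlite://")  # in-memory SQLite
    df = pd.DataFrame({"a": [1.5, 2.5], "b": [3.0, 4.5]})

    # One SQLAlchemy type for all columns; rows written one batch at a time.
    df.to_sql("example", engine, dtype=sqlalchemy.types.Float, chunksize=1)

    # method="multi" instead passes multiple rows per INSERT clause.
    df.to_sql("example", engine, if_exists="replace", method="multi")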
From bfdbebec423d781ebde189de24f5413298ab7c81 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 30 Aug 2019 11:43:48 -0700 Subject: [PATCH 46/95] CLN: catch less inside try/except (#28203) * CLN: catch less inside try/except --- pandas/_libs/reduction.pyx | 4 ---- pandas/core/groupby/generic.py | 17 +++++++++-------- pandas/core/groupby/groupby.py | 3 ++- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index f95685c3379696..c892c1cf1b8a3e 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -296,8 +296,6 @@ cdef class SeriesBinGrouper: islider.advance(group_size) vslider.advance(group_size) - except: - raise finally: # so we don't free the wrong memory islider.reset() @@ -425,8 +423,6 @@ cdef class SeriesGrouper: group_size = 0 - except: - raise finally: # so we don't free the wrong memory islider.reset() diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b0bcd1cc1e27c4..5e463d50d43d6d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -242,15 +242,18 @@ def aggregate(self, func, *args, **kwargs): # grouper specific aggregations if self.grouper.nkeys > 1: return self._python_agg_general(func, *args, **kwargs) + elif args or kwargs: + result = self._aggregate_generic(func, *args, **kwargs) else: # try to treat as if we are passing a list try: - assert not args and not kwargs result = self._aggregate_multiple_funcs( [func], _level=_level, _axis=self.axis ) - + except Exception: + result = self._aggregate_generic(func) + else: result.columns = Index( result.columns.levels[0], name=self._selected_obj.columns.name ) @@ -260,8 +263,6 @@ def aggregate(self, func, *args, **kwargs): # values. concat no longer converts DataFrame[Sparse] # to SparseDataFrame, so we do it here. result = SparseDataFrame(result._data) - except Exception: - result = self._aggregate_generic(func, *args, **kwargs) if not self.as_index: self._insert_inaxis_grouper_inplace(result) @@ -313,10 +314,10 @@ def _aggregate_item_by_item(self, func, *args, **kwargs): cannot_agg = [] errors = None for item in obj: - try: - data = obj[item] - colg = SeriesGroupBy(data, selection=item, grouper=self.grouper) + data = obj[item] + colg = SeriesGroupBy(data, selection=item, grouper=self.grouper) + try: cast = self._transform_should_cast(func) result[item] = colg.aggregate(func, *args, **kwargs) @@ -684,7 +685,7 @@ def _transform_item_by_item(self, obj, wrapper): return DataFrame(output, index=obj.index, columns=columns) - def filter(self, func, dropna=True, *args, **kwargs): # noqa + def filter(self, func, dropna=True, *args, **kwargs): """ Return a copy of a DataFrame excluding elements from groups that do not satisfy the boolean criterion specified by func. diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4d21b5810470a7..6deef16bdec131 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -653,7 +653,8 @@ def curried(x): # mark this column as an error try: return self._aggregate_item_by_item(name, *args, **kwargs) - except (AttributeError): + except AttributeError: + # e.g. 
SparseArray has no flags attr raise ValueError return wrapper From f8a924bcc3191ea7c82482ddf22728e629e808f3 Mon Sep 17 00:00:00 2001 From: Will Holmgren Date: Fri, 30 Aug 2019 13:54:02 -0700 Subject: [PATCH 47/95] DOC: fix DatetimeIndex.tz_localize doc string example (#28237) --- pandas/core/arrays/datetimes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 70df708d36b3bf..732f819e743a47 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1063,6 +1063,7 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise", errors=None): Be careful with DST changes. When there is sequential data, pandas can infer the DST time: + >>> s = pd.to_datetime(pd.Series(['2018-10-28 01:30:00', ... '2018-10-28 02:00:00', ... '2018-10-28 02:30:00', @@ -1094,6 +1095,7 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise", errors=None): If the DST transition causes nonexistent times, you can shift these dates forward or backwards with a timedelta object or `'shift_forward'` or `'shift_backwards'`. + >>> s = pd.to_datetime(pd.Series(['2015-03-29 02:30:00', ... '2015-03-29 03:30:00'])) >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_forward') From 42d6ee7cd1d43dfc2054ec00d82135af87c33574 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 2 Sep 2019 14:02:09 -0700 Subject: [PATCH 48/95] have Timestamp return NotImplemented (#28157) --- pandas/_libs/tslibs/c_timestamp.pyx | 11 ++--------- pandas/core/arrays/datetimelike.py | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/tslibs/c_timestamp.pyx b/pandas/_libs/tslibs/c_timestamp.pyx index 906dabba09486c..10ed2588deaca5 100644 --- a/pandas/_libs/tslibs/c_timestamp.pyx +++ b/pandas/_libs/tslibs/c_timestamp.pyx @@ -269,15 +269,8 @@ cdef class _Timestamp(datetime): return self + neg_other typ = getattr(other, '_typ', None) - - # a Timestamp-DatetimeIndex -> yields a negative TimedeltaIndex - if typ in ('datetimeindex', 'datetimearray'): - # timezone comparison is performed in DatetimeIndex._sub_datelike - return -other.__sub__(self) - - # a Timestamp-TimedeltaIndex -> yields a negative TimedeltaIndex - elif typ in ('timedeltaindex', 'timedeltaarray'): - return (-other).__add__(self) + if typ is not None: + return NotImplemented elif other is NaT: return NaT diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 1988726edc79b9..bda5f8f4326f18 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1300,7 +1300,7 @@ def __sub__(self, other): return result def __rsub__(self, other): - if is_datetime64_any_dtype(other) and is_timedelta64_dtype(self): + if is_datetime64_any_dtype(other) and is_timedelta64_dtype(self.dtype): # ndarray[datetime64] cannot be subtracted from self, so # we need to wrap in DatetimeArray/Index and flip the operation if not isinstance(other, DatetimeLikeArrayMixin): @@ -1310,9 +1310,9 @@ def __rsub__(self, other): other = DatetimeArray(other) return other - self elif ( - is_datetime64_any_dtype(self) + is_datetime64_any_dtype(self.dtype) and hasattr(other, "dtype") - and not is_datetime64_any_dtype(other) + and not is_datetime64_any_dtype(other.dtype) ): # GH#19959 datetime - datetime is well-defined as timedelta, # but any other type - datetime is not well-defined. 
@@ -1321,13 +1321,21 @@ def __rsub__(self, other): cls=type(self).__name__, typ=type(other).__name__ ) ) - elif is_period_dtype(self) and is_timedelta64_dtype(other): + elif is_period_dtype(self.dtype) and is_timedelta64_dtype(other): # TODO: Can we simplify/generalize these cases at all? raise TypeError( "cannot subtract {cls} from {dtype}".format( cls=type(self).__name__, dtype=other.dtype ) ) + elif is_timedelta64_dtype(self.dtype): + if lib.is_integer(other) or is_integer_dtype(other): + # need to subtract before negating, since that flips freq + # -self flips self.freq, messing up results + return -(self - other) + + return (-self) + other + return -(self - other) # FIXME: DTA/TDA/PA inplace methods should actually be inplace, GH#24115 From 05cc95971e56b503d4df9911a44cd60a7b74cc79 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 2 Sep 2019 14:04:11 -0700 Subject: [PATCH 49/95] BUG: SparseDataFrame op incorrectly casting to float (#28107) --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/sparse/frame.py | 6 +++--- pandas/tests/sparse/frame/test_frame.py | 16 ++++++++++++++++ 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 3b6288146bdf2e..6834435adb4780 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -189,7 +189,7 @@ Reshaping Sparse ^^^^^^ - +- Bug in :class:`SparseDataFrame` arithmetic operations incorrectly casting inputs to float (:issue:`28107`) - - diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 8fe6850c84b8b1..3d6ba0b8d97745 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -576,8 +576,8 @@ def _combine_match_index(self, other, func, level=None): this, other = self.align(other, join="outer", axis=0, level=level, copy=False) new_data = {} - for col, series in this.items(): - new_data[col] = func(series.values, other.values) + for col in this.columns: + new_data[col] = func(this[col], other) fill_value = self._get_op_result_fill_value(other, func) @@ -603,7 +603,7 @@ def _combine_match_columns(self, other, func, level=None): new_data = {} for col in left.columns: - new_data[col] = func(left[col], float(right[col])) + new_data[col] = func(left[col], right[col]) return self._constructor( new_data, diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index ddb50e0897a869..e372e2563e682c 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -1487,6 +1487,22 @@ def test_comparison_op_scalar(self): assert isinstance(res, pd.SparseDataFrame) tm.assert_frame_equal(res.to_dense(), df != 0) + def test_add_series_retains_dtype(self): + # SparseDataFrame._combine_match_columns used to incorrectly cast + # to float + d = {0: [2j, 3j], 1: [0, 1]} + sdf = SparseDataFrame(data=d, default_fill_value=1) + result = sdf + sdf[0] + + df = sdf.to_dense() + expected = df + df[0] + tm.assert_frame_equal(result.to_dense(), expected) + + # Make it explicit to be on the safe side + edata = {0: [4j, 5j], 1: [3j, 1 + 3j]} + expected = DataFrame(edata) + tm.assert_frame_equal(result.to_dense(), expected) + @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") From 2aeed3fb11434f16fae433480279dea9a495d473 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 2 Sep 2019 14:06:26 -0700 Subject: [PATCH 50/95] REF: separate bloated test (#28081) --- 
pandas/core/ops/__init__.py | 17 ++- pandas/tests/series/test_operators.py | 170 +++++++++++++++++--------- 2 files changed, 124 insertions(+), 63 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 86cd6e878cde60..dec2722275d6ea 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -791,16 +791,21 @@ def wrapper(self, other): self, other = _align_method_SERIES(self, other, align_asobject=True) res_name = get_op_result_name(self, other) + # TODO: shouldn't we be applying finalize whenever + # not isinstance(other, ABCSeries)? + finalizer = ( + lambda x: x.__finalize__(self) + if not isinstance(other, (ABCSeries, ABCIndexClass)) + else x + ) + if isinstance(other, ABCDataFrame): # Defer to DataFrame implementation; fail early return NotImplemented elif isinstance(other, (ABCSeries, ABCIndexClass)): is_other_int_dtype = is_integer_dtype(other.dtype) - other = fill_int(other) if is_other_int_dtype else fill_bool(other) - - ovalues = other.values - finalizer = lambda x: x + other = other if is_other_int_dtype else fill_bool(other) else: # scalars, list, tuple, np.array @@ -811,8 +816,8 @@ def wrapper(self, other): # thing? e.g. other = [[0, 1], [2, 3], [4, 5]]? other = construct_1d_object_array_from_listlike(other) - ovalues = other - finalizer = lambda x: x.__finalize__(self) + # TODO: use extract_array once we handle EA correctly, see GH#27959 + ovalues = lib.values_from_object(other) # For int vs int `^`, `|`, `&` are bitwise operators and return # integer dtypes. Otherwise these are boolean ops diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 062c07cb6242aa..aa44760dcd9180 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -36,22 +36,14 @@ def test_bool_operators_with_nas(self, bool_op): expected[mask] = False assert_series_equal(result, expected) - def test_operators_bitwise(self): + def test_logical_operators_bool_dtype_with_empty(self): # GH#9016: support bitwise op for integer types index = list("bca") s_tft = Series([True, False, True], index=index) s_fff = Series([False, False, False], index=index) - s_tff = Series([True, False, False], index=index) s_empty = Series([]) - # TODO: unused - # s_0101 = Series([0, 1, 0, 1]) - - s_0123 = Series(range(4), dtype="int64") - s_3333 = Series([3] * 4) - s_4444 = Series([4] * 4) - res = s_tft & s_empty expected = s_fff assert_series_equal(res, expected) @@ -60,6 +52,16 @@ def test_operators_bitwise(self): expected = s_tft assert_series_equal(res, expected) + def test_logical_operators_int_dtype_with_int_dtype(self): + # GH#9016: support bitwise op for integer types + + # TODO: unused + # s_0101 = Series([0, 1, 0, 1]) + + s_0123 = Series(range(4), dtype="int64") + s_3333 = Series([3] * 4) + s_4444 = Series([4] * 4) + res = s_0123 & s_3333 expected = Series(range(4), dtype="int64") assert_series_equal(res, expected) @@ -68,76 +70,129 @@ def test_operators_bitwise(self): expected = Series(range(4, 8), dtype="int64") assert_series_equal(res, expected) - s_a0b1c0 = Series([1], list("b")) - - res = s_tft & s_a0b1c0 - expected = s_tff.reindex(list("abc")) + s_1111 = Series([1] * 4, dtype="int8") + res = s_0123 & s_1111 + expected = Series([0, 1, 0, 1], dtype="int64") assert_series_equal(res, expected) - res = s_tft | s_a0b1c0 - expected = s_tft.reindex(list("abc")) + res = s_0123.astype(np.int16) | s_1111.astype(np.int32) + expected = Series([1, 1, 3, 3], dtype="int32") assert_series_equal(res, 
expected) - n0 = 0 - res = s_tft & n0 - expected = s_fff - assert_series_equal(res, expected) + def test_logical_operators_int_dtype_with_int_scalar(self): + # GH#9016: support bitwise op for integer types + s_0123 = Series(range(4), dtype="int64") - res = s_0123 & n0 + res = s_0123 & 0 expected = Series([0] * 4) assert_series_equal(res, expected) - n1 = 1 - res = s_tft & n1 - expected = s_tft - assert_series_equal(res, expected) - - res = s_0123 & n1 + res = s_0123 & 1 expected = Series([0, 1, 0, 1]) assert_series_equal(res, expected) - s_1111 = Series([1] * 4, dtype="int8") - res = s_0123 & s_1111 - expected = Series([0, 1, 0, 1], dtype="int64") - assert_series_equal(res, expected) - - res = s_0123.astype(np.int16) | s_1111.astype(np.int32) - expected = Series([1, 1, 3, 3], dtype="int32") - assert_series_equal(res, expected) + def test_logical_operators_int_dtype_with_float(self): + # GH#9016: support bitwise op for integer types + s_0123 = Series(range(4), dtype="int64") - with pytest.raises(TypeError): - s_1111 & "a" - with pytest.raises(TypeError): - s_1111 & ["a", "b", "c", "d"] with pytest.raises(TypeError): s_0123 & np.NaN with pytest.raises(TypeError): s_0123 & 3.14 with pytest.raises(TypeError): s_0123 & [0.1, 4, 3.14, 2] + with pytest.raises(TypeError): + s_0123 & np.array([0.1, 4, 3.14, 2]) - # s_0123 will be all false now because of reindexing like s_tft - exp = Series([False] * 7, index=[0, 1, 2, 3, "a", "b", "c"]) - assert_series_equal(s_tft & s_0123, exp) - - # s_tft will be all false now because of reindexing like s_0123 - exp = Series([False] * 7, index=[0, 1, 2, 3, "a", "b", "c"]) - assert_series_equal(s_0123 & s_tft, exp) - - assert_series_equal(s_0123 & False, Series([False] * 4)) - assert_series_equal(s_0123 ^ False, Series([False, True, True, True])) - assert_series_equal(s_0123 & [False], Series([False] * 4)) - assert_series_equal(s_0123 & (False), Series([False] * 4)) - assert_series_equal( - s_0123 & Series([False, np.NaN, False, False]), Series([False] * 4) - ) + # FIXME: this should be consistent with the list case above + expected = Series([False, True, False, True]) + result = s_0123 & Series([0.1, 4, -3.14, 2]) + assert_series_equal(result, expected) + + def test_logical_operators_int_dtype_with_str(self): + s_1111 = Series([1] * 4, dtype="int8") + + with pytest.raises(TypeError): + s_1111 & "a" + with pytest.raises(TypeError): + s_1111 & ["a", "b", "c", "d"] + + def test_logical_operators_int_dtype_with_bool(self): + # GH#9016: support bitwise op for integer types + s_0123 = Series(range(4), dtype="int64") + + expected = Series([False] * 4) + + result = s_0123 & False + assert_series_equal(result, expected) + + result = s_0123 & [False] + assert_series_equal(result, expected) + + result = s_0123 & (False,) + assert_series_equal(result, expected) - s_ftft = Series([False, True, False, True]) - assert_series_equal(s_0123 & Series([0.1, 4, -3.14, 2]), s_ftft) + result = s_0123 ^ False + expected = Series([False, True, True, True]) + assert_series_equal(result, expected) + + def test_logical_operators_int_dtype_with_object(self): + # GH#9016: support bitwise op for integer types + s_0123 = Series(range(4), dtype="int64") + + result = s_0123 & Series([False, np.NaN, False, False]) + expected = Series([False] * 4) + assert_series_equal(result, expected) s_abNd = Series(["a", "b", np.NaN, "d"]) - res = s_0123 & s_abNd - expected = s_ftft + result = s_0123 & s_abNd + expected = Series([False, True, False, True]) + assert_series_equal(result, expected) + + def 
test_logical_operators_bool_dtype_with_int(self): + index = list("bca") + + s_tft = Series([True, False, True], index=index) + s_fff = Series([False, False, False], index=index) + + res = s_tft & 0 + expected = s_fff + assert_series_equal(res, expected) + + res = s_tft & 1 + expected = s_tft + assert_series_equal(res, expected) + + def test_logical_operators_int_dtype_with_bool_dtype_and_reindex(self): + # GH#9016: support bitwise op for integer types + + # with non-matching indexes, logical operators will cast to object + # before operating + index = list("bca") + + s_tft = Series([True, False, True], index=index) + s_tft = Series([True, False, True], index=index) + s_tff = Series([True, False, False], index=index) + + s_0123 = Series(range(4), dtype="int64") + + # s_0123 will be all false now because of reindexing like s_tft + expected = Series([False] * 7, index=[0, 1, 2, 3, "a", "b", "c"]) + result = s_tft & s_0123 + assert_series_equal(result, expected) + + expected = Series([False] * 7, index=[0, 1, 2, 3, "a", "b", "c"]) + result = s_0123 & s_tft + assert_series_equal(result, expected) + + s_a0b1c0 = Series([1], list("b")) + + res = s_tft & s_a0b1c0 + expected = s_tff.reindex(list("abc")) + assert_series_equal(res, expected) + + res = s_tft | s_a0b1c0 + expected = s_tft.reindex(list("abc")) assert_series_equal(res, expected) def test_scalar_na_logical_ops_corners(self): @@ -523,6 +578,7 @@ def test_comparison_operators_with_nas(self): assert_series_equal(result, expected) + # FIXME: dont leave commented-out # fffffffuuuuuuuuuuuu # result = f(val, s) # expected = f(val, s.dropna()).reindex(s.index) From 2cd78883c77dc2d2fed10e07f245ba9d9ceb635a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 2 Sep 2019 14:09:19 -0700 Subject: [PATCH 51/95] REF: do extract_array earlier in series arith/comparison ops (#28066) --- pandas/core/ops/__init__.py | 100 +++++++++++++++++++++++------------- 1 file changed, 63 insertions(+), 37 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index dec2722275d6ea..cc2d4ced1243f1 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -5,7 +5,7 @@ """ import datetime import operator -from typing import Any, Callable, Tuple +from typing import Any, Callable, Tuple, Union import numpy as np @@ -34,10 +34,11 @@ ABCIndexClass, ABCSeries, ABCSparseSeries, + ABCTimedeltaArray, + ABCTimedeltaIndex, ) from pandas.core.dtypes.missing import isna, notna -import pandas as pd from pandas._typing import ArrayLike from pandas.core.construction import array, extract_array from pandas.core.ops.array_ops import comp_method_OBJECT_ARRAY, define_na_arithmetic_op @@ -148,6 +149,8 @@ def maybe_upcast_for_op(obj, shape: Tuple[int, ...]): Be careful to call this *after* determining the `name` attribute to be attached to the result of the arithmetic operation. 
""" + from pandas.core.arrays import TimedeltaArray + if type(obj) is datetime.timedelta: # GH#22390 cast up to Timedelta to rely on Timedelta # implementation; otherwise operation against numeric-dtype @@ -157,12 +160,10 @@ def maybe_upcast_for_op(obj, shape: Tuple[int, ...]): if isna(obj): # wrapping timedelta64("NaT") in Timedelta returns NaT, # which would incorrectly be treated as a datetime-NaT, so - # we broadcast and wrap in a Series + # we broadcast and wrap in a TimedeltaArray + obj = obj.astype("timedelta64[ns]") right = np.broadcast_to(obj, shape) - - # Note: we use Series instead of TimedeltaIndex to avoid having - # to worry about catching NullFrequencyError. - return pd.Series(right) + return TimedeltaArray(right) # In particular non-nanosecond timedelta64 needs to be cast to # nanoseconds, or else we get undesired behavior like @@ -173,7 +174,7 @@ def maybe_upcast_for_op(obj, shape: Tuple[int, ...]): # GH#22390 Unfortunately we need to special-case right-hand # timedelta64 dtypes because numpy casts integer dtypes to # timedelta64 when operating with timedelta64 - return pd.TimedeltaIndex(obj) + return TimedeltaArray._from_sequence(obj) return obj @@ -520,13 +521,34 @@ def column_op(a, b): return result -def dispatch_to_extension_op(op, left, right): +def dispatch_to_extension_op( + op, + left: Union[ABCExtensionArray, np.ndarray], + right: Any, + keep_null_freq: bool = False, +): """ Assume that left or right is a Series backed by an ExtensionArray, apply the operator defined by op. + + Parameters + ---------- + op : binary operator + left : ExtensionArray or np.ndarray + right : object + keep_null_freq : bool, default False + Whether to re-raise a NullFrequencyError unchanged, as opposed to + catching and raising TypeError. + + Returns + ------- + ExtensionArray or np.ndarray + 2-tuple of these if op is divmod or rdivmod """ + # NB: left and right should already be unboxed, so neither should be + # a Series or Index. - if left.dtype.kind in "mM": + if left.dtype.kind in "mM" and isinstance(left, np.ndarray): # We need to cast datetime64 and timedelta64 ndarrays to # DatetimeArray/TimedeltaArray. But we avoid wrapping others in # PandasArray as that behaves poorly with e.g. IntegerArray. @@ -535,15 +557,15 @@ def dispatch_to_extension_op(op, left, right): # The op calls will raise TypeError if the op is not defined # on the ExtensionArray - # unbox Series and Index to arrays - new_left = extract_array(left, extract_numpy=True) - new_right = extract_array(right, extract_numpy=True) - try: - res_values = op(new_left, new_right) + res_values = op(left, right) except NullFrequencyError: # DatetimeIndex and TimedeltaIndex with freq == None raise ValueError # on add/sub of integers (or int-like). We re-raise as a TypeError. 
+ if keep_null_freq: + # TODO: remove keep_null_freq after Timestamp+int deprecation + # GH#22535 is enforced + raise raise TypeError( "incompatible type for a datetime/timedelta " "operation [{name}]".format(name=op.__name__) @@ -615,25 +637,29 @@ def wrapper(left, right): if isinstance(right, ABCDataFrame): return NotImplemented + keep_null_freq = isinstance( + right, + (ABCDatetimeIndex, ABCDatetimeArray, ABCTimedeltaIndex, ABCTimedeltaArray), + ) + left, right = _align_method_SERIES(left, right) res_name = get_op_result_name(left, right) - right = maybe_upcast_for_op(right, left.shape) - if should_extension_dispatch(left, right): - result = dispatch_to_extension_op(op, left, right) + lvalues = extract_array(left, extract_numpy=True) + rvalues = extract_array(right, extract_numpy=True) - elif is_timedelta64_dtype(right) or isinstance( - right, (ABCDatetimeArray, ABCDatetimeIndex) - ): - # We should only get here with td64 right with non-scalar values - # for right upcast by maybe_upcast_for_op - assert not isinstance(right, (np.timedelta64, np.ndarray)) - result = op(left._values, right) + rvalues = maybe_upcast_for_op(rvalues, lvalues.shape) - else: - lvalues = extract_array(left, extract_numpy=True) - rvalues = extract_array(right, extract_numpy=True) + if should_extension_dispatch(lvalues, rvalues): + result = dispatch_to_extension_op(op, lvalues, rvalues, keep_null_freq) + + elif is_timedelta64_dtype(rvalues) or isinstance(rvalues, ABCDatetimeArray): + # We should only get here with td64 rvalues with non-scalar values + # for rvalues upcast by maybe_upcast_for_op + assert not isinstance(rvalues, (np.timedelta64, np.ndarray)) + result = dispatch_to_extension_op(op, lvalues, rvalues, keep_null_freq) + else: with np.errstate(all="ignore"): result = na_op(lvalues, rvalues) @@ -708,25 +734,25 @@ def wrapper(self, other, axis=None): if len(self) != len(other): raise ValueError("Lengths must match to compare") - if should_extension_dispatch(self, other): - res_values = dispatch_to_extension_op(op, self, other) + lvalues = extract_array(self, extract_numpy=True) + rvalues = extract_array(other, extract_numpy=True) - elif is_scalar(other) and isna(other): + if should_extension_dispatch(lvalues, rvalues): + res_values = dispatch_to_extension_op(op, lvalues, rvalues) + + elif is_scalar(rvalues) and isna(rvalues): # numpy does not like comparisons vs None if op is operator.ne: - res_values = np.ones(len(self), dtype=bool) + res_values = np.ones(len(lvalues), dtype=bool) else: - res_values = np.zeros(len(self), dtype=bool) + res_values = np.zeros(len(lvalues), dtype=bool) else: - lvalues = extract_array(self, extract_numpy=True) - rvalues = extract_array(other, extract_numpy=True) - with np.errstate(all="ignore"): res_values = na_op(lvalues, rvalues) if is_scalar(res_values): raise TypeError( - "Could not compare {typ} type with Series".format(typ=type(other)) + "Could not compare {typ} type with Series".format(typ=type(rvalues)) ) result = self._constructor(res_values, index=self.index) From 30fb087095d40230765fc544b3477700e04f0332 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 2 Sep 2019 14:17:43 -0700 Subject: [PATCH 52/95] BUG: fix+test Timestamp with int array (#28161) --- pandas/_libs/tslibs/c_timestamp.pyx | 16 ++++++ .../tests/scalar/timestamp/test_arithmetic.py | 53 +++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/pandas/_libs/tslibs/c_timestamp.pyx b/pandas/_libs/tslibs/c_timestamp.pyx index 10ed2588deaca5..41e2ae6b5b59b6 100644 --- 
a/pandas/_libs/tslibs/c_timestamp.pyx +++ b/pandas/_libs/tslibs/c_timestamp.pyx @@ -251,6 +251,14 @@ cdef class _Timestamp(datetime): result = result.normalize() return result + elif is_array(other): + if other.dtype.kind in ['i', 'u']: + maybe_integer_op_deprecated(self) + if self.freq is None: + raise ValueError("Cannot add integer-dtype array " + "to Timestamp without freq.") + return self.freq * other + self + # index/series like elif hasattr(other, '_typ'): return NotImplemented @@ -268,6 +276,14 @@ cdef class _Timestamp(datetime): neg_other = -other return self + neg_other + elif is_array(other): + if other.dtype.kind in ['i', 'u']: + maybe_integer_op_deprecated(self) + if self.freq is None: + raise ValueError("Cannot subtract integer-dtype array " + "from Timestamp without freq.") + return self - self.freq * other + typ = getattr(other, '_typ', None) if typ is not None: return NotImplemented diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index 58bd03129f2df0..2ef4fe79eeacf5 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -151,3 +151,56 @@ def test_timestamp_add_timedelta64_unit(self, other, expected_difference): result = ts + other valdiff = result.value - ts.value assert valdiff == expected_difference + + @pytest.mark.parametrize("ts", [Timestamp.now(), Timestamp.now("utc")]) + @pytest.mark.parametrize( + "other", + [ + 1, + np.int64(1), + np.array([1, 2], dtype=np.int32), + np.array([3, 4], dtype=np.uint64), + ], + ) + def test_add_int_no_freq_raises(self, ts, other): + with pytest.raises(ValueError, match="without freq"): + ts + other + with pytest.raises(ValueError, match="without freq"): + other + ts + + with pytest.raises(ValueError, match="without freq"): + ts - other + with pytest.raises(TypeError): + other - ts + + @pytest.mark.parametrize( + "ts", + [ + Timestamp("1776-07-04", freq="D"), + Timestamp("1776-07-04", tz="UTC", freq="D"), + ], + ) + @pytest.mark.parametrize( + "other", + [ + 1, + np.int64(1), + np.array([1, 2], dtype=np.int32), + np.array([3, 4], dtype=np.uint64), + ], + ) + def test_add_int_with_freq(self, ts, other): + with tm.assert_produces_warning(FutureWarning): + result1 = ts + other + with tm.assert_produces_warning(FutureWarning): + result2 = other + ts + + assert np.all(result1 == result2) + + with tm.assert_produces_warning(FutureWarning): + result = result1 - other + + assert np.all(result == ts) + + with pytest.raises(TypeError): + other - ts From 562f423fc755c2c59307053bc5afceebb068397f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 2 Sep 2019 14:23:18 -0700 Subject: [PATCH 53/95] CLN: avoid catching Exception in _choose_path (#28205) --- pandas/core/groupby/generic.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 5e463d50d43d6d..6c95b521110a98 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -649,20 +649,21 @@ def _choose_path(self, fast_path, slow_path, group): # if we make it here, test if we can use the fast path try: res_fast = fast_path(group) - - # verify fast path does not change columns (and names), otherwise - # its results cannot be joined with those of the slow path - if res_fast.columns != group.columns: - return path, res - # verify numerical equality with the slow path - if res.shape == res_fast.shape: - res_r = res.values.ravel() - 
res_fast_r = res_fast.values.ravel() - mask = notna(res_r) - if (res_r[mask] == res_fast_r[mask]).all(): - path = fast_path except Exception: - pass + # Hard to know ex-ante what exceptions `fast_path` might raise + return path, res + + # verify fast path does not change columns (and names), otherwise + # its results cannot be joined with those of the slow path + if not isinstance(res_fast, DataFrame): + return path, res + + if not res_fast.columns.equals(group.columns): + return path, res + + if res_fast.equals(res): + path = fast_path + return path, res def _transform_item_by_item(self, obj, wrapper): From 89e5f8445a7c150c133c5c4db3e852c9947e88b4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 2 Sep 2019 14:26:53 -0700 Subject: [PATCH 54/95] REF: use dispatch_to_extension_op for bool ops (#27959) --- pandas/core/ops/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index cc2d4ced1243f1..df097d7ad91dc7 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -829,6 +829,11 @@ def wrapper(self, other): # Defer to DataFrame implementation; fail early return NotImplemented + elif should_extension_dispatch(self, other): + # e.g. SparseArray + res_values = dispatch_to_extension_op(op, self, other) + return _construct_result(self, res_values, index=self.index, name=res_name) + elif isinstance(other, (ABCSeries, ABCIndexClass)): is_other_int_dtype = is_integer_dtype(other.dtype) other = other if is_other_int_dtype else fill_bool(other) From 498f3008f583407e996322409a4f9af8dec8d775 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 2 Sep 2019 14:31:29 -0700 Subject: [PATCH 55/95] TST: parametrize and de-duplicate arith tests (#27950) --- pandas/tests/arithmetic/conftest.py | 8 +- pandas/tests/arithmetic/test_datetime64.py | 64 ++++------ pandas/tests/arithmetic/test_numeric.py | 16 ++- pandas/tests/arithmetic/test_object.py | 13 +- pandas/tests/arithmetic/test_timedelta64.py | 131 ++++++++------------ 5 files changed, 96 insertions(+), 136 deletions(-) diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index f047154f2c6362..774ff14398bdb4 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -190,7 +190,12 @@ def box(request): @pytest.fixture( - params=[pd.Index, pd.Series, pytest.param(pd.DataFrame, marks=pytest.mark.xfail)], + params=[ + pd.Index, + pd.Series, + pytest.param(pd.DataFrame, marks=pytest.mark.xfail), + tm.to_array, + ], ids=id_func, ) def box_df_fail(request): @@ -206,6 +211,7 @@ def box_df_fail(request): (pd.Series, False), (pd.DataFrame, False), pytest.param((pd.DataFrame, True), marks=pytest.mark.xfail), + (tm.to_array, False), ], ids=id_func, ) diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 5931cd93cc8c5a..bc7b979d2c7d03 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -348,28 +348,6 @@ def test_dt64arr_timestamp_equality(self, box_with_array): expected = tm.box_expected([False, False], xbox) tm.assert_equal(result, expected) - @pytest.mark.parametrize( - "op", - [operator.eq, operator.ne, operator.gt, operator.ge, operator.lt, operator.le], - ) - def test_comparison_tzawareness_compat(self, op): - # GH#18162 - dr = pd.date_range("2016-01-01", periods=6) - dz = dr.tz_localize("US/Pacific") - - # Check that there isn't a problem aware-aware and naive-naive do not - # raise - 
naive_series = Series(dr) - aware_series = Series(dz) - msg = "Cannot compare tz-naive and tz-aware" - with pytest.raises(TypeError, match=msg): - op(dz, naive_series) - with pytest.raises(TypeError, match=msg): - op(dr, aware_series) - - # TODO: implement _assert_tzawareness_compat for the reverse - # comparison with the Series on the left-hand side - class TestDatetimeIndexComparisons: @@ -599,15 +577,18 @@ def test_comparison_tzawareness_compat(self, op, box_df_fail): with pytest.raises(TypeError, match=msg): op(dz, np.array(list(dr), dtype=object)) - # Check that there isn't a problem aware-aware and naive-naive do not - # raise + # The aware==aware and naive==naive comparisons should *not* raise assert_all(dr == dr) - assert_all(dz == dz) + assert_all(dr == list(dr)) + assert_all(list(dr) == dr) + assert_all(np.array(list(dr), dtype=object) == dr) + assert_all(dr == np.array(list(dr), dtype=object)) - # FIXME: DataFrame case fails to raise for == and !=, wrong - # message for inequalities - assert (dr == list(dr)).all() - assert (dz == list(dz)).all() + assert_all(dz == dz) + assert_all(dz == list(dz)) + assert_all(list(dz) == dz) + assert_all(np.array(list(dz), dtype=object) == dz) + assert_all(dz == np.array(list(dz), dtype=object)) @pytest.mark.parametrize( "op", @@ -844,6 +825,7 @@ def test_dt64arr_isub_timedeltalike_scalar( rng -= two_hours tm.assert_equal(rng, expected) + # TODO: redundant with test_dt64arr_add_timedeltalike_scalar def test_dt64arr_add_td64_scalar(self, box_with_array): # scalar timedeltas/np.timedelta64 objects # operate with np.timedelta64 correctly @@ -1709,14 +1691,12 @@ def test_operators_datetimelike(self): dt1 - dt2 dt2 - dt1 - # ## datetime64 with timetimedelta ### + # datetime64 with timetimedelta dt1 + td1 td1 + dt1 dt1 - td1 - # TODO: Decide if this ought to work. 
- # td1 - dt1 - # ## timetimedelta with datetime64 ### + # timetimedelta with datetime64 td1 + dt1 dt1 + td1 @@ -1914,7 +1894,7 @@ def test_dt64_series_add_intlike(self, tz, op): with pytest.raises(TypeError, match=msg): method(other) with pytest.raises(TypeError, match=msg): - method(other.values) + method(np.array(other)) with pytest.raises(TypeError, match=msg): method(pd.Index(other)) @@ -2380,34 +2360,34 @@ def test_ufunc_coercions(self): idx = date_range("2011-01-01", periods=3, freq="2D", name="x") delta = np.timedelta64(1, "D") + exp = date_range("2011-01-02", periods=3, freq="2D", name="x") for result in [idx + delta, np.add(idx, delta)]: assert isinstance(result, DatetimeIndex) - exp = date_range("2011-01-02", periods=3, freq="2D", name="x") tm.assert_index_equal(result, exp) assert result.freq == "2D" + exp = date_range("2010-12-31", periods=3, freq="2D", name="x") for result in [idx - delta, np.subtract(idx, delta)]: assert isinstance(result, DatetimeIndex) - exp = date_range("2010-12-31", periods=3, freq="2D", name="x") tm.assert_index_equal(result, exp) assert result.freq == "2D" delta = np.array( [np.timedelta64(1, "D"), np.timedelta64(2, "D"), np.timedelta64(3, "D")] ) + exp = DatetimeIndex( + ["2011-01-02", "2011-01-05", "2011-01-08"], freq="3D", name="x" + ) for result in [idx + delta, np.add(idx, delta)]: assert isinstance(result, DatetimeIndex) - exp = DatetimeIndex( - ["2011-01-02", "2011-01-05", "2011-01-08"], freq="3D", name="x" - ) tm.assert_index_equal(result, exp) assert result.freq == "3D" + exp = DatetimeIndex( + ["2010-12-31", "2011-01-01", "2011-01-02"], freq="D", name="x" + ) for result in [idx - delta, np.subtract(idx, delta)]: assert isinstance(result, DatetimeIndex) - exp = DatetimeIndex( - ["2010-12-31", "2011-01-01", "2011-01-02"], freq="D", name="x" - ) tm.assert_index_equal(result, exp) assert result.freq == "D" diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index d686d9f90a5a4a..8e7e72fcdc5800 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -561,9 +561,9 @@ def test_div_int(self, numeric_idx): tm.assert_index_equal(result, expected) @pytest.mark.parametrize("op", [operator.mul, ops.rmul, operator.floordiv]) - def test_mul_int_identity(self, op, numeric_idx, box): + def test_mul_int_identity(self, op, numeric_idx, box_with_array): idx = numeric_idx - idx = tm.box_expected(idx, box) + idx = tm.box_expected(idx, box_with_array) result = op(idx, 1) tm.assert_equal(result, idx) @@ -615,8 +615,9 @@ def test_mul_size_mismatch_raises(self, numeric_idx): idx * np.array([1, 2]) @pytest.mark.parametrize("op", [operator.pow, ops.rpow]) - def test_pow_float(self, op, numeric_idx, box): + def test_pow_float(self, op, numeric_idx, box_with_array): # test power calculations both ways, GH#14973 + box = box_with_array idx = numeric_idx expected = pd.Float64Index(op(idx.values, 2.0)) @@ -626,8 +627,9 @@ def test_pow_float(self, op, numeric_idx, box): result = op(idx, 2.0) tm.assert_equal(result, expected) - def test_modulo(self, numeric_idx, box): + def test_modulo(self, numeric_idx, box_with_array): # GH#9244 + box = box_with_array idx = numeric_idx expected = Index(idx.values % 2) @@ -1041,7 +1043,8 @@ class TestObjectDtypeEquivalence: # Tests that arithmetic operations match operations executed elementwise @pytest.mark.parametrize("dtype", [None, object]) - def test_numarr_with_dtype_add_nan(self, dtype, box): + def test_numarr_with_dtype_add_nan(self, dtype, 
box_with_array): + box = box_with_array ser = pd.Series([1, 2, 3], dtype=dtype) expected = pd.Series([np.nan, np.nan, np.nan], dtype=dtype) @@ -1055,7 +1058,8 @@ def test_numarr_with_dtype_add_nan(self, dtype, box): tm.assert_equal(result, expected) @pytest.mark.parametrize("dtype", [None, object]) - def test_numarr_with_dtype_add_int(self, dtype, box): + def test_numarr_with_dtype_add_int(self, dtype, box_with_array): + box = box_with_array ser = pd.Series([1, 2, 3], dtype=dtype) expected = pd.Series([2, 3, 4], dtype=dtype) diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index fd9db806713603..f9c1de115b3a4f 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -89,7 +89,7 @@ def test_pow_ops_object(self): @pytest.mark.parametrize("op", [operator.add, ops.radd]) @pytest.mark.parametrize("other", ["category", "Int64"]) - def test_add_extension_scalar(self, other, box, op): + def test_add_extension_scalar(self, other, box_with_array, op): # GH#22378 # Check that scalars satisfying is_extension_array_dtype(obj) # do not incorrectly try to dispatch to an ExtensionArray operation @@ -97,8 +97,8 @@ def test_add_extension_scalar(self, other, box, op): arr = pd.Series(["a", "b", "c"]) expected = pd.Series([op(x, other) for x in arr]) - arr = tm.box_expected(arr, box) - expected = tm.box_expected(expected, box) + arr = tm.box_expected(arr, box_with_array) + expected = tm.box_expected(expected, box_with_array) result = op(arr, other) tm.assert_equal(result, expected) @@ -133,16 +133,17 @@ def test_objarr_radd_str(self, box): ], ) @pytest.mark.parametrize("dtype", [None, object]) - def test_objarr_radd_str_invalid(self, dtype, data, box): + def test_objarr_radd_str_invalid(self, dtype, data, box_with_array): ser = Series(data, dtype=dtype) - ser = tm.box_expected(ser, box) + ser = tm.box_expected(ser, box_with_array) with pytest.raises(TypeError): "foo_" + ser @pytest.mark.parametrize("op", [operator.add, ops.radd, operator.sub, ops.rsub]) - def test_objarr_add_invalid(self, op, box): + def test_objarr_add_invalid(self, op, box_with_array): # invalid ops + box = box_with_array obj_ser = tm.makeObjectSeries() obj_ser.name = "objects" diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 6d6b85a1e81e1c..ee27ce97f269e9 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -968,71 +968,37 @@ def test_td64arr_add_datetime64_nat(self, box_with_array): # ------------------------------------------------------------------ # Operations with int-like others - def test_td64arr_add_int_series_invalid(self, box): - tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") - tdser = tm.box_expected(tdser, box) - err = TypeError if box is not pd.Index else NullFrequencyError - int_ser = Series([2, 3, 4]) - - with pytest.raises(err): - tdser + int_ser - with pytest.raises(err): - int_ser + tdser - with pytest.raises(err): - tdser - int_ser - with pytest.raises(err): - int_ser - tdser - - def test_td64arr_add_intlike(self, box_with_array): - # GH#19123 - tdi = TimedeltaIndex(["59 days", "59 days", "NaT"]) - ser = tm.box_expected(tdi, box_with_array) - - err = TypeError - if box_with_array in [pd.Index, tm.to_array]: - err = NullFrequencyError - - other = Series([20, 30, 40], dtype="uint8") - - # TODO: separate/parametrize - with pytest.raises(err): - ser + 1 - with pytest.raises(err): - ser - 1 - - with 
pytest.raises(err): - ser + other - with pytest.raises(err): - ser - other - - with pytest.raises(err): - ser + np.array(other) - with pytest.raises(err): - ser - np.array(other) - - with pytest.raises(err): - ser + pd.Index(other) - with pytest.raises(err): - ser - pd.Index(other) - - @pytest.mark.parametrize("scalar", [1, 1.5, np.array(2)]) - def test_td64arr_add_sub_numeric_scalar_invalid(self, box_with_array, scalar): + @pytest.mark.parametrize( + "other", + [ + # GH#19123 + 1, + Series([20, 30, 40], dtype="uint8"), + np.array([20, 30, 40], dtype="uint8"), + pd.UInt64Index([20, 30, 40]), + pd.Int64Index([20, 30, 40]), + Series([2, 3, 4]), + 1.5, + np.array(2), + ], + ) + def test_td64arr_addsub_numeric_invalid(self, box_with_array, other): box = box_with_array - tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") tdser = tm.box_expected(tdser, box) + err = TypeError - if box in [pd.Index, tm.to_array] and not isinstance(scalar, float): + if box in [pd.Index, tm.to_array] and not isinstance(other, float): err = NullFrequencyError with pytest.raises(err): - tdser + scalar + tdser + other with pytest.raises(err): - scalar + tdser + other + tdser with pytest.raises(err): - tdser - scalar + tdser - other with pytest.raises(err): - scalar - tdser + other - tdser @pytest.mark.parametrize( "dtype", @@ -1059,11 +1025,12 @@ def test_td64arr_add_sub_numeric_scalar_invalid(self, box_with_array, scalar): ], ids=lambda x: type(x).__name__, ) - def test_td64arr_add_sub_numeric_arr_invalid(self, box, vec, dtype): + def test_td64arr_add_sub_numeric_arr_invalid(self, box_with_array, vec, dtype): + box = box_with_array tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") tdser = tm.box_expected(tdser, box) err = TypeError - if box is pd.Index and not dtype.startswith("float"): + if box in [pd.Index, tm.to_array] and not dtype.startswith("float"): err = NullFrequencyError vector = vec.astype(dtype) @@ -1080,14 +1047,6 @@ def test_td64arr_add_sub_numeric_arr_invalid(self, box, vec, dtype): # Operations with timedelta-like others # TODO: this was taken from tests.series.test_ops; de-duplicate - @pytest.mark.parametrize( - "scalar_td", - [ - timedelta(minutes=5, seconds=4), - Timedelta(minutes=5, seconds=4), - Timedelta("5m4s").to_timedelta64(), - ], - ) def test_operators_timedelta64_with_timedelta(self, scalar_td): # smoke tests td1 = Series([timedelta(minutes=5, seconds=3)] * 3) @@ -1141,7 +1100,8 @@ def test_timedelta64_operations_with_timedeltas(self): # roundtrip tm.assert_series_equal(result + td2, td1) - def test_td64arr_add_td64_array(self, box): + def test_td64arr_add_td64_array(self, box_with_array): + box = box_with_array dti = pd.date_range("2016-01-01", periods=3) tdi = dti - dti.shift(1) tdarr = tdi.values @@ -1155,7 +1115,8 @@ def test_td64arr_add_td64_array(self, box): result = tdarr + tdi tm.assert_equal(result, expected) - def test_td64arr_sub_td64_array(self, box): + def test_td64arr_sub_td64_array(self, box_with_array): + box = box_with_array dti = pd.date_range("2016-01-01", periods=3) tdi = dti - dti.shift(1) tdarr = tdi.values @@ -1229,8 +1190,9 @@ def test_td64arr_add_sub_tdi(self, box, names): else: assert result.dtypes[0] == "timedelta64[ns]" - def test_td64arr_add_sub_td64_nat(self, box): + def test_td64arr_add_sub_td64_nat(self, box_with_array): # GH#23320 special handling for timedelta64("NaT") + box = box_with_array tdi = pd.TimedeltaIndex([NaT, Timedelta("1s")]) other = np.timedelta64("NaT") expected = pd.TimedeltaIndex(["NaT"] * 2) @@ -1247,8 +1209,9 
@@ def test_td64arr_add_sub_td64_nat(self, box): result = other - obj tm.assert_equal(result, expected) - def test_td64arr_sub_NaT(self, box): + def test_td64arr_sub_NaT(self, box_with_array): # GH#18808 + box = box_with_array ser = Series([NaT, Timedelta("1s")]) expected = Series([NaT, NaT], dtype="timedelta64[ns]") @@ -1258,8 +1221,9 @@ def test_td64arr_sub_NaT(self, box): res = ser - pd.NaT tm.assert_equal(res, expected) - def test_td64arr_add_timedeltalike(self, two_hours, box): + def test_td64arr_add_timedeltalike(self, two_hours, box_with_array): # only test adding/sub offsets as + is now numeric + box = box_with_array rng = timedelta_range("1 days", "10 days") expected = timedelta_range("1 days 02:00:00", "10 days 02:00:00", freq="D") rng = tm.box_expected(rng, box) @@ -1268,8 +1232,9 @@ def test_td64arr_add_timedeltalike(self, two_hours, box): result = rng + two_hours tm.assert_equal(result, expected) - def test_td64arr_sub_timedeltalike(self, two_hours, box): + def test_td64arr_sub_timedeltalike(self, two_hours, box_with_array): # only test adding/sub offsets as - is now numeric + box = box_with_array rng = timedelta_range("1 days", "10 days") expected = timedelta_range("0 days 22:00:00", "9 days 22:00:00") @@ -1352,8 +1317,9 @@ def test_td64arr_add_offset_index(self, names, box): # TODO: combine with test_td64arr_add_offset_index by parametrizing # over second box? - def test_td64arr_add_offset_array(self, box): + def test_td64arr_add_offset_array(self, box_with_array): # GH#18849 + box = box_with_array tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"]) other = np.array([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)]) @@ -1433,13 +1399,12 @@ def test_td64arr_with_offset_series(self, names, box_df_fail): # GH#18849 box = box_df_fail box2 = Series if box in [pd.Index, tm.to_array] else box + exname = names[2] if box is not tm.to_array else names[1] tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"], name=names[0]) other = Series([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], name=names[1]) - expected_add = Series( - [tdi[n] + other[n] for n in range(len(tdi))], name=names[2] - ) + expected_add = Series([tdi[n] + other[n] for n in range(len(tdi))], name=exname) tdi = tm.box_expected(tdi, box) expected_add = tm.box_expected(expected_add, box2) @@ -1452,9 +1417,7 @@ def test_td64arr_with_offset_series(self, names, box_df_fail): tm.assert_equal(res2, expected_add) # TODO: separate/parametrize add/sub test? 
- expected_sub = Series( - [tdi[n] - other[n] for n in range(len(tdi))], name=names[2] - ) + expected_sub = Series([tdi[n] - other[n] for n in range(len(tdi))], name=exname) expected_sub = tm.box_expected(expected_sub, box2) with tm.assert_produces_warning(PerformanceWarning): @@ -2055,6 +2018,8 @@ def test_td64arr_div_numeric_array(self, box_with_array, vector, dtype): def test_td64arr_mul_int_series(self, box_df_fail, names): # GH#19042 test for correct name attachment box = box_df_fail # broadcasts along wrong axis, but doesn't raise + exname = names[2] if box is not tm.to_array else names[1] + tdi = TimedeltaIndex( ["0days", "1day", "2days", "3days", "4days"], name=names[0] ) @@ -2064,11 +2029,11 @@ def test_td64arr_mul_int_series(self, box_df_fail, names): expected = Series( ["0days", "1day", "4days", "9days", "16days"], dtype="timedelta64[ns]", - name=names[2], + name=exname, ) tdi = tm.box_expected(tdi, box) - box = Series if (box is pd.Index and type(ser) is Series) else box + box = Series if (box is pd.Index or box is tm.to_array) else box expected = tm.box_expected(expected, box) result = ser * tdi @@ -2119,7 +2084,11 @@ def test_float_series_rdiv_td64arr(self, box_with_array, names): tm.assert_equal(result, expected) -class TestTimedeltaArraylikeInvalidArithmeticOps: +class TestTimedelta64ArrayLikeArithmetic: + # Arithmetic tests for timedelta64[ns] vectors fully parametrized over + # DataFrame/Series/TimedeltaIndex/TimedeltaArray. Ideally all arithmetic + # tests will eventually end up here. + def test_td64arr_pow_invalid(self, scalar_td, box_with_array): td1 = Series([timedelta(minutes=5, seconds=3)] * 3) td1.iloc[2] = np.nan From 15eb9cad864b6794c4f7e7c08c2933a0a1169859 Mon Sep 17 00:00:00 2001 From: Sergei Ivko Date: Tue, 3 Sep 2019 02:52:10 +0300 Subject: [PATCH 56/95] ENH: Enable read_csv interpret 'Infinity' as floating point value #10065 (#28181) --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/_libs/parsers.pyx | 18 ++++++++++++++---- pandas/_libs/src/parse_helper.h | 19 ++++++++++++++++++- pandas/tests/io/parser/test_common.py | 17 +++++++++++++++++ 4 files changed, 50 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 6834435adb4780..cd0714838a3f15 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -162,7 +162,7 @@ I/O - :meth:`read_csv` now accepts binary mode file buffers when using the Python csv engine (:issue:`23779`) - Bug in :meth:`DataFrame.to_json` where using a Tuple as a column or index value and using ``orient="columns"`` or ``orient="index"`` would produce invalid JSON (:issue:`20500`) -- +- Improve infinity parsing. 
:meth:`read_csv` now interprets ``Infinity``, ``+Infinity``, ``-Infinity`` as floating point values (:issue:`10065`) Plotting ^^^^^^^^ diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 6cc9dd22ce7c92..62a3568932def4 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1693,6 +1693,10 @@ cdef: char* cposinf = b'+inf' char* cneginf = b'-inf' + char* cinfty = b'Infinity' + char* cposinfty = b'+Infinity' + char* cneginfty = b'-Infinity' + cdef _try_double(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, @@ -1772,9 +1776,12 @@ cdef inline int _try_double_nogil(parser_t *parser, if error != 0 or p_end == word or p_end[0]: error = 0 if (strcasecmp(word, cinf) == 0 or - strcasecmp(word, cposinf) == 0): + strcasecmp(word, cposinf) == 0 or + strcasecmp(word, cinfty) == 0 or + strcasecmp(word, cposinfty) == 0): data[0] = INF - elif strcasecmp(word, cneginf) == 0: + elif (strcasecmp(word, cneginf) == 0 or + strcasecmp(word, cneginfty) == 0 ): data[0] = NEGINF else: return 1 @@ -1793,9 +1800,12 @@ cdef inline int _try_double_nogil(parser_t *parser, if error != 0 or p_end == word or p_end[0]: error = 0 if (strcasecmp(word, cinf) == 0 or - strcasecmp(word, cposinf) == 0): + strcasecmp(word, cposinf) == 0 or + strcasecmp(word, cinfty) == 0 or + strcasecmp(word, cposinfty) == 0): data[0] = INF - elif strcasecmp(word, cneginf) == 0: + elif (strcasecmp(word, cneginf) == 0 or + strcasecmp(word, cneginfty) == 0): data[0] = NEGINF else: return 1 diff --git a/pandas/_libs/src/parse_helper.h b/pandas/_libs/src/parse_helper.h index 1db1878a8a773f..1db4c813bb4930 100644 --- a/pandas/_libs/src/parse_helper.h +++ b/pandas/_libs/src/parse_helper.h @@ -50,7 +50,7 @@ int floatify(PyObject *str, double *result, int *maybe_int) { status = to_double(data, result, sci, dec, maybe_int); if (!status) { - /* handle inf/-inf */ + /* handle inf/-inf infinity/-infinity */ if (strlen(data) == 3) { if (0 == strcasecmp(data, "inf")) { *result = HUGE_VAL; @@ -68,6 +68,23 @@ int floatify(PyObject *str, double *result, int *maybe_int) { } else { goto parsingerror; } + } else if (strlen(data) == 8) { + if (0 == strcasecmp(data, "infinity")) { + *result = HUGE_VAL; + *maybe_int = 0; + } else { + goto parsingerror; + } + } else if (strlen(data) == 9) { + if (0 == strcasecmp(data, "-infinity")) { + *result = -HUGE_VAL; + *maybe_int = 0; + } else if (0 == strcasecmp(data, "+infinity")) { + *result = HUGE_VAL; + *maybe_int = 0; + } else { + goto parsingerror; + } } else { goto parsingerror; } diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index e04535df56663c..0586593c87cc54 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1865,6 +1865,23 @@ def test_inf_parsing(all_parsers, na_filter): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("na_filter", [True, False]) +def test_infinity_parsing(all_parsers, na_filter): + parser = all_parsers + data = """\ +,A +a,Infinity +b,-Infinity +c,+Infinity +""" + expected = DataFrame( + {"A": [float("infinity"), float("-infinity"), float("+infinity")]}, + index=["a", "b", "c"], + ) + result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) def test_raise_on_no_columns(all_parsers, nrows): parser = all_parsers From 91e5b85aeaa4e06c18ec1c8a59e3fce3f2545f10 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 2 Sep 2019 
17:07:26 -0700 Subject: [PATCH 57/95] Revert #27959 (#28258) --- pandas/core/ops/__init__.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index df097d7ad91dc7..cc2d4ced1243f1 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -829,11 +829,6 @@ def wrapper(self, other): # Defer to DataFrame implementation; fail early return NotImplemented - elif should_extension_dispatch(self, other): - # e.g. SparseArray - res_values = dispatch_to_extension_op(op, self, other) - return _construct_result(self, res_values, index=self.index, name=res_name) - elif isinstance(other, (ABCSeries, ABCIndexClass)): is_other_int_dtype = is_integer_dtype(other.dtype) other = other if is_other_int_dtype else fill_bool(other) From ae93c2302dd687ea8f5bcfdb6e9591fb9bfdb19a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 2 Sep 2019 22:04:45 -0700 Subject: [PATCH 58/95] Add peakmem benchmarks for rolling (#28255) --- asv_bench/benchmarks/rolling.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 3640513d31be26..b42fa553b495ce 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -21,6 +21,9 @@ def setup(self, constructor, window, dtype, method): def time_rolling(self, constructor, window, dtype, method): getattr(self.roll, method)() + def peakmem_rolling(self, constructor, window, dtype, method): + getattr(self.roll, method)() + class ExpandingMethods: From 9cb5de04bc61f23047eb7f34bef2bb14ef58da8e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 3 Sep 2019 04:39:20 -0700 Subject: [PATCH 59/95] CLN: Catch more specific exceptions in groupby (#27909) * catch stricter --- pandas/_libs/index.pyx | 1 + pandas/_libs/index_class_helper.pxi.in | 11 ++++++++++- pandas/core/groupby/generic.py | 2 +- pandas/core/groupby/groupby.py | 3 +-- pandas/core/groupby/grouper.py | 6 ++++-- pandas/core/groupby/ops.py | 4 ++-- 6 files changed, 19 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 7424c4ddc3d924..979dad6db0838f 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -1,4 +1,5 @@ from datetime import datetime, timedelta, date +import warnings import cython diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index 3c9a096e7ecc0c..4db048eeb03831 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -60,7 +60,16 @@ cdef class {{name}}Engine(IndexEngine): # A view is needed for some subclasses, such as PeriodEngine: values = self._get_index_values().view('{{dtype}}') - indexer = values == val + try: + with warnings.catch_warnings(): + # e.g. 
if values is float64 and `val` is a str, suppress warning + warnings.filterwarnings("ignore", category=FutureWarning) + indexer = values == val + except TypeError: + # if the equality above returns a bool, cython will raise TypeError + # when trying to cast it to ndarray + raise KeyError(val) + found = np.where(indexer)[0] count = len(found) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 6c95b521110a98..c0436e93890782 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -349,7 +349,7 @@ def _decide_output_index(self, output, labels): output_keys = sorted(output) try: output_keys.sort() - except Exception: # pragma: no cover + except TypeError: pass if isinstance(labels, MultiIndex): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6deef16bdec131..55def024cb1d46 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -727,8 +727,7 @@ def f(g): with option_context("mode.chained_assignment", None): try: result = self._python_apply_general(f) - except Exception: - + except TypeError: # gh-20949 # try again, with .apply acting as a filtering # operation, by excluding the grouping column diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 31623171e9e631..d079a1c4ef4f7b 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -592,9 +592,11 @@ def is_in_axis(key): # if the grouper is obj[name] def is_in_obj(gpr): + if not hasattr(gpr, "name"): + return False try: - return id(gpr) == id(obj[gpr.name]) - except Exception: + return gpr is obj[gpr.name] + except (KeyError, IndexError): return False for i, (gpr, level) in enumerate(zip(keys, levels)): diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7afb0a28f943ee..6263973fb0d2fe 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -212,8 +212,8 @@ def apply(self, f, data, axis=0): # This Exception is also raised if `f` triggers an exception # but it is preferable to raise the exception in Python. pass - except Exception: - # raise this error to the caller + except TypeError: + # occurs if we have any EAs pass for key, (i, group) in zip(group_keys, splitter): From 45668500c0b48bd4b534b57f84e7cfc374b9da80 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Tue, 3 Sep 2019 12:47:35 +0100 Subject: [PATCH 60/95] DOC: Add missing public plotting functions to the docs (#28179) * DOC: Add missing public plotting functions to the docs --- doc/source/reference/plotting.rst | 4 ++++ pandas/plotting/_misc.py | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/source/reference/plotting.rst b/doc/source/reference/plotting.rst index 7615e1d20f5e27..95657dfa5fde5b 100644 --- a/doc/source/reference/plotting.rst +++ b/doc/source/reference/plotting.rst @@ -13,10 +13,14 @@ The following functions are contained in the `pandas.plotting` module. 
:toctree: api/ andrews_curves + autocorrelation_plot bootstrap_plot + boxplot deregister_matplotlib_converters lag_plot parallel_coordinates + plot_params radviz register_matplotlib_converters scatter_matrix + table diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 7ed0ffc6d0115e..a8e86d9dfa997d 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -417,8 +417,8 @@ def autocorrelation_plot(series, ax=None, **kwds): Parameters ---------- - series: Time series - ax: Matplotlib axis object, optional + series : Time series + ax : Matplotlib axis object, optional kwds : keywords Options to pass to matplotlib plotting method From afe0cc360950302be41b4c7e8fd3c5272b537297 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fuzhan=20=C3=96=C4=9Freden?= Date: Tue, 3 Sep 2019 20:36:06 +0200 Subject: [PATCH 61/95] DOC: Add docstring to the insertion method & add empty result note (#26872) * Add docstring to the insertion method & fix #21364 Credit for empty result documentation goes to MagnetarAlex --- doc/source/user_guide/io.rst | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 338c890ce317c5..f6b0c55d39f65d 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5047,6 +5047,17 @@ Example of a callable using PostgreSQL `COPY clause from io import StringIO def psql_insert_copy(table, conn, keys, data_iter): + """ + Execute SQL statement inserting data + + Parameters + ---------- + table : pandas.io.sql.SQLTable + conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection + keys : list of str + Column names + data_iter : Iterable that iterates the values to be inserted + """ # gets a DBAPI connection that can provide a cursor dbapi_conn = conn.connection with dbapi_conn.cursor() as cur: @@ -5080,6 +5091,18 @@ table name and optionally a subset of columns to read. pd.read_sql_table('data', engine) +.. note:: + + Note that pandas infers column dtypes from query outputs, and not by looking + up data types in the physical database schema. For example, assume ``userid`` + is an integer column in a table. Then, intuitively, ``select userid ...`` will + return integer-valued series, while ``select cast(userid as text) ...`` will + return object-valued (str) series. Accordingly, if the query output is empty, + then all resulting columns will be returned as object-valued (since they are + most general). If you foresee that your query will sometimes generate an empty + result, you may want to explicitly typecast afterwards to ensure dtype + integrity. + You can also specify the name of the column as the ``DataFrame`` index, and specify a subset of columns to be read. From 9777e8402cf353ce9c33375e1ed885202264a34d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 3 Sep 2019 11:53:52 -0700 Subject: [PATCH 62/95] REF: use dispatch_to_extension_op for bool ops (#28260) re-implement #27959, which was previously merged and reverted. 
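With this in place, Series logical ops check ``should_extension_dispatch``
first: both operands are unboxed with ``extract_array`` and, when either side
is backed by an ExtensionArray, the op is handed to the array's own
implementation instead of the bool/object fallback paths. Condensed from the
diff below:

    elif should_extension_dispatch(self, other):
        lvalues = extract_array(self, extract_numpy=True)
        rvalues = extract_array(other, extract_numpy=True)
        res_values = dispatch_to_extension_op(op, lvalues, rvalues)

So an op like ``&`` between two Sparse[bool] Series should, for example, stay
within the sparse code path rather than densifying first (assuming the
extension array implements the operator, as the ``# e.g. SparseArray`` comment
in the first version of this patch suggests).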
---
 pandas/core/ops/__init__.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py
index cc2d4ced1243f1..9fd6efe32de291 100644
--- a/pandas/core/ops/__init__.py
+++ b/pandas/core/ops/__init__.py
@@ -829,6 +829,13 @@ def wrapper(self, other):
             # Defer to DataFrame implementation; fail early
             return NotImplemented
 
+        elif should_extension_dispatch(self, other):
+            lvalues = extract_array(self, extract_numpy=True)
+            rvalues = extract_array(other, extract_numpy=True)
+            res_values = dispatch_to_extension_op(op, lvalues, rvalues)
+            result = self._constructor(res_values, index=self.index, name=res_name)
+            return finalizer(result)
+
         elif isinstance(other, (ABCSeries, ABCIndexClass)):
             is_other_int_dtype = is_integer_dtype(other.dtype)
             other = other if is_other_int_dtype else fill_bool(other)

From efa177d4eedf03ce0dd33063b09b7bd5580c5a98 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Tue, 3 Sep 2019 14:01:08 -0500
Subject: [PATCH 63/95] VIS: Validate plot backend when setting. (#28164)

* Validate plot backend when setting.

Closes https://github.com/pandas-dev/pandas/issues/28163
---
 doc/source/whatsnew/v1.0.0.rst        |  1 +
 pandas/core/config_init.py            | 29 +++----------
 pandas/plotting/_core.py              | 24 +++++++---
 pandas/tests/plotting/test_backend.py | 63 +++++++++++++--------------
 pandas/tests/plotting/test_misc.py    |  2 +-
 5 files changed, 58 insertions(+), 61 deletions(-)

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index cd0714838a3f15..91e8c9efba693d 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -172,6 +172,7 @@ Plotting
 - Bug in :meth:`DataFrame.plot` producing incorrect legend markers when plotting multiple series on the same axis (:issue:`18222`)
 - Bug in :meth:`DataFrame.plot` when ``kind='box'`` and data contains datetime or timedelta data. These types are now automatically dropped (:issue:`22799`)
 - Bug in :meth:`DataFrame.plot.line` and :meth:`DataFrame.plot.area` produce wrong xlim in x-axis (:issue:`27686`, :issue:`25160`, :issue:`24784`)
+- :func:`set_option` now validates that the plot backend provided to ``'plotting.backend'`` implements the backend when the option is set, rather than when a plot is created (:issue:`28163`)
 
 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index 08dce6aca6e6d1..dfc80140433f8e 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -9,8 +9,6 @@
 module is imported, register them here rather then in the module.
 """
 
-import importlib
-
 import pandas._config.config as cf
 from pandas._config.config import (
     is_bool,
@@ -581,26 +579,12 @@ def use_inf_as_na_cb(key):
 
 
 def register_plotting_backend_cb(key):
-    backend_str = cf.get_option(key)
-    if backend_str == "matplotlib":
-        try:
-            import pandas.plotting._matplotlib  # noqa
-        except ImportError:
-            raise ImportError(
-                "matplotlib is required for plotting when the "
-                'default backend "matplotlib" is selected.'
-            )
-        else:
-            return
+    if key == "matplotlib":
+        # We defer matplotlib validation, since it's the default
+        return
+    from pandas.plotting._core import _get_plot_backend
 
-    try:
-        importlib.import_module(backend_str)
-    except ImportError:
-        raise ValueError(
-            '"{}" does not seem to be an installed module. 
' - "A pandas plotting backend must be a module that " - "can be imported".format(backend_str) - ) + _get_plot_backend(key) with cf.config_prefix("plotting"): @@ -608,8 +592,7 @@ def register_plotting_backend_cb(key): "backend", defval="matplotlib", doc=plotting_backend_doc, - validator=str, - cb=register_plotting_backend_cb, + validator=register_plotting_backend_cb, ) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 2e6a401b49efc4..d3c9e8ccfa51ca 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1576,10 +1576,18 @@ def _find_backend(backend: str): # We re-raise later on. pass else: - _backends[backend] = module - return module - - raise ValueError("No backend {}".format(backend)) + if hasattr(module, "plot"): + # Validate that the interface is implemented when the option + # is set, rather than at plot time. + _backends[backend] = module + return module + + msg = ( + "Could not find plotting backend '{name}'. Ensure that you've installed the " + "package providing the '{name}' entrypoint, or that the package has a" + "top-level `.plot` method." + ) + raise ValueError(msg.format(name=backend)) def _get_plot_backend(backend=None): @@ -1600,7 +1608,13 @@ def _get_plot_backend(backend=None): if backend == "matplotlib": # Because matplotlib is an optional dependency and first-party backend, # we need to attempt an import here to raise an ImportError if needed. - import pandas.plotting._matplotlib as module + try: + import pandas.plotting._matplotlib as module + except ImportError: + raise ImportError( + "matplotlib is required for plotting when the " + 'default backend "matplotlib" is selected.' + ) from None _backends["matplotlib"] = module diff --git a/pandas/tests/plotting/test_backend.py b/pandas/tests/plotting/test_backend.py index d126407cfd823e..6511d94aa4c094 100644 --- a/pandas/tests/plotting/test_backend.py +++ b/pandas/tests/plotting/test_backend.py @@ -8,44 +8,38 @@ import pandas +dummy_backend = types.ModuleType("pandas_dummy_backend") +dummy_backend.plot = lambda *args, **kwargs: None -def test_matplotlib_backend_error(): - msg = ( - "matplotlib is required for plotting when the default backend " - '"matplotlib" is selected.' - ) - try: - import matplotlib # noqa - except ImportError: - with pytest.raises(ImportError, match=msg): - pandas.set_option("plotting.backend", "matplotlib") + +@pytest.fixture +def restore_backend(): + """Restore the plotting backend to matplotlib""" + pandas.set_option("plotting.backend", "matplotlib") + yield + pandas.set_option("plotting.backend", "matplotlib") def test_backend_is_not_module(): - msg = ( - '"not_an_existing_module" does not seem to be an installed module. ' - "A pandas plotting backend must be a module that can be imported" - ) + msg = "Could not find plotting backend 'not_an_existing_module'." 
with pytest.raises(ValueError, match=msg): pandas.set_option("plotting.backend", "not_an_existing_module") + assert pandas.options.plotting.backend == "matplotlib" -def test_backend_is_correct(monkeypatch): - monkeypatch.setattr( - "pandas.core.config_init.importlib.import_module", lambda name: None - ) - pandas.set_option("plotting.backend", "correct_backend") - assert pandas.get_option("plotting.backend") == "correct_backend" - # Restore backend for other tests (matplotlib can be not installed) - try: - pandas.set_option("plotting.backend", "matplotlib") - except ImportError: - pass +def test_backend_is_correct(monkeypatch, restore_backend): + monkeypatch.setitem(sys.modules, "pandas_dummy_backend", dummy_backend) + + pandas.set_option("plotting.backend", "pandas_dummy_backend") + assert pandas.get_option("plotting.backend") == "pandas_dummy_backend" + assert ( + pandas.plotting._core._get_plot_backend("pandas_dummy_backend") is dummy_backend + ) @td.skip_if_no_mpl -def test_register_entrypoint(): +def test_register_entrypoint(restore_backend): dist = pkg_resources.get_distribution("pandas") if dist.module_path not in pandas.__file__: @@ -74,13 +68,18 @@ def test_register_entrypoint(): assert result is mod -def test_register_import(): - mod = types.ModuleType("my_backend2") - mod.plot = lambda *args, **kwargs: 1 - sys.modules["my_backend2"] = mod +def test_setting_backend_without_plot_raises(): + # GH-28163 + module = types.ModuleType("pandas_plot_backend") + sys.modules["pandas_plot_backend"] = module - result = pandas.plotting._core._get_plot_backend("my_backend2") - assert result is mod + assert pandas.options.plotting.backend == "matplotlib" + with pytest.raises( + ValueError, match="Could not find plotting backend 'pandas_plot_backend'." + ): + pandas.set_option("plotting.backend", "pandas_plot_backend") + + assert pandas.options.plotting.backend == "matplotlib" @td.skip_if_mpl diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 6cb6f818d40fdd..940cfef4058e03 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -21,7 +21,7 @@ def test_import_error_message(): # GH-19810 df = DataFrame({"A": [1, 2]}) - with pytest.raises(ImportError, match="No module named 'matplotlib'"): + with pytest.raises(ImportError, match="matplotlib is required for plotting"): df.plot() From bfff080275b4456b28d71f0c7b4ec9e678d4270c Mon Sep 17 00:00:00 2001 From: jeschwar <36767735+jeschwar@users.noreply.github.com> Date: Tue, 3 Sep 2019 13:26:01 -0600 Subject: [PATCH 64/95] ENH: added optional caption and label arguments to DataFrame.to_latex() (#25437) * ENH: added optional caption and label support to DataFrame.to_latex() (#25436) --- doc/source/whatsnew/v1.0.0.rst | 3 +- pandas/core/generic.py | 29 ++++- pandas/io/formats/format.py | 4 + pandas/io/formats/latex.py | 128 +++++++++++++++++++-- pandas/tests/io/formats/test_to_latex.py | 138 ++++++++++++++++++++++- 5 files changed, 283 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 91e8c9efba693d..0d2b81eca6789c 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -20,8 +20,7 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ - -- +- :meth:`DataFrame.to_latex` now accepts ``caption`` and ``label`` arguments (:issue:`25436`) - .. 
_whatsnew_1000.enhancements.other: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1a5b36b07e93ca..b427b1f0ac8580 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2925,15 +2925,21 @@ def to_latex( multicolumn=None, multicolumn_format=None, multirow=None, + caption=None, + label=None, ): r""" - Render an object to a LaTeX tabular environment table. + Render object to a LaTeX tabular, longtable, or nested table/tabular. - Render an object to a tabular environment table. You can splice - this into a LaTeX document. Requires \usepackage{booktabs}. + Requires ``\usepackage{booktabs}``. The output can be copy/pasted + into a main LaTeX document or read from an external file + with ``\input{table.tex}``. .. versionchanged:: 0.20.2 - Added to Series + Added to Series. + + .. versionchanged:: 1.0.0 + Added caption and label arguments. Parameters ---------- @@ -3002,6 +3008,17 @@ def to_latex( from the pandas config module. .. versionadded:: 0.20.0 + + caption : str, optional + The LaTeX caption to be placed inside ``\caption{}`` in the output. + + .. versionadded:: 1.0.0 + + label : str, optional + The LaTeX label to be placed inside ``\label{}`` in the output. + This is used with ``\ref{}`` in the main ``.tex`` file. + + .. versionadded:: 1.0.0 %(returns)s See Also -------- @@ -3014,7 +3031,7 @@ def to_latex( >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'], ... 'mask': ['red', 'purple'], ... 'weapon': ['sai', 'bo staff']}) - >>> print(df.to_latex(index=False)) # doctest: +NORMALIZE_WHITESPACE + >>> print(df.to_latex(index=False)) # doctest: +NORMALIZE_WHITESPACE \begin{tabular}{lll} \toprule name & mask & weapon \\ @@ -3061,6 +3078,8 @@ def to_latex( multicolumn=multicolumn, multicolumn_format=multicolumn_format, multirow=multirow, + caption=caption, + label=label, ) def to_csv( diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 8ff4b9bda0430a..f8db1b19dadfa8 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -888,6 +888,8 @@ def to_latex( multicolumn: bool = False, multicolumn_format: Optional[str] = None, multirow: bool = False, + caption: Optional[str] = None, + label: Optional[str] = None, ) -> Optional[str]: """ Render a DataFrame to a LaTeX tabular/longtable environment output. @@ -902,6 +904,8 @@ def to_latex( multicolumn=multicolumn, multicolumn_format=multicolumn_format, multirow=multirow, + caption=caption, + label=label, ).get_result(buf=buf, encoding=encoding) def _format_col(self, i: int) -> List[str]: diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index 4c4d5ec73269a5..ca9db88ae7be46 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -36,6 +36,8 @@ def __init__( multicolumn: bool = False, multicolumn_format: Optional[str] = None, multirow: bool = False, + caption: Optional[str] = None, + label: Optional[str] = None, ): self.fmt = formatter self.frame = self.fmt.frame @@ -45,11 +47,14 @@ def __init__( self.multicolumn = multicolumn self.multicolumn_format = multicolumn_format self.multirow = multirow + self.caption = caption + self.label = label self.escape = self.fmt.escape def write_result(self, buf: IO[str]) -> None: """ - Render a DataFrame to a LaTeX tabular/longtable environment output. + Render a DataFrame to a LaTeX tabular, longtable, or table/tabular + environment output. 
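        A hedged usage sketch of the new ``caption``/``label`` keywords
        (mirroring the tests added later in this patch; assumes a pandas
        build that already contains this change):

            >>> import pandas as pd
            >>> df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
            >>> tex = df.to_latex(caption="A caption", label="tab:example")
            >>> "\\caption{A caption}" in tex and "\\label{tab:example}" in tex
            True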
""" # string representation of the columns @@ -114,12 +119,12 @@ def pad_empties(x): "not {typ}".format(typ=type(column_format)) ) - if not self.longtable: - buf.write("\\begin{{tabular}}{{{fmt}}}\n".format(fmt=column_format)) - buf.write("\\toprule\n") + if self.longtable: + self._write_longtable_begin(buf, column_format) else: - buf.write("\\begin{{longtable}}{{{fmt}}}\n".format(fmt=column_format)) - buf.write("\\toprule\n") + self._write_tabular_begin(buf, column_format) + + buf.write("\\toprule\n") ilevels = self.frame.index.nlevels clevels = self.frame.columns.nlevels @@ -183,11 +188,10 @@ def pad_empties(x): if self.multirow and i < len(strrows) - 1: self._print_cline(buf, i, len(strcols)) - if not self.longtable: - buf.write("\\bottomrule\n") - buf.write("\\end{tabular}\n") + if self.longtable: + self._write_longtable_end(buf) else: - buf.write("\\end{longtable}\n") + self._write_tabular_end(buf) def _format_multicolumn(self, row: List[str], ilevels: int) -> List[str]: r""" @@ -268,3 +272,107 @@ def _print_cline(self, buf: IO[str], i: int, icol: int) -> None: buf.write("\\cline{{{cl:d}-{icol:d}}}\n".format(cl=cl[1], icol=icol)) # remove entries that have been written to buffer self.clinebuf = [x for x in self.clinebuf if x[0] != i] + + def _write_tabular_begin(self, buf, column_format): + """ + Write the beginning of a tabular environment or + nested table/tabular environments including caption and label. + + Parameters + ---------- + buf : string or file handle + File path or object. If not specified, the result is returned as + a string. + column_format : str, default None + The columns format as specified in `LaTeX table format + `__ e.g 'rcl' + for 3 columns + + """ + if self.caption is not None or self.label is not None: + # then write output in a nested table/tabular environment + if self.caption is None: + caption_ = "" + else: + caption_ = "\n\\caption{{{}}}".format(self.caption) + + if self.label is None: + label_ = "" + else: + label_ = "\n\\label{{{}}}".format(self.label) + + buf.write("\\begin{{table}}\n\\centering{}{}\n".format(caption_, label_)) + else: + # then write output only in a tabular environment + pass + + buf.write("\\begin{{tabular}}{{{fmt}}}\n".format(fmt=column_format)) + + def _write_tabular_end(self, buf): + """ + Write the end of a tabular environment or nested table/tabular + environment. + + Parameters + ---------- + buf : string or file handle + File path or object. If not specified, the result is returned as + a string. + + """ + buf.write("\\bottomrule\n") + buf.write("\\end{tabular}\n") + if self.caption is not None or self.label is not None: + buf.write("\\end{table}\n") + else: + pass + + def _write_longtable_begin(self, buf, column_format): + """ + Write the beginning of a longtable environment including caption and + label if provided by user. + + Parameters + ---------- + buf : string or file handle + File path or object. If not specified, the result is returned as + a string. 
+ column_format : str, default None + The columns format as specified in `LaTeX table format + `__ e.g 'rcl' + for 3 columns + + """ + buf.write("\\begin{{longtable}}{{{fmt}}}\n".format(fmt=column_format)) + + if self.caption is not None or self.label is not None: + if self.caption is None: + pass + else: + buf.write("\\caption{{{}}}".format(self.caption)) + + if self.label is None: + pass + else: + buf.write("\\label{{{}}}".format(self.label)) + + # a double-backslash is required at the end of the line + # as discussed here: + # https://tex.stackexchange.com/questions/219138 + buf.write("\\\\\n") + else: + pass + + @staticmethod + def _write_longtable_end(buf): + """ + Write the end of a longtable environment. + + Parameters + ---------- + buf : string or file handle + File path or object. If not specified, the result is returned as + a string. + + """ + buf.write("\\end{longtable}\n") diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 924b2a19e85046..9ffb54d23e37e3 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -388,8 +388,7 @@ def test_to_latex_special_escape(self): """ assert escaped_result == escaped_expected - def test_to_latex_longtable(self, float_frame): - float_frame.to_latex(longtable=True) + def test_to_latex_longtable(self): df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) withindex_result = df.to_latex(longtable=True) @@ -439,6 +438,141 @@ def test_to_latex_longtable(self, float_frame): with3columns_result = df.to_latex(index=False, longtable=True) assert r"\multicolumn{3}" in with3columns_result + def test_to_latex_caption_label(self): + # GH 25436 + the_caption = "a table in a \\texttt{table/tabular} environment" + the_label = "tab:table_tabular" + + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + + # test when only the caption is provided + result_c = df.to_latex(caption=the_caption) + + expected_c = r"""\begin{table} +\centering +\caption{a table in a \texttt{table/tabular} environment} +\begin{tabular}{lrl} +\toprule +{} & a & b \\ +\midrule +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\bottomrule +\end{tabular} +\end{table} +""" + assert result_c == expected_c + + # test when only the label is provided + result_l = df.to_latex(label=the_label) + + expected_l = r"""\begin{table} +\centering +\label{tab:table_tabular} +\begin{tabular}{lrl} +\toprule +{} & a & b \\ +\midrule +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\bottomrule +\end{tabular} +\end{table} +""" + assert result_l == expected_l + + # test when the caption and the label are provided + result_cl = df.to_latex(caption=the_caption, label=the_label) + + expected_cl = r"""\begin{table} +\centering +\caption{a table in a \texttt{table/tabular} environment} +\label{tab:table_tabular} +\begin{tabular}{lrl} +\toprule +{} & a & b \\ +\midrule +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\bottomrule +\end{tabular} +\end{table} +""" + assert result_cl == expected_cl + + def test_to_latex_longtable_caption_label(self): + # GH 25436 + the_caption = "a table in a \\texttt{longtable} environment" + the_label = "tab:longtable" + + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + + # test when only the caption is provided + result_c = df.to_latex(longtable=True, caption=the_caption) + + expected_c = r"""\begin{longtable}{lrl} +\caption{a table in a \texttt{longtable} environment}\\ +\toprule +{} & a & b \\ +\midrule +\endhead +\midrule +\multicolumn{3}{r}{{Continued on next page}} \\ +\midrule +\endfoot + +\bottomrule +\endlastfoot +0 & 1 & b1 \\ +1 & 2 
& b2 \\ +\end{longtable} +""" + assert result_c == expected_c + + # test when only the label is provided + result_l = df.to_latex(longtable=True, label=the_label) + + expected_l = r"""\begin{longtable}{lrl} +\label{tab:longtable}\\ +\toprule +{} & a & b \\ +\midrule +\endhead +\midrule +\multicolumn{3}{r}{{Continued on next page}} \\ +\midrule +\endfoot + +\bottomrule +\endlastfoot +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\end{longtable} +""" + assert result_l == expected_l + + # test when the caption and the label are provided + result_cl = df.to_latex(longtable=True, caption=the_caption, label=the_label) + + expected_cl = r"""\begin{longtable}{lrl} +\caption{a table in a \texttt{longtable} environment}\label{tab:longtable}\\ +\toprule +{} & a & b \\ +\midrule +\endhead +\midrule +\multicolumn{3}{r}{{Continued on next page}} \\ +\midrule +\endfoot + +\bottomrule +\endlastfoot +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\end{longtable} +""" + assert result_cl == expected_cl + def test_to_latex_escape_special_chars(self): special_characters = ["&", "%", "$", "#", "_", "{", "}", "~", "^", "\\"] df = DataFrame(data=special_characters) From 60ff4e1dad69af4585644ed8e6b7b6b10fb6a98b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 4 Sep 2019 04:23:11 -0700 Subject: [PATCH 65/95] CLN: catch Exception in fewer places, assorted cleanups (#28276) --- ci/code_checks.sh | 2 +- pandas/_libs/lib.pyx | 8 ++++---- pandas/core/common.py | 2 +- pandas/core/groupby/grouper.py | 6 ++++-- pandas/core/groupby/ops.py | 30 +++++------------------------- pandas/core/ops/__init__.py | 7 ++----- pandas/tests/test_downstream.py | 1 + 7 files changed, 18 insertions(+), 38 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 333136ddfddd95..d9369b916fe4dc 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -203,7 +203,7 @@ if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then import sys import pandas -blacklist = {'bs4', 'gcsfs', 'html5lib', 'ipython', 'jinja2' 'hypothesis', +blacklist = {'bs4', 'gcsfs', 'html5lib', 'ipython', 'jinja2', 'hypothesis', 'lxml', 'numexpr', 'openpyxl', 'py', 'pytest', 's3fs', 'scipy', 'tables', 'xlrd', 'xlsxwriter', 'xlwt'} mods = blacklist & set(m.split('.')[0] for m in sys.modules) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 47d1e98f214a11..4ef17b116a1d94 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -235,7 +235,7 @@ def fast_unique_multiple(list arrays, sort: bool=True): if sort is None: try: uniques.sort() - except Exception: + except TypeError: # TODO: RuntimeWarning? 
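            # (aside: ``list.sort`` raises TypeError on unorderable mixed
            #  keys under Python 3, e.g. ``sorted([1, "a"])``; narrowing the
            #  bare ``except Exception`` to TypeError keeps genuine bugs
            #  from being silently swallowed here)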
pass @@ -264,7 +264,7 @@ def fast_unique_multiple_list(lists: list, sort: bool=True) -> list: if sort: try: uniques.sort() - except Exception: + except TypeError: pass return uniques @@ -304,7 +304,7 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True): if sort: try: uniques.sort() - except Exception: + except TypeError: pass return uniques @@ -1410,7 +1410,7 @@ def infer_datetimelike_array(arr: object) -> object: try: array_to_datetime(objs, errors='raise') return 'datetime' - except: + except (ValueError, TypeError): pass # we are *not* going to infer from strings diff --git a/pandas/core/common.py b/pandas/core/common.py index a507625ccfa01f..cf113c8aecbfe5 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -211,7 +211,7 @@ def try_sort(iterable): listed = list(iterable) try: return sorted(listed) - except Exception: + except TypeError: return listed diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index d079a1c4ef4f7b..2ebfbed0b132a2 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -583,9 +583,11 @@ def _get_grouper( # if the actual grouper should be obj[key] def is_in_axis(key): if not _is_label_like(key): + items = obj._data.items try: - obj._data.items.get_loc(key) - except Exception: + items.get_loc(key) + except (KeyError, TypeError): + # TypeError shows up here if we pass e.g. Int64Index return False return True diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 6263973fb0d2fe..bcda25bf3ce394 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -615,14 +615,9 @@ def _aggregate( is_datetimelike, min_count=-1, ): - if values.ndim > 3: + if values.ndim > 2: # punting for now - raise NotImplementedError("number of dimensions is currently limited to 3") - elif values.ndim > 2: - for i, chunk in enumerate(values.transpose(2, 0, 1)): - - chunk = chunk.squeeze() - agg_func(result[:, :, i], counts, chunk, comp_ids, min_count) + raise NotImplementedError("number of dimensions is currently limited to 2") else: agg_func(result, counts, values, comp_ids, min_count) @@ -640,20 +635,9 @@ def _transform( ): comp_ids, _, ngroups = self.group_info - if values.ndim > 3: + if values.ndim > 2: # punting for now - raise NotImplementedError("number of dimensions is currently limited to 3") - elif values.ndim > 2: - for i, chunk in enumerate(values.transpose(2, 0, 1)): - - transform_func( - result[:, :, i], - values, - comp_ids, - ngroups, - is_datetimelike, - **kwargs - ) + raise NotImplementedError("number of dimensions is currently limited to 2") else: transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) @@ -932,11 +916,7 @@ def _chop(self, sdata, slice_obj): class FrameSplitter(DataSplitter): def fast_apply(self, f, names): # must return keys::list, values::list, mutated::bool - try: - starts, ends = lib.generate_slices(self.slabels, self.ngroups) - except Exception: - # fails when all -1 - return [], True + starts, ends = lib.generate_slices(self.slabels, self.ngroups) sdata = self._get_sorted_data() return libreduction.apply_frame_axis0(sdata, f, names, starts, ends) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 9fd6efe32de291..a94a4ccff0efe5 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -698,10 +698,7 @@ def na_op(x, y): return result - def wrapper(self, other, axis=None): - # Validate the axis parameter - if axis is not None: - self._get_axis_number(axis) + def wrapper(self, 
other): res_name = get_op_result_name(self, other) other = lib.item_from_zerodim(other) @@ -1104,7 +1101,7 @@ def f(self, other): # straight boolean comparisons we want to allow all columns # (regardless of dtype to pass thru) See #4537 for discussion. res = self._combine_const(other, func) - return res.fillna(True).astype(bool) + return res f.__name__ = op_name diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 93baafddedeb48..3a24736c57c011 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -145,6 +145,7 @@ def _getitem_tuple(self, tup): # Cython import warning @pytest.mark.filterwarnings("ignore:can't resolve:ImportWarning") +@pytest.mark.filterwarnings("ignore:RangeIndex.* is deprecated:DeprecationWarning") def test_pyarrow(df): pyarrow = import_module("pyarrow") # noqa From 243c1bcfd09342efeae50f5b8104d92e2f9f06bd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 4 Sep 2019 06:26:11 -0500 Subject: [PATCH 66/95] DEV: Remove seed-isort-config hook (#28272) This was causing issues for me locally. Anyone else? It took a while to run, and didn't seem to give the same output as others (depends on something peculiar to my environment) which doesn't seem to be great for a pre-commit hook. Closes https://github.com/pandas-dev/pandas/issues/28236 --- .pre-commit-config.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5cc22c638c9b13..b79f0f71dac23d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,7 +15,3 @@ repos: hooks: - id: isort language: python_venv -- repo: https://github.com/asottile/seed-isort-config - rev: v1.9.2 - hooks: - - id: seed-isort-config From 4c778a1eb73da3a2935357dbbfcbe46f3be52f31 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 4 Sep 2019 04:56:25 -0700 Subject: [PATCH 67/95] Fix to_json Memory Tests (#28259) --- asv_bench/benchmarks/io/json.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index b249c92b53e93e..5c1d39776b91c9 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -118,7 +118,7 @@ def setup(self, orient, frame): def time_to_json(self, orient, frame): getattr(self, frame).to_json(self.fname, orient=orient) - def mem_to_json(self, orient, frame): + def peakmem_to_json(self, orient, frame): getattr(self, frame).to_json(self.fname, orient=orient) def time_to_json_wide(self, orient, frame): @@ -126,7 +126,7 @@ def time_to_json_wide(self, orient, frame): df = concat([base_df.iloc[:100]] * 1000, ignore_index=True, axis=1) df.to_json(self.fname, orient=orient) - def mem_to_json_wide(self, orient, frame): + def peakmem_to_json_wide(self, orient, frame): base_df = getattr(self, frame).copy() df = concat([base_df.iloc[:100]] * 1000, ignore_index=True, axis=1) df.to_json(self.fname, orient=orient) From 4252ab7718b06820ce485b8136294616e34ab168 Mon Sep 17 00:00:00 2001 From: tobycheese Date: Wed, 4 Sep 2019 18:13:11 +0200 Subject: [PATCH 68/95] fix typo in example (#28281) --- doc/source/user_guide/options.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst index f32a8adfd4d335..1f1dff417e68f3 100644 --- a/doc/source/user_guide/options.rst +++ b/doc/source/user_guide/options.rst @@ -163,7 +163,7 @@ determines how many rows are shown in the truncated repr. .. 
ipython:: python pd.set_option('max_rows', 8) - pd.set_option('max_rows', 4) + pd.set_option('min_rows', 4) # below max_rows -> all rows shown df = pd.DataFrame(np.random.randn(7, 2)) df From 0bde7cedf46209a9fd4fa8c7f9fbce8b49aa78cd Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 4 Sep 2019 18:07:16 +0100 Subject: [PATCH 69/95] BUG: Make sure correct values are passed to Rolling._on when axis=1 (#28267) * Make sure correct values are passed to Rolling._on when axis=1 * Update rolling.py * Capitalise 'd' as in documentation * Parametrize over tz_naive_fixture * autoformat --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/window/rolling.py | 5 ++++- pandas/tests/window/test_rolling.py | 27 +++++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 0d2b81eca6789c..58892b316c9408 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -177,7 +177,7 @@ Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - -- +- Bug in :meth:`DataFrame.rolling` not allowing for rolling over datetimes when ``axis=1`` (:issue: `28192`) - Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`) - Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index a7e122fa3528ff..29ef2e917ae57f 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1653,7 +1653,10 @@ def is_datetimelike(self): def _on(self): if self.on is None: - return self.obj.index + if self.axis == 0: + return self.obj.index + elif self.axis == 1: + return self.obj.columns elif isinstance(self.obj, ABCDataFrame) and self.on in self.obj.columns: return Index(self.obj[self.on]) else: diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index b4787bf25e3bb6..70ba85120af3c6 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -334,3 +334,30 @@ def test_readonly_array(self): result = pd.Series(arr).rolling(2).mean() expected = pd.Series([np.nan, 2, np.nan, np.nan, 4]) tm.assert_series_equal(result, expected) + + def test_rolling_datetime(self, axis_frame, tz_naive_fixture): + # GH-28192 + tz = tz_naive_fixture + df = pd.DataFrame( + { + i: [1] * 2 + for i in pd.date_range("2019-8-01", "2019-08-03", freq="D", tz=tz) + } + ) + if axis_frame in [0, "index"]: + result = df.T.rolling("2D", axis=axis_frame).sum().T + else: + result = df.rolling("2D", axis=axis_frame).sum() + expected = pd.DataFrame( + { + **{ + i: [1.0] * 2 + for i in pd.date_range("2019-8-01", periods=1, freq="D", tz=tz) + }, + **{ + i: [2.0] * 2 + for i in pd.date_range("2019-8-02", "2019-8-03", freq="D", tz=tz) + }, + } + ) + tm.assert_frame_equal(result, expected) From 6a7ba96de03b1f6c3ee534bfa76afba03596a9f4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 5 Sep 2019 08:34:44 -0700 Subject: [PATCH 70/95] PERF: asv for import (#28239) --- asv_bench/benchmarks/package.py | 25 +++++++++++++++++++++++++ pandas/core/dtypes/dtypes.py | 2 +- pandas/util/_test_decorators.py | 2 +- 3 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 asv_bench/benchmarks/package.py diff --git a/asv_bench/benchmarks/package.py b/asv_bench/benchmarks/package.py new file mode 100644 index 00000000000000..8ca33db361fa07 --- /dev/null +++ 
b/asv_bench/benchmarks/package.py @@ -0,0 +1,25 @@ +""" +Benchmarks for pandas at the package-level. +""" +import subprocess +import sys + +from pandas.compat import PY37 + + +class TimeImport: + def time_import(self): + if PY37: + # on py37+ we the "-X importtime" usage gives us a more precise + # measurement of the import time we actually care about, + # without the subprocess or interpreter overhead + cmd = [sys.executable, "-X", "importtime", "-c", "import pandas as pd"] + p = subprocess.run(cmd, stderr=subprocess.PIPE) + + line = p.stderr.splitlines()[-1] + field = line.split(b"|")[-2].strip() + total = int(field) # microseconds + return total + + cmd = [sys.executable, "-c", "import pandas as pd"] + subprocess.run(cmd, stderr=subprocess.PIPE) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index ee1866e60644b8..aa7e6801ba431c 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -23,7 +23,7 @@ ordered_sentinel = object() # type: object -def register_extension_dtype(cls: Type[ExtensionDtype],) -> Type[ExtensionDtype]: +def register_extension_dtype(cls: Type[ExtensionDtype]) -> Type[ExtensionDtype]: """ Register an ExtensionType with pandas as class decorator. diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 627757aaa37412..0e07b9f5fe9f76 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -102,7 +102,7 @@ def _skip_if_no_scipy(): ) -def skip_if_installed(package: str,) -> Callable: +def skip_if_installed(package: str) -> Callable: """ Skip a test if a package is installed. From 04e67c46e5c9f93f26d41d9e970dc7554e80916c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 5 Sep 2019 08:52:21 -0700 Subject: [PATCH 71/95] PERF: trim import time ~5% (#28227) * PERF: trim import time ~5% with lazy imports --- ci/code_checks.sh | 10 +++++++--- pandas/io/common.py | 16 +++++++++++++--- pandas/io/excel/_base.py | 2 +- pandas/tests/io/excel/test_readers.py | 3 +-- pandas/tests/io/parser/test_common.py | 2 +- pandas/tests/io/test_html.py | 3 ++- pandas/util/testing.py | 22 +++++++++++++++++----- 7 files changed, 42 insertions(+), 16 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index d9369b916fe4dc..f839d86318e2ec 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -203,10 +203,14 @@ if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then import sys import pandas -blacklist = {'bs4', 'gcsfs', 'html5lib', 'ipython', 'jinja2', 'hypothesis', +blacklist = {'bs4', 'gcsfs', 'html5lib', 'http', 'ipython', 'jinja2', 'hypothesis', 'lxml', 'numexpr', 'openpyxl', 'py', 'pytest', 's3fs', 'scipy', - 'tables', 'xlrd', 'xlsxwriter', 'xlwt'} -mods = blacklist & set(m.split('.')[0] for m in sys.modules) + 'tables', 'urllib.request', 'xlrd', 'xlsxwriter', 'xlwt'} + +# GH#28227 for some of these check for top-level modules, while others are +# more specific (e.g. 
urllib.request) +import_mods = set(m.split('.')[0] for m in sys.modules) | set(sys.modules) +mods = blacklist & import_mods if mods: sys.stderr.write('err: pandas should not import: {}\n'.format(', '.join(mods))) sys.exit(len(mods)) diff --git a/pandas/io/common.py b/pandas/io/common.py index 30228d660e8167..ac8dee8467370d 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -4,7 +4,6 @@ import codecs import csv import gzip -from http.client import HTTPException # noqa from io import BufferedIOBase, BytesIO import mmap import os @@ -22,7 +21,6 @@ Type, Union, ) -from urllib.error import URLError # noqa from urllib.parse import ( # noqa urlencode, urljoin, @@ -31,7 +29,6 @@ uses_params, uses_relative, ) -from urllib.request import pathname2url, urlopen import zipfile from pandas.compat import _get_lzma_file, _import_lzma @@ -188,6 +185,16 @@ def is_gcs_url(url) -> bool: return False +def urlopen(*args, **kwargs): + """ + Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of + the stdlib. + """ + import urllib.request + + return urllib.request.urlopen(*args, **kwargs) + + def get_filepath_or_buffer( filepath_or_buffer: FilePathOrBuffer, encoding: Optional[str] = None, @@ -261,6 +268,9 @@ def file_path_to_url(path: str) -> str: ------- a valid FILE URL """ + # lazify expensive import (~30ms) + from urllib.request import pathname2url + return urljoin("file:", pathname2url(path)) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 997edf49d9e8fc..949eff45c0e92c 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -4,7 +4,6 @@ from io import BytesIO import os from textwrap import fill -from urllib.request import urlopen from pandas._config import config @@ -21,6 +20,7 @@ _stringify_path, _validate_header_arg, get_filepath_or_buffer, + urlopen, ) from pandas.io.excel._util import ( _fill_mi_header, diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index a39cface0e0157..5326f2df68972f 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -3,6 +3,7 @@ from datetime import datetime, time from functools import partial import os +from urllib.error import URLError import warnings import numpy as np @@ -14,8 +15,6 @@ from pandas import DataFrame, Index, MultiIndex, Series import pandas.util.testing as tm -from pandas.io.common import URLError - @contextlib.contextmanager def ignore_xlrd_time_clock_warning(): diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 0586593c87cc54..756463e9d8d335 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -11,6 +11,7 @@ import os import platform from tempfile import TemporaryFile +from urllib.error import URLError import numpy as np import pytest @@ -21,7 +22,6 @@ from pandas import DataFrame, Index, MultiIndex, Series, compat, concat import pandas.util.testing as tm -from pandas.io.common import URLError from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 615e2735cd288f..183d217eb09d61 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -4,6 +4,7 @@ import os import re import threading +from urllib.error import URLError import numpy as np from numpy.random import rand @@ -17,7 +18,7 @@ import pandas.util.testing as tm from pandas.util.testing import makeCustomDataframe as mkdf, network -from 
pandas.io.common import URLError, file_path_to_url +from pandas.io.common import file_path_to_url import pandas.io.html from pandas.io.html import read_html diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 0d543f891a5f63..c54dab046f57e7 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -4,7 +4,6 @@ from datetime import datetime from functools import wraps import gzip -import http.client import os import re from shutil import rmtree @@ -2275,11 +2274,17 @@ def dec(f): # But some tests (test_data yahoo) contact incredibly flakey # servers. -# and conditionally raise on these exception types -_network_error_classes = (IOError, http.client.HTTPException, TimeoutError) +# and conditionally raise on exception types in _get_default_network_errors -def can_connect(url, error_classes=_network_error_classes): +def _get_default_network_errors(): + # Lazy import for http.client because it imports many things from the stdlib + import http.client + + return (IOError, http.client.HTTPException, TimeoutError) + + +def can_connect(url, error_classes=None): """Try to connect to the given url. True if succeeds, False if IOError raised @@ -2294,6 +2299,10 @@ def can_connect(url, error_classes=_network_error_classes): Return True if no IOError (unable to connect) or URLError (bad url) was raised """ + + if error_classes is None: + error_classes = _get_default_network_errors() + try: with urlopen(url): pass @@ -2309,7 +2318,7 @@ def network( url="http://www.google.com", raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT, check_before_test=False, - error_classes=_network_error_classes, + error_classes=None, skip_errnos=_network_errno_vals, _skip_on_messages=_network_error_messages, ): @@ -2397,6 +2406,9 @@ def network( """ from pytest import skip + if error_classes is None: + error_classes = _get_default_network_errors() + t.network = True @wraps(t) From 2915223e8c6866149e78f5bdab184881fa39354c Mon Sep 17 00:00:00 2001 From: Igor Filippov Date: Thu, 5 Sep 2019 19:10:53 +0200 Subject: [PATCH 72/95] Improved benchmark coverage for reading spreadsheets (#28230) * Improved benchmark coverage for reading spreadsheets * Added blank lines * More blank lines * Updated whatsnew * - Removed whatsnew entry - Added comment in environment.yml - Added conda-forge to asv config - Refactored reader benchmark * Updated requirements-dev.txt * Fixed imports order * Fixed imports again * Run black * Changed conda channels order in ASV config * Used setup_cache to speed up read benchmark --- asv_bench/asv.conf.json | 3 +- asv_bench/benchmarks/io/excel.py | 76 +++++++++++++++++++++++--------- environment.yml | 1 + requirements-dev.txt | 1 + 4 files changed, 58 insertions(+), 23 deletions(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 571ede1a211340..c04bbf53a86a6f 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -50,12 +50,13 @@ "xlsxwriter": [], "xlrd": [], "xlwt": [], + "odfpy": [], "pytest": [], // If using Windows with python 2.7 and want to build using the // mingw toolchain (rather than MSVC), uncomment the following line. // "libpython": [], }, - + "conda_channels": ["defaults", "conda-forge"], // Combinations of libraries/python versions can be excluded/included // from the set to test. Each entry is a dictionary containing additional // key-value pairs to include/exclude. 
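A note on the asv pattern used by the read benchmark below: ``setup_cache``
runs once per benchmark class and its side effects (here, the spreadsheet
files written to disk) are shared across all parameters and repeats, whereas
``setup`` re-runs for each one. A minimal sketch under those assumptions
(the class and file names are hypothetical, not part of this patch):

    import numpy as np
    import pandas as pd

    class ReadCSVSketch:
        fname = "bench_data.csv"

        def setup_cache(self):
            # expensive one-time fixture: write the file a single time
            pd.DataFrame(np.random.randn(1000, 5)).to_csv(self.fname)

        def time_read(self):
            # only the repeated re-read is timed
            pd.read_csv(self.fname)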
diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 9aa5cbd5b6f7c3..c97cf768e27d97 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -1,40 +1,72 @@ from io import BytesIO import numpy as np +from odf.opendocument import OpenDocumentSpreadsheet +from odf.table import Table, TableCell, TableRow +from odf.text import P from pandas import DataFrame, ExcelWriter, date_range, read_excel import pandas.util.testing as tm -class Excel: +def _generate_dataframe(): + N = 2000 + C = 5 + df = DataFrame( + np.random.randn(N, C), + columns=["float{}".format(i) for i in range(C)], + index=date_range("20000101", periods=N, freq="H"), + ) + df["object"] = tm.makeStringIndex(N) + return df + + +class WriteExcel: params = ["openpyxl", "xlsxwriter", "xlwt"] param_names = ["engine"] def setup(self, engine): - N = 2000 - C = 5 - self.df = DataFrame( - np.random.randn(N, C), - columns=["float{}".format(i) for i in range(C)], - index=date_range("20000101", periods=N, freq="H"), - ) - self.df["object"] = tm.makeStringIndex(N) - self.bio_read = BytesIO() - self.writer_read = ExcelWriter(self.bio_read, engine=engine) - self.df.to_excel(self.writer_read, sheet_name="Sheet1") - self.writer_read.save() - self.bio_read.seek(0) - - def time_read_excel(self, engine): - read_excel(self.bio_read) + self.df = _generate_dataframe() def time_write_excel(self, engine): - bio_write = BytesIO() - bio_write.seek(0) - writer_write = ExcelWriter(bio_write, engine=engine) - self.df.to_excel(writer_write, sheet_name="Sheet1") - writer_write.save() + bio = BytesIO() + bio.seek(0) + writer = ExcelWriter(bio, engine=engine) + self.df.to_excel(writer, sheet_name="Sheet1") + writer.save() + + +class ReadExcel: + + params = ["xlrd", "openpyxl", "odf"] + param_names = ["engine"] + fname_excel = "spreadsheet.xlsx" + fname_odf = "spreadsheet.ods" + + def _create_odf(self): + doc = OpenDocumentSpreadsheet() + table = Table(name="Table1") + for row in self.df.values: + tr = TableRow() + for val in row: + tc = TableCell(valuetype="string") + tc.addElement(P(text=val)) + tr.addElement(tc) + table.addElement(tr) + + doc.spreadsheet.addElement(table) + doc.save(self.fname_odf) + + def setup_cache(self): + self.df = _generate_dataframe() + + self.df.to_excel(self.fname_excel, sheet_name="Sheet1") + self._create_odf() + + def time_read_excel(self, engine): + fname = self.fname_odf if engine == "odf" else self.fname_excel + read_excel(fname, engine=engine) from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/environment.yml b/environment.yml index 6d2cd701c38540..d72972ffc4da48 100644 --- a/environment.yml +++ b/environment.yml @@ -80,4 +80,5 @@ dependencies: - xlrd # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - xlsxwriter # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - xlwt # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile + - odfpy # pandas.read_excel - pyreadstat # pandas.read_spss diff --git a/requirements-dev.txt b/requirements-dev.txt index cf11a3ee282584..c0fb9ee331b11a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -54,4 +54,5 @@ xarray xlrd xlsxwriter xlwt +odfpy pyreadstat \ No newline at end of file From 813123b2d67860b7104f12ad2f6469aa64833fb2 Mon Sep 17 00:00:00 2001 From: zys5945 Date: Thu, 5 Sep 2019 11:03:56 -0700 Subject: [PATCH 73/95] DOC: fix read_excel and ExcelFile engine parameter description (#28231) (#28245) --- 
pandas/io/excel/_base.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 949eff45c0e92c..6dba5e042562b7 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -112,7 +112,7 @@ engine : str, default None If io is not a buffer or path, this must be set to identify io. - Acceptable values are None or xlrd. + Acceptable values are None, "xlrd", "openpyxl" or "odf". converters : dict, default None Dict of functions for converting values in certain columns. Keys can either be integers or column labels, values are functions that take one @@ -783,11 +783,12 @@ class ExcelFile: Parameters ---------- io : string, path object (pathlib.Path or py._path.local.LocalPath), - file-like object or xlrd workbook - If a string or path object, expected to be a path to xls or xlsx file. + a file-like object, xlrd workbook or openpypl workbook. + If a string or path object, expected to be a path to xls, xlsx or odf file. engine : string, default None If io is not a buffer or path, this must be set to identify io. - Acceptable values are None or ``xlrd``. + Acceptable values are None, ``xlrd``, ``openpyxl`` or ``odf``. + Note that ``odf`` reads tables out of OpenDocument formatted files. """ from pandas.io.excel._odfreader import _ODFReader From 2d65e38f5c245a8410c7cb37ec17424def00fa78 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 5 Sep 2019 11:14:17 -0700 Subject: [PATCH 74/95] Fix inconsistent casting to bool (#28290) --- pandas/core/ops/__init__.py | 10 ++++++++-- pandas/tests/series/test_operators.py | 12 ++++-------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index a94a4ccff0efe5..60fa1bef01f3dc 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -806,7 +806,13 @@ def na_op(x, y): return result fill_int = lambda x: x.fillna(0) - fill_bool = lambda x: x.fillna(False).astype(bool) + + def fill_bool(x, left=None): + # if `left` is specifically not-boolean, we do not cast to bool + x = x.fillna(False) + if left is None or is_bool_dtype(left.dtype): + x = x.astype(bool) + return x def wrapper(self, other): is_self_int_dtype = is_integer_dtype(self.dtype) @@ -835,7 +841,7 @@ def wrapper(self, other): elif isinstance(other, (ABCSeries, ABCIndexClass)): is_other_int_dtype = is_integer_dtype(other.dtype) - other = other if is_other_int_dtype else fill_bool(other) + other = other if is_other_int_dtype else fill_bool(other, self) else: # scalars, list, tuple, np.array diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index aa44760dcd9180..bf725a04de0589 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -103,11 +103,8 @@ def test_logical_operators_int_dtype_with_float(self): s_0123 & [0.1, 4, 3.14, 2] with pytest.raises(TypeError): s_0123 & np.array([0.1, 4, 3.14, 2]) - - # FIXME: this should be consistent with the list case above - expected = Series([False, True, False, True]) - result = s_0123 & Series([0.1, 4, -3.14, 2]) - assert_series_equal(result, expected) + with pytest.raises(TypeError): + s_0123 & Series([0.1, 4, -3.14, 2]) def test_logical_operators_int_dtype_with_str(self): s_1111 = Series([1] * 4, dtype="int8") @@ -145,9 +142,8 @@ def test_logical_operators_int_dtype_with_object(self): assert_series_equal(result, expected) s_abNd = Series(["a", "b", np.NaN, "d"]) - result = s_0123 & s_abNd - expected = 
Series([False, True, False, True]) - assert_series_equal(result, expected) + with pytest.raises(TypeError, match="unsupported.* 'int' and 'str'"): + s_0123 & s_abNd def test_logical_operators_bool_dtype_with_int(self): index = list("bca") From 820072a0f9bccdfbfb11fe82caf60adb9fad1323 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 7 Sep 2019 12:29:40 +0100 Subject: [PATCH 75/95] BUG: Remove null values before sorting during groupby nunique calculation (#27951) Closes #27904 --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/groupby/generic.py | 4 +++ pandas/tests/groupby/test_function.py | 48 ++++++++++++++++++++++++++- 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 58892b316c9408..2f72de25c579ba 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -97,7 +97,7 @@ Datetimelike - Bug in :meth:`Series.__setitem__` incorrectly casting ``np.timedelta64("NaT")`` to ``np.datetime64("NaT")`` when inserting into a :class:`Series` with datetime64 dtype (:issue:`27311`) - Bug in :meth:`Series.dt` property lookups when the underlying data is read-only (:issue:`27529`) - Bug in ``HDFStore.__getitem__`` incorrectly reading tz attribute created in Python 2 (:issue:`26443`) -- +- Bug in :meth:`pandas.core.groupby.SeriesGroupBy.nunique` where ``NaT`` values were interfering with the count of unique values (:issue:`27951`) Timedelta diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c0436e93890782..e514162f84c374 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1147,6 +1147,10 @@ def nunique(self, dropna=True): val = self.obj._internal_get_values() + # GH 27951 + # temporary fix while we wait for NumPy bug 12629 to be fixed + val[isna(val)] = np.datetime64("NaT") + try: sorter = np.lexsort((val, ids)) except TypeError: # catches object dtypes diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index d89233f2fd603c..afb22a732691cd 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1,4 +1,5 @@ import builtins +import datetime as dt from io import StringIO from itertools import product from string import ascii_lowercase @@ -9,7 +10,16 @@ from pandas.errors import UnsupportedFunctionCall import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, isna +from pandas import ( + DataFrame, + Index, + MultiIndex, + NaT, + Series, + Timestamp, + date_range, + isna, +) import pandas.core.nanops as nanops from pandas.util import _test_decorators as td, testing as tm @@ -1015,6 +1025,42 @@ def test_nunique_with_timegrouper(): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "key, data, dropna, expected", + [ + ( + ["x", "x", "x"], + [Timestamp("2019-01-01"), NaT, Timestamp("2019-01-01")], + True, + Series([1], index=pd.Index(["x"], name="key"), name="data"), + ), + ( + ["x", "x", "x"], + [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], + True, + Series([1], index=pd.Index(["x"], name="key"), name="data"), + ), + ( + ["x", "x", "x", "y", "y"], + [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], + False, + Series([2, 2], index=pd.Index(["x", "y"], name="key"), name="data"), + ), + ( + ["x", "x", "x", "x", "y"], + [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], + False, + Series([2, 
1], index=pd.Index(["x", "y"], name="key"), name="data"), + ), + ], +) +def test_nunique_with_NaT(key, data, dropna, expected): + # GH 27951 + df = pd.DataFrame({"key": key, "data": data}) + result = df.groupby(["key"])["data"].nunique(dropna=dropna) + tm.assert_series_equal(result, expected) + + def test_nunique_preserves_column_level_names(): # GH 23222 test = pd.DataFrame([1, 2, 2], columns=pd.Index(["A"], name="level_0")) From c3b1252f9cada5f6f2696e34783e9dbeadb7beba Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 7 Sep 2019 10:17:31 -0700 Subject: [PATCH 76/95] CLN: split_and_operate (#28327) --- pandas/core/internals/blocks.py | 56 ++++++++++++++++----------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 33698d245e9ffc..2a44177d445df8 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -416,15 +416,16 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): return self if inplace else self.copy() # operate column-by-column - def f(m, v, i): + def f(mask, val, idx): block = self.coerce_to_target_dtype(value) # slice out our block - if i is not None: - block = block.getitem_block(slice(i, i + 1)) + if idx is not None: + # i.e. self.ndim == 2 + block = block.getitem_block(slice(idx, idx + 1)) return block.fillna(value, limit=limit, inplace=inplace, downcast=None) - return self.split_and_operate(mask, f, inplace) + return self.split_and_operate(None, f, inplace) def split_and_operate(self, mask, f, inplace: bool): """ @@ -444,7 +445,8 @@ def split_and_operate(self, mask, f, inplace: bool): """ if mask is None: - mask = np.ones(self.shape, dtype=bool) + mask = np.broadcast_to(True, shape=self.shape) + new_values = self.values def make_a_block(nv, ref_loc): @@ -523,19 +525,14 @@ def downcast(self, dtypes=None): raise ValueError( "downcast must have a dictionary or 'infer' as its argument" ) + elif dtypes != "infer": + raise AssertionError("dtypes as dict is not supported yet") # operate column-by-column # this is expensive as it splits the blocks items-by-item - def f(m, v, i): - - if dtypes == "infer": - dtype = "infer" - else: - raise AssertionError("dtypes as dict is not supported yet") - - if dtype is not None: - v = maybe_downcast_to_dtype(v, dtype) - return v + def f(mask, val, idx): + val = maybe_downcast_to_dtype(val, dtype="infer") + return val return self.split_and_operate(None, f, False) @@ -1002,15 +999,15 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False) new = new.reshape(tuple(new_shape)) # operate column-by-column - def f(m, v, i): + def f(mask, val, idx): - if i is None: + if idx is None: # ndim==1 case. 
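                # (aside: ``split_and_operate`` calls ``f`` once per block
                #  column; when ``idx`` is None the block is 1-D and ``new``
                #  applies directly, otherwise row ``idx`` selects the
                #  matching slice of ``new`` below)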
n = new else: if isinstance(new, np.ndarray): - n = np.squeeze(new[i % new.shape[0]]) + n = np.squeeze(new[idx % new.shape[0]]) else: n = np.array(new) @@ -1020,7 +1017,7 @@ def f(m, v, i): # we need to explicitly astype here to make a copy n = n.astype(dtype) - nv = _putmask_smart(v, m, n) + nv = _putmask_smart(val, mask, n) return nv new_blocks = self.split_and_operate(mask, f, inplace) @@ -2627,10 +2624,10 @@ def convert( """ # operate column-by-column - def f(m, v, i): - shape = v.shape + def f(mask, val, idx): + shape = val.shape values = soft_convert_objects( - v.ravel(), + val.ravel(), datetime=datetime, numeric=numeric, timedelta=timedelta, @@ -3172,14 +3169,15 @@ def _safe_reshape(arr, new_shape): return arr -def _putmask_smart(v, m, n): +def _putmask_smart(v, mask, n): """ Return a new ndarray, try to preserve dtype if possible. Parameters ---------- v : `values`, updated in-place (array like) - m : `mask`, applies to both sides (array like) + mask : np.ndarray + Applies to both sides (array like). n : `new values` either scalar or an array like aligned with `values` Returns @@ -3197,12 +3195,12 @@ def _putmask_smart(v, m, n): # n should be the length of the mask or a scalar here if not is_list_like(n): - n = np.repeat(n, len(m)) + n = np.repeat(n, len(mask)) # see if we are only masking values that if putted # will work in the current dtype try: - nn = n[m] + nn = n[mask] except TypeError: # TypeError: only integer scalar arrays can be converted to a scalar index pass @@ -3227,16 +3225,16 @@ def _putmask_smart(v, m, n): comp = nn == nn_at if is_list_like(comp) and comp.all(): nv = v.copy() - nv[m] = nn_at + nv[mask] = nn_at return nv n = np.asarray(n) def _putmask_preserve(nv, n): try: - nv[m] = n[m] + nv[mask] = n[mask] except (IndexError, ValueError): - nv[m] = n + nv[mask] = n return nv # preserves dtype if possible From 0a00ebe61849f2ad9b7ee6a65b27f92a6491969d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 7 Sep 2019 10:17:47 -0700 Subject: [PATCH 77/95] CLN: eval_kwargs (#28328) --- pandas/core/computation/expressions.py | 12 ++++++------ pandas/core/ops/__init__.py | 24 +++++------------------- 2 files changed, 11 insertions(+), 25 deletions(-) diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index 29c8239fa518fc..90bb12b4cd727f 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -62,8 +62,9 @@ def set_numexpr_threads(n=None): ne.set_num_threads(n) -def _evaluate_standard(op, op_str, a, b, **eval_kwargs): +def _evaluate_standard(op, op_str, a, b, reversed=False): """ standard evaluation """ + # `reversed` kwarg is included for compatibility with _evaluate_numexpr if _TEST_MODE: _store_test_result(False) with np.errstate(all="ignore"): @@ -96,7 +97,7 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check): return False -def _evaluate_numexpr(op, op_str, a, b, truediv=True, reversed=False, **eval_kwargs): +def _evaluate_numexpr(op, op_str, a, b, reversed=False): result = None if _can_use_numexpr(op, op_str, a, b, "evaluate"): @@ -111,8 +112,6 @@ def _evaluate_numexpr(op, op_str, a, b, truediv=True, reversed=False, **eval_kwa "a_value {op} b_value".format(op=op_str), local_dict={"a_value": a_value, "b_value": b_value}, casting="safe", - truediv=truediv, - **eval_kwargs ) except ValueError as detail: if "unknown type object" in str(detail): @@ -201,7 +200,7 @@ def _bool_arith_check( return True -def evaluate(op, op_str, a, b, use_numexpr=True, **eval_kwargs): +def evaluate(op, 
op_str, a, b, use_numexpr=True, reversed=False): """ Evaluate and return the expression of the op on a and b. @@ -214,11 +213,12 @@ def evaluate(op, op_str, a, b, use_numexpr=True, **eval_kwargs): b : right operand use_numexpr : bool, default True Whether to try to use numexpr. + reversed : bool, default False """ use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b) if use_numexpr: - return _evaluate(op, op_str, a, b, **eval_kwargs) + return _evaluate(op, op_str, a, b, reversed=reversed) return _evaluate_standard(op, op_str, a, b) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 60fa1bef01f3dc..f1f4777cedbc57 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -213,12 +213,6 @@ def _gen_eval_kwargs(name): # Exclude commutative operations kwargs["reversed"] = True - if name in ["truediv", "rtruediv"]: - kwargs["truediv"] = True - - if name in ["ne"]: - kwargs["masker"] = True - return kwargs @@ -247,7 +241,7 @@ def _get_frame_op_default_axis(name): return "columns" -def _get_opstr(op, cls): +def _get_opstr(op): """ Find the operation string, if any, to pass to numexpr for this operation. @@ -255,19 +249,11 @@ def _get_opstr(op, cls): Parameters ---------- op : binary operator - cls : class Returns ------- op_str : string or None """ - # numexpr is available for non-sparse classes - subtyp = getattr(cls, "_subtyp", "") - use_numexpr = "sparse" not in subtyp - - if not use_numexpr: - # if we're not using numexpr, then don't pass a str_rep - return None return { operator.add: "+", @@ -624,7 +610,7 @@ def _arith_method_SERIES(cls, op, special): Wrapper function for Series arithmetic operations, to avoid code duplication. """ - str_rep = _get_opstr(op, cls) + str_rep = _get_opstr(op) op_name = _get_op_name(op, special) eval_kwargs = _gen_eval_kwargs(op_name) construct_result = ( @@ -999,7 +985,7 @@ def to_series(right): def _arith_method_FRAME(cls, op, special): - str_rep = _get_opstr(op, cls) + str_rep = _get_opstr(op) op_name = _get_op_name(op, special) eval_kwargs = _gen_eval_kwargs(op_name) default_axis = _get_frame_op_default_axis(op_name) @@ -1041,7 +1027,7 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): def _flex_comp_method_FRAME(cls, op, special): - str_rep = _get_opstr(op, cls) + str_rep = _get_opstr(op) op_name = _get_op_name(op, special) default_axis = _get_frame_op_default_axis(op_name) @@ -1082,7 +1068,7 @@ def f(self, other, axis=default_axis, level=None): def _comp_method_FRAME(cls, func, special): - str_rep = _get_opstr(func, cls) + str_rep = _get_opstr(func) op_name = _get_op_name(func, special) @Appender("Wrapper for comparison method {name}".format(name=op_name)) From 6b23fb8d006309b3c050c1ccde280349328c2aae Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Sat, 7 Sep 2019 12:18:53 -0500 Subject: [PATCH 78/95] Clean groupby error message (#28324) --- pandas/core/groupby/ops.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index bcda25bf3ce394..1a3f0da3cf92bf 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -463,9 +463,7 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values) or is_sparse(values): - raise NotImplementedError( - "{} are not support in cython ops".format(values.dtype) - ) + raise NotImplementedError("{} 
dtype not supported".format(values.dtype)) elif is_datetime64_any_dtype(values): if how in ["add", "prod", "cumsum", "cumprod"]: raise NotImplementedError( From e24d9e51b5f7e84d08d9b6b246ebdfa2d3eab6fa Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sat, 7 Sep 2019 10:20:02 -0700 Subject: [PATCH 79/95] Removed PyString refs from extension modules (#28322) * Removed PyString refs from extension modules * Reverted macro --- pandas/_libs/src/parse_helper.h | 5 ----- pandas/_libs/src/ujson/python/objToJSON.c | 4 ++-- pandas/_libs/tslibs/util.pxd | 5 ----- pandas/_libs/writers.pyx | 9 +-------- 4 files changed, 3 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/src/parse_helper.h b/pandas/_libs/src/parse_helper.h index 1db4c813bb4930..0a767dd27b6580 100644 --- a/pandas/_libs/src/parse_helper.h +++ b/pandas/_libs/src/parse_helper.h @@ -25,11 +25,6 @@ int to_double(char *item, double *p_value, char sci, char decimal, return (error == 0) && (!*p_end); } -#if PY_VERSION_HEX < 0x02060000 -#define PyBytes_Check PyString_Check -#define PyBytes_AS_STRING PyString_AS_STRING -#endif // PY_VERSION_HEX - int floatify(PyObject *str, double *result, int *maybe_int) { int status; char *data; diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 4b612bb033761d..dc9b906c8d76c4 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -435,7 +435,7 @@ static void *PyFloatToDOUBLE(JSOBJ _obj, JSONTypeContext *tc, void *outValue, return NULL; } -static void *PyStringToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, +static void *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) { PyObject *obj = (PyObject *)_obj; *_outLen = PyBytes_GET_SIZE(obj); @@ -1869,7 +1869,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { return; } else if (PyBytes_Check(obj)) { PRINTMARK(); - pc->PyTypeToJSON = PyStringToUTF8; + pc->PyTypeToJSON = PyBytesToUTF8; tc->type = JT_UTF8; return; } else if (PyUnicode_Check(obj)) { diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index 07c2805dd0ef61..65f4e98708f47e 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -4,11 +4,7 @@ from cpython cimport PyTypeObject cdef extern from *: """ PyObject* char_to_string(const char* data) { - #if PY_VERSION_HEX >= 0x03000000 return PyUnicode_FromString(data); - #else - return PyString_FromString(data); - #endif } """ object char_to_string(const char* data) @@ -18,7 +14,6 @@ cdef extern from "Python.h": # Note: importing extern-style allows us to declare these as nogil # functions, whereas `from cpython cimport` does not. 
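    # (aside: on Python 3, ``str`` is unicode and ``bytes`` replaces the old
    #  ``str``, so the PyUnicode_Check/PyBytes_Check declarations below
    #  cover everything the removed PyString_* checks used to handle)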
bint PyUnicode_Check(object obj) nogil - bint PyString_Check(object obj) nogil bint PyBool_Check(object obj) nogil bint PyFloat_Check(object obj) nogil bint PyComplex_Check(object obj) nogil diff --git a/pandas/_libs/writers.pyx b/pandas/_libs/writers.pyx index d1aecf0a9d2947..e5d78dae9c0233 100644 --- a/pandas/_libs/writers.pyx +++ b/pandas/_libs/writers.pyx @@ -3,11 +3,6 @@ from cython import Py_ssize_t from cpython cimport PyBytes_GET_SIZE, PyUnicode_GET_SIZE -try: - from cpython cimport PyString_GET_SIZE -except ImportError: - from cpython cimport PyUnicode_GET_SIZE as PyString_GET_SIZE - import numpy as np from numpy cimport ndarray, uint8_t @@ -126,11 +121,9 @@ def max_len_string_array(pandas_string[:] arr) -> Py_ssize_t: for i in range(length): val = arr[i] if isinstance(val, str): - l = PyString_GET_SIZE(val) + l = PyUnicode_GET_SIZE(val) elif isinstance(val, bytes): l = PyBytes_GET_SIZE(val) - elif isinstance(val, unicode): - l = PyUnicode_GET_SIZE(val) if l > m: m = l From 53ad571d86449fba1b854dfede6de76657930282 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 7 Sep 2019 10:23:07 -0700 Subject: [PATCH 80/95] CLN: catch Exception less (#28309) --- pandas/core/apply.py | 14 ++++++++------ pandas/core/arrays/datetimes.py | 3 ++- pandas/core/dtypes/concat.py | 5 ++--- pandas/core/indexes/accessors.py | 2 +- pandas/core/series.py | 3 --- pandas/plotting/_core.py | 14 ++++++-------- pandas/plotting/_matplotlib/converter.py | 4 +++- pandas/plotting/_matplotlib/core.py | 2 +- setup.py | 4 ++-- 9 files changed, 25 insertions(+), 26 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index b96b3c75720315..e6766a33a613b2 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -199,20 +199,21 @@ def apply_empty_result(self): return self.obj.copy() # we may need to infer - reduce = self.result_type == "reduce" + should_reduce = self.result_type == "reduce" from pandas import Series - if not reduce: + if not should_reduce: EMPTY_SERIES = Series([]) try: r = self.f(EMPTY_SERIES, *self.args, **self.kwds) - reduce = not isinstance(r, Series) except Exception: pass + else: + should_reduce = not isinstance(r, Series) - if reduce: + if should_reduce: return self.obj._constructor_sliced(np.nan, index=self.agg_axis) else: return self.obj.copy() @@ -306,10 +307,11 @@ def apply_series_generator(self): for i, v in enumerate(series_gen): try: results[i] = self.f(v) - keys.append(v.name) - successes.append(i) except Exception: pass + else: + keys.append(v.name) + successes.append(i) # so will work with MultiIndex if len(successes) < len(res_index): diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 732f819e743a47..5dff1f93264c3e 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2284,7 +2284,8 @@ def _infer_tz_from_endpoints(start, end, tz): """ try: inferred_tz = timezones.infer_tzinfo(start, end) - except Exception: + except AssertionError: + # infer_tzinfo raises AssertionError if passed mismatched timezones raise TypeError( "Start and end cannot both be tz-aware with different timezones" ) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 12f3fd2c75dc8a..1094ab22238e97 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -89,10 +89,9 @@ def concat_compat(to_concat, axis=0): # filter empty arrays # 1-d dtypes always are included here def is_nonempty(x): - try: - return x.shape[axis] > 0 - except Exception: + if x.ndim <= axis: return True + return 
x.shape[axis] > 0 # If all arrays are empty, there's nothing to convert, just short-cut to # the concatenation, #3121. diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 2036728e702f30..11b6cb2ca3ed4b 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -316,7 +316,7 @@ def __new__(cls, data): # do all the validation here. from pandas import Series - if not isinstance(data, Series): + if not isinstance(data, ABCSeries): raise TypeError( "cannot convert an object of type {0} to a " "datetimelike index".format(type(data)) diff --git a/pandas/core/series.py b/pandas/core/series.py index 6fb39c422de932..10d50e89ca92eb 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1114,9 +1114,6 @@ def __getitem__(self, key): return self.__getitem__(new_key) raise - except Exception: - raise - if is_iterator(key): key = list(key) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index d3c9e8ccfa51ca..837b01974be930 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1,22 +1,20 @@ import importlib -from typing import List, Type # noqa import warnings +from pandas._config import get_option + +from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import Appender from pandas.core.dtypes.common import is_integer, is_list_like from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -import pandas from pandas.core.base import PandasObject # Trigger matplotlib import, which implicitly registers our # converts. Implicit registration is deprecated, and when enforced # we can lazily import matplotlib. -try: - import pandas.plotting._matplotlib # noqa -except ImportError: - pass +import_optional_dependency("pandas.plotting._matplotlib", raise_on_missing=False) def hist_series( @@ -732,7 +730,7 @@ def __call__(self, *args, **kwargs): # `x` parameter, and return a Series with the parameter `y` as values. data = self._parent.copy() - if isinstance(data, pandas.core.dtypes.generic.ABCSeries): + if isinstance(data, ABCSeries): kwargs["reuse_plot"] = True if kind in self._dataframe_kinds: @@ -1603,7 +1601,7 @@ def _get_plot_backend(backend=None): The backend is imported lazily, as matplotlib is a soft dependency, and pandas can be used without it being installed. """ - backend = backend or pandas.get_option("plotting.backend") + backend = backend or get_option("plotting.backend") if backend == "matplotlib": # Because matplotlib is an optional dependency and first-party backend, diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 893854ab26e37d..446350cb5d9152 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -329,7 +329,7 @@ def __init__(self, locator, tz=None, defaultfmt="%Y-%m-%d"): class PandasAutoDateLocator(dates.AutoDateLocator): def get_locator(self, dmin, dmax): - "Pick the best locator based on a distance." + """Pick the best locator based on a distance.""" _check_implicitly_registered() delta = relativedelta(dmax, dmin) @@ -382,6 +382,7 @@ def __call__(self): dmax, dmin = dmin, dmax # We need to cap at the endpoints of valid datetime + # FIXME: dont leave commented-out # TODO(wesm) unused? # delta = relativedelta(dmax, dmin) # try: @@ -448,6 +449,7 @@ def autoscale(self): # We need to cap at the endpoints of valid datetime + # FIXME: dont leave commented-out # TODO(wesm): unused? 
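The refactoring shape this patch applies throughout (seen in ``apply.py`` earlier in this diff), sketched standalone with illustrative names, not pandas code: moving the success-path bookkeeping into an ``else`` clause means the broad ``except Exception`` guards only the user-supplied call itself.

.. code-block:: python

   def f(x):
       return 1 / x

   items = [1, 0, 2]
   results = {}
   successes = []

   for i, item in enumerate(items):
       try:
           r = f(item)  # only the risky call sits inside the try block
       except Exception:
           pass  # a failing call is skipped, as in apply.py above
       else:
           results[i] = r  # bookkeeping runs unguarded, so a bug here
           successes.append(i)  # raises instead of being silently swallowed

   print(results, successes)  # {0: 1.0, 2: 0.5} [0, 2]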
# delta = relativedelta(dmax, dmin) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 6ff3f284403039..346949cb82c4d0 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1,5 +1,5 @@ import re -from typing import Optional # noqa +from typing import Optional import warnings import numpy as np diff --git a/setup.py b/setup.py index a86527ace092b3..76db96870c36a1 100755 --- a/setup.py +++ b/setup.py @@ -300,12 +300,12 @@ def run(self): for clean_me in self._clean_me: try: os.unlink(clean_me) - except Exception: + except OSError: pass for clean_tree in self._clean_trees: try: shutil.rmtree(clean_tree) - except Exception: + except OSError: pass From a72b24059dd647f1d8357e6241f267bc58fc8bc7 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sat, 7 Sep 2019 10:43:01 -0700 Subject: [PATCH 81/95] Added cpp files to build clean (#28320) --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 76db96870c36a1..05e5f5250e2506 100755 --- a/setup.py +++ b/setup.py @@ -277,6 +277,7 @@ def initialize_options(self): ".pyo", ".pyd", ".c", + ".cpp", ".orig", ): self._clean_me.append(filepath) From 71119275b93b0be2fef6304cc42fef685ae6cef9 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Sat, 7 Sep 2019 12:59:25 -0500 Subject: [PATCH 82/95] PERF: Speed up Spearman calculation (#28151) --- asv_bench/benchmarks/stat_ops.py | 11 +++++++++++ doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/_libs/algos.pyx | 20 ++++++++++++++++---- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 6032bee41958e1..ed5ebfa61594ec 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -113,12 +113,23 @@ def setup(self, method, use_bottleneck): nanops._USE_BOTTLENECK = use_bottleneck self.df = pd.DataFrame(np.random.randn(1000, 30)) self.df2 = pd.DataFrame(np.random.randn(1000, 30)) + self.df_wide = pd.DataFrame(np.random.randn(1000, 200)) + self.df_wide_nans = self.df_wide.where(np.random.random((1000, 200)) < 0.9) self.s = pd.Series(np.random.randn(1000)) self.s2 = pd.Series(np.random.randn(1000)) def time_corr(self, method, use_bottleneck): self.df.corr(method=method) + def time_corr_wide(self, method, use_bottleneck): + self.df_wide.corr(method=method) + + def time_corr_wide_nans(self, method, use_bottleneck): + self.df_wide_nans.corr(method=method) + + def peakmem_corr_wide(self, method, use_bottleneck): + self.df_wide.corr(method=method) + def time_corr_series(self, method, use_bottleneck): self.s.corr(self.s2, method=method) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 2f72de25c579ba..628e2e708e4f12 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -75,9 +75,9 @@ Performance improvements - Performance improvement in indexing with a non-unique :class:`IntervalIndex` (:issue:`27489`) - Performance improvement in `MultiIndex.is_monotonic` (:issue:`27495`) - Performance improvement in :func:`cut` when ``bins`` is an :class:`IntervalIndex` (:issue:`27668`) +- Performance improvement in :meth:`DataFrame.corr` when ``method`` is ``"spearman"`` (:issue:`28139`) - Performance improvement in :meth:`DataFrame.replace` when provided a list of values to replace (:issue:`28099`) - .. 
_whatsnew_1000.bug_fixes: Bug fixes diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 038447ad252fe2..0f91f612994c7b 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -296,6 +296,7 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): cdef: Py_ssize_t i, j, xi, yi, N, K ndarray[float64_t, ndim=2] result + ndarray[float64_t, ndim=2] ranked_mat ndarray[float64_t, ndim=1] maskedx ndarray[float64_t, ndim=1] maskedy ndarray[uint8_t, ndim=2] mask @@ -307,10 +308,18 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): result = np.empty((K, K), dtype=np.float64) mask = np.isfinite(mat).view(np.uint8) + ranked_mat = np.empty((N, K), dtype=np.float64) + + for i in range(K): + ranked_mat[:, i] = rank_1d_float64(mat[:, i]) + for xi in range(K): for yi in range(xi + 1): nobs = 0 + # Keep track of whether we need to recompute ranks + all_ranks = True for i in range(N): + all_ranks &= not (mask[i, xi] ^ mask[i, yi]) if mask[i, xi] and mask[i, yi]: nobs += 1 @@ -320,13 +329,16 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): maskedx = np.empty(nobs, dtype=np.float64) maskedy = np.empty(nobs, dtype=np.float64) j = 0 + for i in range(N): if mask[i, xi] and mask[i, yi]: - maskedx[j] = mat[i, xi] - maskedy[j] = mat[i, yi] + maskedx[j] = ranked_mat[i, xi] + maskedy[j] = ranked_mat[i, yi] j += 1 - maskedx = rank_1d_float64(maskedx) - maskedy = rank_1d_float64(maskedy) + + if not all_ranks: + maskedx = rank_1d_float64(maskedx) + maskedy = rank_1d_float64(maskedy) mean = (nobs + 1) / 2. From 1cd7ae6a4b65366773124fc115292687db27397a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 7 Sep 2019 12:21:08 -0700 Subject: [PATCH 83/95] BUG: datetime64 - Timestamp incorrectly raising TypeError (#28286) --- doc/source/whatsnew/v1.0.0.rst | 2 ++ pandas/_libs/tslibs/c_timestamp.pyx | 5 +++++ pandas/tests/scalar/timestamp/test_arithmetic.py | 14 ++++++++++++++ 3 files changed, 21 insertions(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 628e2e708e4f12..161ebf9783e1bb 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -98,6 +98,8 @@ Datetimelike - Bug in :meth:`Series.dt` property lookups when the underlying data is read-only (:issue:`27529`) - Bug in ``HDFStore.__getitem__`` incorrectly reading tz attribute created in Python 2 (:issue:`26443`) - Bug in :meth:`pandas.core.groupby.SeriesGroupBy.nunique` where ``NaT`` values were interfering with the count of unique values (:issue:`27951`) +- Bug in :class:`Timestamp` subtraction when subtracting a :class:`Timestamp` from a ``np.datetime64`` object incorrectly raising ``TypeError`` (:issue:`28286`) +- Timedelta diff --git a/pandas/_libs/tslibs/c_timestamp.pyx b/pandas/_libs/tslibs/c_timestamp.pyx index 41e2ae6b5b59b6..e3456edbf7e627 100644 --- a/pandas/_libs/tslibs/c_timestamp.pyx +++ b/pandas/_libs/tslibs/c_timestamp.pyx @@ -312,6 +312,11 @@ cdef class _Timestamp(datetime): except (OverflowError, OutOfBoundsDatetime): pass + elif is_datetime64_object(self): + # GH#28286 cython semantics for __rsub__, `other` is actually + # the Timestamp + return type(other)(self) - other + # scalar Timestamp/datetime - Timedelta -> yields a Timestamp (with # same timezone if specified) return datetime.__sub__(self, other) diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index 2ef4fe79eeacf5..7b00f00fc9ec49 100644 --- 
a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -66,6 +66,20 @@ def test_delta_preserve_nanos(self): result = val + timedelta(1) assert result.nanosecond == val.nanosecond + def test_rsub_dtscalars(self, tz_naive_fixture): + # In particular, check that datetime64 - Timestamp works GH#28286 + td = Timedelta(1235345642000) + ts = Timestamp.now(tz_naive_fixture) + other = ts + td + + assert other - ts == td + assert other.to_pydatetime() - ts == td + if tz_naive_fixture is None: + assert other.to_datetime64() - ts == td + else: + with pytest.raises(TypeError, match="subtraction must have"): + other.to_datetime64() - ts + def test_timestamp_sub_datetime(self): dt = datetime(2013, 10, 12) ts = Timestamp(datetime(2013, 10, 13)) From 7161b907748b46e535b3a3444d2ab47c37a95612 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 7 Sep 2019 12:39:07 -0700 Subject: [PATCH 84/95] CLN: catch specific Exceptions in _config (#28310) --- pandas/_config/display.py | 5 ++++- pandas/_config/localization.py | 12 ++++-------- pandas/tests/io/formats/test_console.py | 8 +++++--- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/pandas/_config/display.py b/pandas/_config/display.py index 6e5fabe2706e5e..067b7c503baabf 100644 --- a/pandas/_config/display.py +++ b/pandas/_config/display.py @@ -28,7 +28,10 @@ def detect_console_encoding(): if not encoding or "ascii" in encoding.lower(): try: encoding = locale.getpreferredencoding() - except Exception: + except locale.Error: + # can be raised by locale.setlocale(), which is + # called by getpreferredencoding + # (on some systems, see stdlib locale docs) pass # when all else fails. this will usually be "ascii" diff --git a/pandas/_config/localization.py b/pandas/_config/localization.py index 46802c64609594..9f750d8447c6ab 100644 --- a/pandas/_config/localization.py +++ b/pandas/_config/localization.py @@ -98,13 +98,7 @@ def _valid_locales(locales, normalize): def _default_locale_getter(): - try: - raw_locales = subprocess.check_output(["locale -a"], shell=True) - except subprocess.CalledProcessError as e: - raise type(e)( - "{exception}, the 'locale -a' command cannot be found " - "on your system".format(exception=e) - ) + raw_locales = subprocess.check_output(["locale -a"], shell=True) return raw_locales @@ -139,7 +133,9 @@ def get_locales(prefix=None, normalize=True, locale_getter=_default_locale_gette """ try: raw_locales = locale_getter() - except Exception: + except subprocess.CalledProcessError: + # Raised on (some? all?) 
Windows platforms because Note: "locale -a" + # is not defined return None try: diff --git a/pandas/tests/io/formats/test_console.py b/pandas/tests/io/formats/test_console.py index f4bee99296a834..e56d14885f11e3 100644 --- a/pandas/tests/io/formats/test_console.py +++ b/pandas/tests/io/formats/test_console.py @@ -1,3 +1,5 @@ +import locale + import pytest from pandas._config import detect_console_encoding @@ -50,11 +52,11 @@ def test_detect_console_encoding_fallback_to_locale(monkeypatch, encoding): "std,locale", [ ["ascii", "ascii"], - ["ascii", Exception], + ["ascii", locale.Error], [AttributeError, "ascii"], - [AttributeError, Exception], + [AttributeError, locale.Error], [IOError, "ascii"], - [IOError, Exception], + [IOError, locale.Error], ], ) def test_detect_console_encoding_fallback_to_default(monkeypatch, std, locale): From 9aa9db9b85ee0285e11fc950f570a886233bc5b1 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 7 Sep 2019 12:43:42 -0700 Subject: [PATCH 85/95] catch more specific (#28198) --- pandas/core/groupby/ops.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 1a3f0da3cf92bf..40517eefe4d5db 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -212,9 +212,12 @@ def apply(self, f, data, axis=0): # This Exception is also raised if `f` triggers an exception # but it is preferable to raise the exception in Python. pass - except TypeError: - # occurs if we have any EAs - pass + except TypeError as err: + if "Cannot convert" in str(err): + # via apply_frame_axis0 if we pass a non-ndarray + pass + else: + raise for key, (i, group) in zip(group_keys, splitter): object.__setattr__(group, "name", key) From f04c4db6ad7da21752705808063780572baf5172 Mon Sep 17 00:00:00 2001 From: Mohamed Amine ZGHAL Date: Sun, 8 Sep 2019 19:08:18 +0200 Subject: [PATCH 86/95] Pandas.series.astype docstring PR02 (#28340) --- pandas/core/generic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b427b1f0ac8580..831543ee660392 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5780,11 +5780,11 @@ def astype(self, dtype, copy=True, errors="raise", **kwargs): Control raising of exceptions on invalid data for provided dtype. - ``raise`` : allow exceptions to be raised - - ``ignore`` : suppress exceptions. On error return original object + - ``ignore`` : suppress exceptions. On error return original object. .. versionadded:: 0.20.0 - kwargs : keyword arguments to pass on to the constructor + **kwargs : keyword arguments to pass on to the constructor Returns ------- @@ -5845,7 +5845,7 @@ def astype(self, dtype, copy=True, errors="raise", **kwargs): Convert to ordered categorical type with custom ordering: >>> cat_dtype = pd.api.types.CategoricalDtype( - ... categories=[2, 1], ordered=True) + ... 
categories=[2, 1], ordered=True) >>> ser.astype(cat_dtype) 0 1 1 2 @@ -5855,7 +5855,7 @@ def astype(self, dtype, copy=True, errors="raise", **kwargs): Note that using ``copy=False`` and changing data on a new pandas object may propagate changes: - >>> s1 = pd.Series([1,2]) + >>> s1 = pd.Series([1, 2]) >>> s2 = s1.astype('int64', copy=False) >>> s2[0] = 10 >>> s1 # note that s1[0] has changed too From 5c57e7bc066d86564084b23e832f645f35e06c0e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 8 Sep 2019 10:09:54 -0700 Subject: [PATCH 87/95] BUG: Timestamp+int should raise NullFrequencyError, not ValueError (#28268) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/_libs/tslibs/__init__.py | 3 ++ pandas/_libs/tslibs/c_timestamp.pyx | 34 +++++++++++-------- pandas/errors/__init__.py | 10 +----- pandas/tests/arithmetic/test_timedelta64.py | 5 +-- .../tests/scalar/timestamp/test_arithmetic.py | 8 +++-- pandas/tests/tslibs/test_api.py | 1 + 7 files changed, 32 insertions(+), 30 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 161ebf9783e1bb..e1fe2f7fe77e2c 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -99,6 +99,7 @@ Datetimelike - Bug in ``HDFStore.__getitem__`` incorrectly reading tz attribute created in Python 2 (:issue:`26443`) - Bug in :meth:`pandas.core.groupby.SeriesGroupBy.nunique` where ``NaT`` values were interfering with the count of unique values (:issue:`27951`) - Bug in :class:`Timestamp` subtraction when subtracting a :class:`Timestamp` from a ``np.datetime64`` object incorrectly raising ``TypeError`` (:issue:`28286`) +- Addition and subtraction of integer or integer-dtype arrays with :class:`Timestamp` will now raise ``NullFrequencyError`` instead of ``ValueError`` (:issue:`28268`) - diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index 67a323782a836a..8d3b00e4a44b91 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -7,3 +7,6 @@ from .timedeltas import Timedelta, delta_to_nanoseconds, ints_to_pytimedelta from .timestamps import Timestamp from .tzconversion import tz_convert_single + +# import fails if we do this before np_datetime +from .c_timestamp import NullFrequencyError # isort:skip diff --git a/pandas/_libs/tslibs/c_timestamp.pyx b/pandas/_libs/tslibs/c_timestamp.pyx index e3456edbf7e627..a45b8c9b35dfab 100644 --- a/pandas/_libs/tslibs/c_timestamp.pyx +++ b/pandas/_libs/tslibs/c_timestamp.pyx @@ -42,6 +42,15 @@ from pandas._libs.tslibs.timezones import UTC from pandas._libs.tslibs.tzconversion cimport tz_convert_single +class NullFrequencyError(ValueError): + """ + Error raised when a null `freq` attribute is used in an operation + that needs a non-null frequency, particularly `DatetimeIndex.shift`, + `TimedeltaIndex.shift`, `PeriodIndex.shift`. 
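The user-visible effect of the new exception class, sketched below: adding an integer to a ``Timestamp`` that has no ``freq`` now raises ``NullFrequencyError``, and because it subclasses ``ValueError``, callers that previously caught ``ValueError`` keep working (assumes a pandas build that includes this patch).

.. code-block:: python

   import pandas as pd
   from pandas.errors import NullFrequencyError

   ts = pd.Timestamp("2019-01-01")  # no freq attached

   try:
       ts + 1
   except NullFrequencyError as err:  # also caught by `except ValueError`
       print(err)  # Cannot add integral value to Timestamp without freq.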
+ """ + pass + + def maybe_integer_op_deprecated(obj): # GH#22535 add/sub of integers and int-arrays is deprecated if obj.freq is not None: @@ -227,8 +236,8 @@ cdef class _Timestamp(datetime): # to be compat with Period return NaT elif self.freq is None: - raise ValueError("Cannot add integral value to Timestamp " - "without freq.") + raise NullFrequencyError( + "Cannot add integral value to Timestamp without freq.") return self.__class__((self.freq * other).apply(self), freq=self.freq) @@ -246,17 +255,15 @@ cdef class _Timestamp(datetime): result = self.__class__(self.value + nanos, tz=self.tzinfo, freq=self.freq) - if getattr(other, 'normalize', False): - # DateOffset - result = result.normalize() return result elif is_array(other): if other.dtype.kind in ['i', 'u']: maybe_integer_op_deprecated(self) if self.freq is None: - raise ValueError("Cannot add integer-dtype array " - "to Timestamp without freq.") + raise NullFrequencyError( + "Cannot add integer-dtype array " + "to Timestamp without freq.") return self.freq * other + self # index/series like @@ -270,6 +277,7 @@ cdef class _Timestamp(datetime): return result def __sub__(self, other): + if (is_timedelta64_object(other) or is_integer_object(other) or PyDelta_Check(other) or hasattr(other, 'delta')): # `delta` attribute is for offsets.Tick or offsets.Week obj @@ -280,15 +288,16 @@ cdef class _Timestamp(datetime): if other.dtype.kind in ['i', 'u']: maybe_integer_op_deprecated(self) if self.freq is None: - raise ValueError("Cannot subtract integer-dtype array " - "from Timestamp without freq.") + raise NullFrequencyError( + "Cannot subtract integer-dtype array " + "from Timestamp without freq.") return self - self.freq * other typ = getattr(other, '_typ', None) if typ is not None: return NotImplemented - elif other is NaT: + if other is NaT: return NaT # coerce if necessary if we are a Timestamp-like @@ -311,15 +320,12 @@ cdef class _Timestamp(datetime): return Timedelta(self.value - other.value) except (OverflowError, OutOfBoundsDatetime): pass - elif is_datetime64_object(self): # GH#28286 cython semantics for __rsub__, `other` is actually # the Timestamp return type(other)(self) - other - # scalar Timestamp/datetime - Timedelta -> yields a Timestamp (with - # same timezone if specified) - return datetime.__sub__(self, other) + return NotImplemented cdef int64_t _maybe_convert_value_to_local(self): """Convert UTC i8 value to local i8 value if tz exists""" diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 3177937ac4ba19..a85fc8bfb14142 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -4,7 +4,7 @@ Expose public exceptions & warnings """ -from pandas._libs.tslibs import OutOfBoundsDatetime +from pandas._libs.tslibs import NullFrequencyError, OutOfBoundsDatetime class PerformanceWarning(Warning): @@ -157,14 +157,6 @@ class MergeError(ValueError): """ -class NullFrequencyError(ValueError): - """ - Error raised when a null `freq` attribute is used in an operation - that needs a non-null frequency, particularly `DatetimeIndex.shift`, - `TimedeltaIndex.shift`, `PeriodIndex.shift`. 
- """ - - class AccessorRegistrationWarning(Warning): """Warning for attribute conflicts in accessor registration.""" diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index ee27ce97f269e9..d480b26e30fff6 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -241,10 +241,7 @@ def test_subtraction_ops(self): with pytest.raises(TypeError, match=msg): tdi - dti - msg = ( - r"descriptor '__sub__' requires a 'datetime\.datetime' object" - " but received a 'Timedelta'" - ) + msg = r"unsupported operand type\(s\) for -" with pytest.raises(TypeError, match=msg): td - dt diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index 7b00f00fc9ec49..9634c6d8222368 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.errors import NullFrequencyError + from pandas import Timedelta, Timestamp import pandas.util.testing as tm @@ -177,12 +179,12 @@ def test_timestamp_add_timedelta64_unit(self, other, expected_difference): ], ) def test_add_int_no_freq_raises(self, ts, other): - with pytest.raises(ValueError, match="without freq"): + with pytest.raises(NullFrequencyError, match="without freq"): ts + other - with pytest.raises(ValueError, match="without freq"): + with pytest.raises(NullFrequencyError, match="without freq"): other + ts - with pytest.raises(ValueError, match="without freq"): + with pytest.raises(NullFrequencyError, match="without freq"): ts - other with pytest.raises(TypeError): other - ts diff --git a/pandas/tests/tslibs/test_api.py b/pandas/tests/tslibs/test_api.py index 47e398dfe3d167..7a8a6d511aa69a 100644 --- a/pandas/tests/tslibs/test_api.py +++ b/pandas/tests/tslibs/test_api.py @@ -29,6 +29,7 @@ def test_namespace(): "NaTType", "iNaT", "is_null_datetimelike", + "NullFrequencyError", "OutOfBoundsDatetime", "Period", "IncompatibleFrequency", From df3d9b2cdfe2ebab3a5b22e6f5359a393e519af4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 8 Sep 2019 12:36:55 -0700 Subject: [PATCH 88/95] CLN: handle bare exceptions im timedeltas, timestamps, reduction (#28346) --- pandas/_libs/reduction.pyx | 4 +++- pandas/_libs/tslibs/c_timestamp.pyx | 3 ++- pandas/_libs/tslibs/timedeltas.pyx | 11 ++++++++--- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index c892c1cf1b8a3e..bf940eb03e06f4 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -528,7 +528,8 @@ def apply_frame_axis0(object frame, object f, object names, try: piece = f(chunk) - except: + except Exception: + # We can't be more specific without knowing something about `f` raise InvalidApply('Let this error raise above us') # Need to infer if low level index slider will cause segfaults @@ -539,6 +540,7 @@ def apply_frame_axis0(object frame, object f, object names, else: mutated = True except AttributeError: + # `piece` might not have an index, could be e.g. an int pass results.append(piece) diff --git a/pandas/_libs/tslibs/c_timestamp.pyx b/pandas/_libs/tslibs/c_timestamp.pyx index a45b8c9b35dfab..dfa66d7e2d8626 100644 --- a/pandas/_libs/tslibs/c_timestamp.pyx +++ b/pandas/_libs/tslibs/c_timestamp.pyx @@ -140,7 +140,8 @@ cdef class _Timestamp(datetime): try: stamp += zone.strftime(' %%Z') - except: + except AttributeError: + # e.g. 
tzlocal has no `strftime` pass tz = ", tz='{0}'".format(zone) if zone is not None else "" diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index d24aafae0967df..ad7c32ca319405 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -228,8 +228,13 @@ def array_to_timedelta64(object[:] values, unit='ns', errors='raise'): # this is where all of the error handling will take place. try: for i in range(n): - result[i] = parse_timedelta_string(values[i]) - except: + if values[i] is NaT: + # we allow this check in the fast-path because NaT is a C-object + # so this is an inexpensive check + iresult[i] = NPY_NAT + else: + result[i] = parse_timedelta_string(values[i]) + except (TypeError, ValueError): unit = parse_timedelta_unit(unit) for i in range(n): try: @@ -309,7 +314,7 @@ cdef inline int64_t cast_from_unit(object ts, object unit) except? -1: return (base * m) + (frac * m) -cdef inline parse_timedelta_string(object ts): +cdef inline int64_t parse_timedelta_string(str ts) except? -1: """ Parse a regular format timedelta string. Return an int64_t (in ns) or raise a ValueError on an invalid parse. From 7d5425fdf1e0b010edc3d06bb79d9ff74fcc4f31 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 9 Sep 2019 04:53:37 -0700 Subject: [PATCH 89/95] PERF: lazify type-check import (#28342) --- pandas/io/formats/format.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index f8db1b19dadfa8..4a66ad48d13185 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -5,6 +5,7 @@ import codecs from contextlib import contextmanager +from datetime import tzinfo import decimal from functools import partial from io import StringIO @@ -27,8 +28,6 @@ ) from unicodedata import east_asian_width -from dateutil.tz.tz import tzutc -from dateutil.zoneinfo import tzfile import numpy as np from pandas._config.config import get_option, set_option @@ -1552,9 +1551,7 @@ def _is_dates_only( def _format_datetime64( - x: Union[NaTType, Timestamp], - tz: Optional[Union[tzfile, tzutc]] = None, - nat_rep: str = "NaT", + x: Union[NaTType, Timestamp], tz: Optional[tzinfo] = None, nat_rep: str = "NaT" ) -> str: if x is None or (is_scalar(x) and isna(x)): return nat_rep From 17f73aaac1071a4b861c96f7957b1dd88e4c466c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 9 Sep 2019 04:54:41 -0700 Subject: [PATCH 90/95] CLN: avoid bare except in libfrequencies (#28344) --- pandas/_libs/tslibs/frequencies.pyx | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/tslibs/frequencies.pyx b/pandas/_libs/tslibs/frequencies.pyx index f2dcd37b191edf..b29c8418960720 100644 --- a/pandas/_libs/tslibs/frequencies.pyx +++ b/pandas/_libs/tslibs/frequencies.pyx @@ -138,6 +138,10 @@ cpdef get_freq_code(freqstr): ------- return : tuple of base frequency code and stride (mult) + Raises + ------ + TypeError : if passed a tuple witth incorrect types + Examples -------- >>> get_freq_code('3D') @@ -156,16 +160,16 @@ cpdef get_freq_code(freqstr): if is_integer_object(freqstr[0]) and is_integer_object(freqstr[1]): # e.g., freqstr = (2000, 1) return freqstr + elif is_integer_object(freqstr[0]): + # Note: passing freqstr[1] below will raise TypeError if that + # is not a str + code = _period_str_to_code(freqstr[1]) + stride = freqstr[0] + return code, stride else: # e.g., freqstr = ('T', 5) - try: - code = _period_str_to_code(freqstr[0]) - 
stride = freqstr[1] - except: - if is_integer_object(freqstr[1]): - raise - code = _period_str_to_code(freqstr[1]) - stride = freqstr[0] + code = _period_str_to_code(freqstr[0]) + stride = freqstr[1] return code, stride if is_integer_object(freqstr): @@ -177,7 +181,7 @@ cpdef get_freq_code(freqstr): return code, stride -cpdef _base_and_stride(freqstr): +cpdef _base_and_stride(str freqstr): """ Return base freq and stride info from string representation @@ -207,7 +211,7 @@ cpdef _base_and_stride(freqstr): return base, stride -cpdef _period_str_to_code(freqstr): +cpdef _period_str_to_code(str freqstr): freqstr = _lite_rule_alias.get(freqstr, freqstr) if freqstr not in _dont_uppercase: From 5d1440e8d1a3cbb24b5c43ac4a1bb981e5fd3d24 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 9 Sep 2019 04:55:05 -0700 Subject: [PATCH 91/95] CLN: avoid bare except in tslib and tslibs.parsing (#28345) --- pandas/_libs/tslib.pyx | 11 +++++------ pandas/_libs/tslibs/parsing.pyx | 27 ++++++++++----------------- 2 files changed, 15 insertions(+), 23 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 01e500a80dcc41..dc06a30004d19d 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -344,14 +344,13 @@ def array_with_unit_to_datetime(ndarray values, object unit, # try a quick conversion to i8 # if we have nulls that are not type-compat # then need to iterate - try: + if values.dtype.kind == "i": + # Note: this condition makes the casting="same_kind" redundant iresult = values.astype('i8', casting='same_kind', copy=False) mask = iresult == NPY_NAT iresult[mask] = 0 fvalues = iresult.astype('f8') * m need_to_iterate = False - except: - pass # check the bounds if not need_to_iterate: @@ -406,7 +405,7 @@ def array_with_unit_to_datetime(ndarray values, object unit, elif is_ignore: raise AssertionError iresult[i] = NPY_NAT - except: + except OverflowError: if is_raise: raise OutOfBoundsDatetime( "cannot convert input {val} with the unit " @@ -447,7 +446,7 @@ def array_with_unit_to_datetime(ndarray values, object unit, else: try: oresult[i] = Timestamp(cast_from_unit(val, unit)) - except: + except OverflowError: oresult[i] = val elif isinstance(val, str): @@ -574,7 +573,7 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', # datetimes/strings, then we must coerce) try: iresult[i] = cast_from_unit(val, 'ns') - except: + except OverflowError: iresult[i] = NPY_NAT elif isinstance(val, str): diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index eb99f090e85657..3da3d1e4b1b414 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -587,15 +587,11 @@ def try_parse_dates(object[:] values, parser=None, else: parse_date = parser - try: - for i in range(n): - if values[i] == '': - result[i] = np.nan - else: - result[i] = parse_date(values[i]) - except Exception: - # raise if passed parser and it failed - raise + for i in range(n): + if values[i] == '': + result[i] = np.nan + else: + result[i] = parse_date(values[i]) return result.base # .base to access underlying ndarray @@ -814,7 +810,7 @@ def _guess_datetime_format(dt_str, dayfirst=False, dt_str_parse=du_parse, if dt_str_parse is None or dt_str_split is None: return None - if not isinstance(dt_str, (str, unicode)): + if not isinstance(dt_str, str): return None day_attribute_and_format = (('day',), '%d', 2) @@ -840,19 +836,16 @@ def _guess_datetime_format(dt_str, dayfirst=False, dt_str_parse=du_parse, try: parsed_datetime = dt_str_parse(dt_str, 
dayfirst=dayfirst) - except: + except (ValueError, OverflowError): # In case the datetime can't be parsed, its format cannot be guessed return None if parsed_datetime is None: return None - try: - tokens = dt_str_split(dt_str) - except: - # In case the datetime string can't be split, its format cannot - # be guessed - return None + # the default dt_str_split from dateutil will never raise here; we assume + # that any user-provided function will not either. + tokens = dt_str_split(dt_str) format_guess = [None] * len(tokens) found_attrs = set() From e6bafb5eadcbb85a051525c0af1b992e4df172ff Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 9 Sep 2019 04:56:15 -0700 Subject: [PATCH 92/95] CLN: Exception catching in io (#28349) * stop catching exception * CLN: catching Exception --- pandas/core/indexes/accessors.py | 23 ++++++++++------------- pandas/core/indexes/frozen.py | 5 ----- pandas/io/common.py | 18 ++++++++---------- pandas/io/parsers.py | 1 - pandas/io/pickle.py | 4 ++-- 5 files changed, 20 insertions(+), 31 deletions(-) diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 11b6cb2ca3ed4b..cc8ecc0e64684f 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -326,18 +326,15 @@ def __new__(cls, data): if orig is not None: data = Series(orig.values.categories, name=orig.name, copy=False) - try: - if is_datetime64_dtype(data.dtype): - return DatetimeProperties(data, orig) - elif is_datetime64tz_dtype(data.dtype): - return DatetimeProperties(data, orig) - elif is_timedelta64_dtype(data.dtype): - return TimedeltaProperties(data, orig) - elif is_period_arraylike(data): - return PeriodProperties(data, orig) - elif is_datetime_arraylike(data): - return DatetimeProperties(data, orig) - except Exception: - pass # we raise an attribute error anyway + if is_datetime64_dtype(data.dtype): + return DatetimeProperties(data, orig) + elif is_datetime64tz_dtype(data.dtype): + return DatetimeProperties(data, orig) + elif is_timedelta64_dtype(data.dtype): + return TimedeltaProperties(data, orig) + elif is_period_arraylike(data): + return PeriodProperties(data, orig) + elif is_datetime_arraylike(data): + return DatetimeProperties(data, orig) raise AttributeError("Can only use .dt accessor with datetimelike values") diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py index 329456e25bdedc..a6c39d049c50cf 100644 --- a/pandas/core/indexes/frozen.py +++ b/pandas/core/indexes/frozen.py @@ -70,12 +70,7 @@ def difference(self, other): # TODO: Consider deprecating these in favor of `union` (xref gh-15506) __add__ = __iadd__ = union - # Python 2 compat - def __getslice__(self, i, j): - return self.__class__(super().__getslice__(i, j)) - def __getitem__(self, n): - # Python 3 compat if isinstance(n, slice): return self.__class__(super().__getitem__(n)) return super().__getitem__(n) diff --git a/pandas/io/common.py b/pandas/io/common.py index ac8dee8467370d..0bbac8a8b7c1cf 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -90,7 +90,8 @@ def __next__(self): def _is_url(url) -> bool: - """Check to see if a URL has a valid protocol. + """ + Check to see if a URL has a valid protocol. Parameters ---------- @@ -101,10 +102,9 @@ def _is_url(url) -> bool: isurl : bool If `url` has a valid protocol return True otherwise False. 
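The guard pattern used for all three URL checks in this file, in standalone form (``urllib.parse.urlparse`` stands in here for the ``parse_url`` alias used above; the explicit ``isinstance`` check replaces the old blanket ``except Exception``):

.. code-block:: python

   from urllib.parse import urlparse

   def is_s3_url(url) -> bool:
       # non-strings cannot be URLs; checking up front avoids the error
       # urlparse raises for inputs like None
       if not isinstance(url, str):
           return False
       return urlparse(url).scheme in ["s3", "s3n", "s3a"]

   print(is_s3_url("s3://bucket/key.parquet"))  # True
   print(is_s3_url(None))                       # False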
""" - try: - return parse_url(url).scheme in _VALID_URLS - except Exception: + if not isinstance(url, str): return False + return parse_url(url).scheme in _VALID_URLS def _expand_user( @@ -171,18 +171,16 @@ def _stringify_path( def is_s3_url(url) -> bool: """Check for an s3, s3n, or s3a url""" - try: - return parse_url(url).scheme in ["s3", "s3n", "s3a"] - except Exception: + if not isinstance(url, str): return False + return parse_url(url).scheme in ["s3", "s3n", "s3a"] def is_gcs_url(url) -> bool: """Check for a gcs url""" - try: - return parse_url(url).scheme in ["gcs", "gs"] - except Exception: + if not isinstance(url, str): return False + return parse_url(url).scheme in ["gcs", "gs"] def urlopen(*args, **kwargs): diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a3ff837bc7f52c..72f1adf0aad3dc 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1064,7 +1064,6 @@ def _clean_options(self, options, engine): ) if result.get(arg, depr_default) != depr_default: - # raise Exception(result.get(arg, depr_default), depr_default) depr_warning += msg + "\n\n" else: result[arg] = parser_default diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 4e390de87fc607..4b9a52a1fb8f33 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -153,10 +153,10 @@ def read_pickle(path, compression="infer"): # We want to silence any warnings about, e.g. moved modules. warnings.simplefilter("ignore", Warning) return pickle.load(f) - except Exception: # noqa: E722 + except Exception: try: return pc.load(f, encoding=None) - except Exception: # noqa: E722 + except Exception: return pc.load(f, encoding="latin1") finally: f.close() From 3f5b5c45f481fe0cbb704f6463578675318bb1f6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 9 Sep 2019 04:57:20 -0700 Subject: [PATCH 93/95] CLN: raise ValueError instead of Exception (#28352) --- pandas/core/groupby/generic.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index e514162f84c374..e731cffea0671a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -324,7 +324,11 @@ def _aggregate_item_by_item(self, func, *args, **kwargs): if cast: result[item] = self._try_cast(result[item], data) - except ValueError: + except ValueError as err: + if "Must produce aggregated value" in str(err): + # raised in _aggregate_named, handle at higher level + # see test_apply_with_mutated_index + raise cannot_agg.append(item) continue except TypeError as e: @@ -1009,7 +1013,7 @@ def _aggregate_named(self, func, *args, **kwargs): group.name = name output = func(group, *args, **kwargs) if isinstance(output, (Series, Index, np.ndarray)): - raise Exception("Must produce aggregated value") + raise ValueError("Must produce aggregated value") result[name] = self._try_cast(output, group) return result From e0c63b4cfaa821dfe310f4a8a1f84929ced5f5bd Mon Sep 17 00:00:00 2001 From: Noritada Kobayashi Date: Mon, 9 Sep 2019 21:06:00 +0900 Subject: [PATCH 94/95] BUG/TST: fix and test for timezone drop in GroupBy.shift/bfill/ffill (#27992) --- doc/source/whatsnew/v0.25.2.rst | 1 + pandas/core/groupby/groupby.py | 12 ++--- pandas/tests/groupby/test_groupby.py | 66 ++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index 1cdf213d81a74b..69f324211e5b28 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -78,6 +78,7 
@@ Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug incorrectly raising an ``IndexError`` when passing a list of quantiles to :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` (:issue:`28113`). +- Bug in :meth:`pandas.core.groupby.GroupBy.shift`, :meth:`pandas.core.groupby.GroupBy.bfill` and :meth:`pandas.core.groupby.GroupBy.ffill` where timezone information would be dropped (:issue:`19995`, :issue:`27992`) - - - diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 55def024cb1d46..e010e615e176e6 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2263,26 +2263,28 @@ def _get_cythonized_result( base_func = getattr(libgroupby, how) for name, obj in self._iterate_slices(): + values = obj._data._values + if aggregate: result_sz = ngroups else: - result_sz = len(obj.values) + result_sz = len(values) if not cython_dtype: - cython_dtype = obj.values.dtype + cython_dtype = values.dtype result = np.zeros(result_sz, dtype=cython_dtype) func = partial(base_func, result, labels) inferences = None if needs_values: - vals = obj.values + vals = values if pre_processing: vals, inferences = pre_processing(vals) func = partial(func, vals) if needs_mask: - mask = isna(obj.values).view(np.uint8) + mask = isna(values).view(np.uint8) func = partial(func, mask) if needs_ngroups: @@ -2291,7 +2293,7 @@ def _get_cythonized_result( func(**kwargs) # Call func to modify indexer values in place if result_is_index: - result = algorithms.take_nd(obj.values, result) + result = algorithms.take_nd(values, result) if post_processing: result = post_processing(result, inferences) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4556b22b572797..bec5cbc5fecb8b 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1882,3 +1882,69 @@ def test_groupby_axis_1(group_name): results = df.groupby(group_name, axis=1).sum() expected = df.T.groupby(group_name).sum().T assert_frame_equal(results, expected) + + +@pytest.mark.parametrize( + "op, expected", + [ + ( + "shift", + { + "time": [ + None, + None, + Timestamp("2019-01-01 12:00:00"), + Timestamp("2019-01-01 12:30:00"), + None, + None, + ] + }, + ), + ( + "bfill", + { + "time": [ + Timestamp("2019-01-01 12:00:00"), + Timestamp("2019-01-01 12:30:00"), + Timestamp("2019-01-01 14:00:00"), + Timestamp("2019-01-01 14:30:00"), + Timestamp("2019-01-01 14:00:00"), + Timestamp("2019-01-01 14:30:00"), + ] + }, + ), + ( + "ffill", + { + "time": [ + Timestamp("2019-01-01 12:00:00"), + Timestamp("2019-01-01 12:30:00"), + Timestamp("2019-01-01 12:00:00"), + Timestamp("2019-01-01 12:30:00"), + Timestamp("2019-01-01 14:00:00"), + Timestamp("2019-01-01 14:30:00"), + ] + }, + ), + ], +) +def test_shift_bfill_ffill_tz(tz_naive_fixture, op, expected): + # GH19995, GH27992: Check that timezone does not drop in shift, bfill, and ffill + tz = tz_naive_fixture + data = { + "id": ["A", "B", "A", "B", "A", "B"], + "time": [ + Timestamp("2019-01-01 12:00:00"), + Timestamp("2019-01-01 12:30:00"), + None, + None, + Timestamp("2019-01-01 14:00:00"), + Timestamp("2019-01-01 14:30:00"), + ], + } + df = DataFrame(data).assign(time=lambda x: x.time.dt.tz_localize(tz)) + + grouped = df.groupby("id") + result = getattr(grouped, op)() + expected = DataFrame(expected).assign(time=lambda x: x.time.dt.tz_localize(tz)) + assert_frame_equal(result, expected) From 96bf66108ef7a37e7b68414c4e72182e1ecdd5b4 Mon Sep 17 00:00:00 2001 From: Mak Sze Chun Date: Tue, 10 Sep 
2019 10:28:47 +0800
Subject: [PATCH 95/95] [ENH] Use default EA repr for IntervalArray (#26316)

---
 doc/source/whatsnew/v1.0.0.rst                |  20 +-
 pandas/core/arrays/interval.py                | 210 ++++++++++++------
 pandas/core/indexes/interval.py               | 107 ++++++++-
 pandas/tests/arrays/interval/test_interval.py |  15 +-
 4 files changed, 267 insertions(+), 85 deletions(-)

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index e1fe2f7fe77e2c..329018bdf4bfb4 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -37,7 +37,25 @@ Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 - :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`).
--
+- :class:`pandas.core.arrays.IntervalArray` adopts a new ``__repr__`` in accordance with other array classes (:issue:`25022`)
+
+*pandas 0.25.x*
+
+.. code-block:: ipython
+
+   In [1]: pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)])
+   Out[2]:
+   IntervalArray([(0, 1], (2, 3]],
+   closed='right',
+   dtype='interval[int64]')
+
+
+*pandas 1.0.0*
+
+.. ipython:: python
+
+   pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)])
+

 .. _whatsnew_1000.api.other:

diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
index 7a14d6f1b619aa..1f4b76a259f00c 100644
--- a/pandas/core/arrays/interval.py
+++ b/pandas/core/arrays/interval.py
@@ -129,9 +129,9 @@
 ``Interval`` objects:

     >>> pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)])
-    IntervalArray([(0, 1], (1, 5]],
-                  closed='right',
-                  dtype='interval[int64]')
+    <IntervalArray>
+    [(0, 1], (1, 5]]
+    Length: 2, closed: right, dtype: interval[int64]

 It may also be constructed using one of the constructor
 methods: :meth:`IntervalArray.from_arrays`,
@@ -248,9 +248,8 @@ def _from_factorized(cls, values, original):
             values = values.astype(original.dtype.subtype)
         return cls(values, closed=original.closed)

-    _interval_shared_docs[
-        "from_breaks"
-    ] = """
+    _interval_shared_docs["from_breaks"] = textwrap.dedent(
+        """
         Construct an %(klass)s from an array of splits.

         Parameters
@@ -277,24 +276,34 @@ def _from_factorized(cls, values, original):
         %(klass)s.from_arrays : Construct from a left and right array.
         %(klass)s.from_tuples : Construct from a sequence of tuples.

-        Examples
-        --------
-        >>> pd.%(qualname)s.from_breaks([0, 1, 2, 3])
-        %(klass)s([(0, 1], (1, 2], (2, 3]],
-                  closed='right',
-                  dtype='interval[int64]')
+        %(examples)s\
         """
+    )

     @classmethod
-    @Appender(_interval_shared_docs["from_breaks"] % _shared_docs_kwargs)
+    @Appender(
+        _interval_shared_docs["from_breaks"]
+        % dict(
+            klass="IntervalArray",
+            examples=textwrap.dedent(
+                """\
+        Examples
+        --------
+        >>> pd.arrays.IntervalArray.from_breaks([0, 1, 2, 3])
+        <IntervalArray>
+        [(0, 1], (1, 2], (2, 3]]
+        Length: 3, closed: right, dtype: interval[int64]
+        """
+            ),
+        )
+    )
     def from_breaks(cls, breaks, closed="right", copy=False, dtype=None):
         breaks = maybe_convert_platform_interval(breaks)

         return cls.from_arrays(breaks[:-1], breaks[1:], closed, copy=copy, dtype=dtype)

-    _interval_shared_docs[
-        "from_arrays"
-    ] = """
+    _interval_shared_docs["from_arrays"] = textwrap.dedent(
+        """
         Construct from two arrays defining the left and right bounds.

         Parameters
@@ -340,16 +349,25 @@ def from_breaks(cls, breaks, closed="right", copy=False, dtype=None):
         using an unsupported type for `left` or `right`. At the moment,
         'category', 'object', and 'string' subtypes are not supported.
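A quick illustration of the ``Raises`` contract documented just above (the exact message wording is pandas' own and may vary between versions):

.. code-block:: python

   import pandas as pd

   # each left bound must be <= the corresponding right bound
   try:
       pd.arrays.IntervalArray.from_arrays([0, 2], [1, 1])
   except ValueError as err:
       print(err)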
-        Examples
-        --------
-        >>> %(klass)s.from_arrays([0, 1, 2], [1, 2, 3])
-        %(klass)s([(0, 1], (1, 2], (2, 3]],
-                  closed='right',
-                  dtype='interval[int64]')
+        %(examples)s\
        """
+    )

    @classmethod
-    @Appender(_interval_shared_docs["from_arrays"] % _shared_docs_kwargs)
+    @Appender(
+        _interval_shared_docs["from_arrays"]
+        % dict(
+            klass="IntervalArray",
+            examples=textwrap.dedent(
+                """\
+        >>> pd.arrays.IntervalArray.from_arrays([0, 1, 2], [1, 2, 3])
+        <IntervalArray>
+        [(0, 1], (1, 2], (2, 3]]
+        Length: 3, closed: right, dtype: interval[int64]
+        """
+            ),
+        )
+    )
     def from_arrays(cls, left, right, closed="right", copy=False, dtype=None):
         left = maybe_convert_platform_interval(left)
         right = maybe_convert_platform_interval(right)
@@ -358,9 +376,8 @@ def from_arrays(cls, left, right, closed="right", copy=False, dtype=None):
             left, right, closed, copy=copy, dtype=dtype, verify_integrity=True
         )

-    _interval_shared_docs[
-        "from_tuples"
-    ] = """
+    _interval_shared_docs["from_tuples"] = textwrap.dedent(
+        """
         Construct an %(klass)s from an array-like of tuples.

         Parameters
@@ -389,15 +406,27 @@ def from_arrays(cls, left, right, closed="right", copy=False, dtype=None):
         %(klass)s.from_breaks : Construct an %(klass)s from an array of
             splits.

-        Examples
-        --------
-        >>> pd.%(qualname)s.from_tuples([(0, 1), (1, 2)])
-        %(klass)s([(0, 1], (1, 2]],
-                  closed='right', dtype='interval[int64]')
+        %(examples)s\
        """
+    )

    @classmethod
-    @Appender(_interval_shared_docs["from_tuples"] % _shared_docs_kwargs)
+    @Appender(
+        _interval_shared_docs["from_tuples"]
+        % dict(
+            klass="IntervalArray",
+            examples=textwrap.dedent(
+                """\
+        Examples
+        --------
+        >>> pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2)])
+        <IntervalArray>
+        [(0, 1], (1, 2]]
+        Length: 2, closed: right, dtype: interval[int64]
+        """
+            ),
+        )
+    )
     def from_tuples(cls, data, closed="right", copy=False, dtype=None):
         if len(data):
             left, right = [], []
@@ -832,16 +861,20 @@ def _format_data(self):
         return summary

     def __repr__(self):
-        tpl = textwrap.dedent(
-            """\
-        {cls}({data},
-        {lead}closed='{closed}',
-        {lead}dtype='{dtype}')"""
+        template = (
+            "{class_name}"
+            "{data}\n"
+            "Length: {length}, closed: {closed}, dtype: {dtype}"
         )
-        return tpl.format(
-            cls=self.__class__.__name__,
-            data=self._format_data(),
-            lead=" " * len(self.__class__.__name__) + " ",
+        # the short repr has no trailing newline, while the truncated
+        # repr does. So we include a newline in our template, and strip
+        # any trailing newlines from format_object_summary
+        data = self._format_data()
+        class_name = "<{}>\n".format(self.__class__.__name__)
+        return template.format(
+            class_name=class_name,
+            data=data,
+            length=len(self),
             closed=self.closed,
             dtype=self.dtype,
         )
@@ -874,9 +907,8 @@ def closed(self):
         """
         return self._closed

-    _interval_shared_docs[
-        "set_closed"
-    ] = """
+    _interval_shared_docs["set_closed"] = textwrap.dedent(
+        """
         Return an %(klass)s identical to the current one, but closed on the
         specified side.
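The mechanics being applied throughout this file: each shared docstring becomes a template, and each class substitutes its own ``examples`` block at decoration time, so ``IntervalArray`` and ``IntervalIndex`` can each show their own repr. A stripped-down sketch of the same pattern (toy class, not pandas code; ``Appender`` is an internal pandas helper):

.. code-block:: python

   import textwrap

   from pandas.util._decorators import Appender

   _shared_doc = textwrap.dedent(
       """
       Construct a %(klass)s from an array of splits.

       %(examples)s
       """
   )

   class Toy:
       @classmethod
       @Appender(_shared_doc % dict(klass="Toy", examples=">>> Toy.from_breaks([0, 1])"))
       def from_breaks(cls, breaks):
           return cls()

   print(Toy.from_breaks.__doc__)  # shared text with the Toy-specific example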
@@ -892,20 +924,31 @@ def closed(self):
         -------
         new_index : %(klass)s

+        %(examples)s\
+        """
+    )
+
+    @Appender(
+        _interval_shared_docs["set_closed"]
+        % dict(
+            klass="IntervalArray",
+            examples=textwrap.dedent(
+                """\
         Examples
         --------
-        >>> index = pd.interval_range(0, 3)
+        >>> index = pd.arrays.IntervalArray.from_breaks(range(4))
         >>> index
-        IntervalIndex([(0, 1], (1, 2], (2, 3]],
-                      closed='right',
-                      dtype='interval[int64]')
+        <IntervalArray>
+        [(0, 1], (1, 2], (2, 3]]
+        Length: 3, closed: right, dtype: interval[int64]
         >>> index.set_closed('both')
-        IntervalIndex([[0, 1], [1, 2], [2, 3]],
-                      closed='both',
-                      dtype='interval[int64]')
+        <IntervalArray>
+        [[0, 1], [1, 2], [2, 3]]
+        Length: 3, closed: both, dtype: interval[int64]
         """
-
-    @Appender(_interval_shared_docs["set_closed"] % _shared_docs_kwargs)
+            ),
+        )
+    )
     def set_closed(self, closed):
         if closed not in _VALID_CLOSED:
             msg = "invalid option for 'closed': {closed}"
@@ -1028,9 +1071,8 @@ def repeat(self, repeats, axis=None):
         right_repeat = self.right.repeat(repeats)
         return self._shallow_copy(left=left_repeat, right=right_repeat)

-    _interval_shared_docs[
-        "contains"
-    ] = """
+    _interval_shared_docs["contains"] = textwrap.dedent(
+        """
         Check elementwise if the Intervals contain the value.

         Return a boolean mask whether the value is contained in the Intervals
@@ -1055,16 +1097,27 @@ def repeat(self, repeats, axis=None):

         Examples
         --------
-        >>> intervals = pd.%(qualname)s.from_tuples([(0, 1), (1, 3), (2, 4)])
-        >>> intervals
-        %(klass)s([(0, 1], (1, 3], (2, 4]],
-                  closed='right',
-                  dtype='interval[int64]')
+        %(examples)s
         >>> intervals.contains(0.5)
         array([ True, False, False])
         """
+    )

-    @Appender(_interval_shared_docs["contains"] % _shared_docs_kwargs)
+    @Appender(
+        _interval_shared_docs["contains"]
+        % dict(
+            klass="IntervalArray",
+            examples=textwrap.dedent(
+                """\
+        >>> intervals = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 3), (2, 4)])
+        >>> intervals
+        <IntervalArray>
+        [(0, 1], (1, 3], (2, 4]]
+        Length: 3, closed: right, dtype: interval[int64]
+        """
+            ),
+        )
+    )
     def contains(self, other):
         if isinstance(other, Interval):
             raise NotImplementedError("contains not implemented for two intervals")
@@ -1073,9 +1126,8 @@ def contains(self, other):
             other < self.right if self.open_right else other <= self.right
         )

-    _interval_shared_docs[
-        "overlaps"
-    ] = """
+    _interval_shared_docs["overlaps"] = textwrap.dedent(
+        """
         Check elementwise if an Interval overlaps the values in the %(klass)s.

         Two intervals overlap if they share a common point, including closed
         endpoints. Intervals that only have an open endpoint in common do not
         overlap.

         Parameters
         ----------
-        other : Interval
+        other : %(klass)s
             Interval to check against for an overlap.
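To make the endpoint rule above concrete: two intervals that share only a boundary point overlap exactly when that point is closed on both sides. A sketch with scalar ``Interval`` objects (the array method applies the same rule elementwise):

.. code-block:: python

   import pandas as pd

   a = pd.Interval(0, 1, closed="right")  # (0, 1]
   b = pd.Interval(1, 2, closed="right")  # (1, 2], endpoint 1 is open
   c = pd.Interval(1, 2, closed="left")   # [1, 2), endpoint 1 is closed

   print(a.overlaps(b))  # False: the shared point 1 is open in b
   print(a.overlaps(c))  # True: 1 is closed in both a and c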
         Returns
@@ -1100,11 +1152,7 @@ def contains(self, other):

         Examples
         --------
-        >>> intervals = pd.%(qualname)s.from_tuples([(0, 1), (1, 3), (2, 4)])
-        >>> intervals
-        %(klass)s([(0, 1], (1, 3], (2, 4]],
-                  closed='right',
-                  dtype='interval[int64]')
+        %(examples)s
         >>> intervals.overlaps(pd.Interval(0.5, 1.5))
         array([ True,  True, False])

@@ -1117,9 +1165,25 @@ def contains(self, other):

         >>> intervals.overlaps(pd.Interval(1, 2, closed='right'))
         array([False,  True, False])
-        """
+        """
+    )

-    @Appender(_interval_shared_docs["overlaps"] % _shared_docs_kwargs)
+    @Appender(
+        _interval_shared_docs["overlaps"]
+        % dict(
+            klass="IntervalArray",
+            examples=textwrap.dedent(
+                """\
+        >>> data = [(0, 1), (1, 3), (2, 4)]
+        >>> intervals = pd.arrays.IntervalArray.from_tuples(data)
+        >>> intervals
+        <IntervalArray>
+        [(0, 1], (1, 3], (2, 4]]
+        Length: 3, closed: right, dtype: interval[int64]
+        """
+            ),
+        )
+    )
     def overlaps(self, other):
         if isinstance(other, (IntervalArray, ABCIntervalIndex)):
             raise NotImplementedError

diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index 7c581a12764b1e..29e297cb28a3b8 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -250,7 +250,22 @@ def _simple_new(cls, array, name, closed=None):
         return result

     @classmethod
-    @Appender(_interval_shared_docs["from_breaks"] % _index_doc_kwargs)
+    @Appender(
+        _interval_shared_docs["from_breaks"]
+        % dict(
+            klass="IntervalIndex",
+            examples=textwrap.dedent(
+                """\
+        Examples
+        --------
+        >>> pd.IntervalIndex.from_breaks([0, 1, 2, 3])
+        IntervalIndex([(0, 1], (1, 2], (2, 3]],
+                      closed='right',
+                      dtype='interval[int64]')
+        """
+            ),
+        )
+    )
     def from_breaks(cls, breaks, closed="right", name=None, copy=False, dtype=None):
         with rewrite_exception("IntervalArray", cls.__name__):
             array = IntervalArray.from_breaks(
@@ -259,7 +274,22 @@ def from_breaks(cls, breaks, closed="right", name=None, copy=False, dtype=None):
         return cls._simple_new(array, name=name)

     @classmethod
-    @Appender(_interval_shared_docs["from_arrays"] % _index_doc_kwargs)
+    @Appender(
+        _interval_shared_docs["from_arrays"]
+        % dict(
+            klass="IntervalIndex",
+            examples=textwrap.dedent(
+                """\
+        Examples
+        --------
+        >>> pd.IntervalIndex.from_arrays([0, 1, 2], [1, 2, 3])
+        IntervalIndex([(0, 1], (1, 2], (2, 3]],
+                      closed='right',
+                      dtype='interval[int64]')
+        """
+            ),
+        )
+    )
     def from_arrays(
         cls, left, right, closed="right", name=None, copy=False, dtype=None
     ):
@@ -270,7 +300,22 @@ def from_arrays(
         return cls._simple_new(array, name=name)

     @classmethod
-    @Appender(_interval_shared_docs["from_tuples"] % _index_doc_kwargs)
+    @Appender(
+        _interval_shared_docs["from_tuples"]
+        % dict(
+            klass="IntervalIndex",
+            examples=textwrap.dedent(
+                """\
+        Examples
+        --------
+        >>> pd.IntervalIndex.from_tuples([(0, 1), (1, 2)])
+        IntervalIndex([(0, 1], (1, 2]],
+                      closed='right',
+                      dtype='interval[int64]')
+        """
+            ),
+        )
+    )
     def from_tuples(cls, data, closed="right", name=None, copy=False, dtype=None):
         with rewrite_exception("IntervalArray", cls.__name__):
             arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, dtype=dtype)
@@ -367,7 +412,27 @@ def closed(self):
         """
         return self._data._closed

-    @Appender(_interval_shared_docs["set_closed"] % _index_doc_kwargs)
+    @Appender(
+        _interval_shared_docs["set_closed"]
+        % dict(
+            klass="IntervalIndex",
+            examples=textwrap.dedent(
+                """\
+        Examples
+        --------
+        >>> index = pd.interval_range(0, 3)
+        >>> index
+        IntervalIndex([(0, 1], (1, 2], (2, 3]],
+                      closed='right',
+                      dtype='interval[int64]')
+        >>> index.set_closed('both')
+        IntervalIndex([[0, 1], [1, 2], [2, 3]],
+                      closed='both',
+                      dtype='interval[int64]')
+        """
+            ),
+        )
+    )
     def set_closed(self, closed):
         if closed not in _VALID_CLOSED:
             msg = "invalid option for 'closed': {closed}"
@@ -1168,11 +1233,41 @@ def equals(self, other):
             and self.closed == other.closed
         )

-    @Appender(_interval_shared_docs["contains"] % _index_doc_kwargs)
+    @Appender(
+        _interval_shared_docs["contains"]
+        % dict(
+            klass="IntervalIndex",
+            examples=textwrap.dedent(
+                """\
+        >>> intervals = pd.IntervalIndex.from_tuples([(0, 1), (1, 3), (2, 4)])
+        >>> intervals
+        IntervalIndex([(0, 1], (1, 3], (2, 4]],
+                      closed='right',
+                      dtype='interval[int64]')
+        >>> intervals.contains(0.5)
+        array([ True, False, False])
+        """
+            ),
+        )
+    )
     def contains(self, other):
         return self._data.contains(other)

-    @Appender(_interval_shared_docs["overlaps"] % _index_doc_kwargs)
+    @Appender(
+        _interval_shared_docs["overlaps"]
+        % dict(
+            klass="IntervalIndex",
+            examples=textwrap.dedent(
+                """\
+        >>> intervals = pd.IntervalIndex.from_tuples([(0, 1), (1, 3), (2, 4)])
+        >>> intervals
+        IntervalIndex([(0, 1], (1, 3], (2, 4]],
+                      closed='right',
+                      dtype='interval[int64]')
+        """
+            ),
+        )
+    )
     def overlaps(self, other):
         return self._data.overlaps(other)

diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py
index 6a86289b6fcc60..655a6e717119b1 100644
--- a/pandas/tests/arrays/interval/test_interval.py
+++ b/pandas/tests/arrays/interval/test_interval.py
@@ -93,8 +93,13 @@ def test_set_na(self, left_right_dtypes):
         tm.assert_extension_array_equal(result, expected)


-def test_repr_matches():
-    idx = IntervalIndex.from_breaks([1, 2, 3])
-    a = repr(idx)
-    b = repr(idx.values)
-    assert a.replace("Index", "Array") == b
+def test_repr():
+    # GH 25022
+    arr = IntervalArray.from_tuples([(0, 1), (1, 2)])
+    result = repr(arr)
+    expected = (
+        "<IntervalArray>\n"
+        "[(0, 1], (1, 2]]\n"
+        "Length: 2, closed: right, dtype: interval[int64]"
+    )
+    assert result == expected
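For reference, the new repr end-to-end, mirroring ``test_repr`` above (assumes a pandas build with this patch applied):

.. code-block:: python

   import pandas as pd

   arr = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2)])
   expected = (
       "<IntervalArray>\n"
       "[(0, 1], (1, 2]]\n"
       "Length: 2, closed: right, dtype: interval[int64]"
   )
   assert repr(arr) == expected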